## Import libraries & data

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# Read the sentence-pair CSV into `final_df`.
final_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Self Case studies/CS02 Grammar Error Corrector/data/final_df_20211027.csv',
)

In [None]:
final_df.sample(5)

Unnamed: 0,correct,incorrect
589350,"Even with Kokubo , he ca n't win any medals bu...","Even Kokubo , he ca n't win any medals but I w..."
197627,Traveling Alone,Traveling Alone
237461,My Sundays !,My every sunday !
706167,"While we are eating , we can share and discuss...","While we are eating , we can share and discuss..."
879777,AVATAR,AVATAR


In [None]:
final_df.shape

(1037561, 2)

### Adding length features

In [None]:
# Character length of every sentence. astype('str') stringifies each cell
# first, so a missing value is measured as the text 'nan' (length 3).
for side in ('correct', 'incorrect'):
    final_df[f'{side}_char_count'] = final_df[side].astype('str').str.len()

In [None]:
# Number of whitespace-separated tokens in every sentence (cells are
# stringified first, so a missing value counts as the single token 'nan').
for side in ('correct', 'incorrect'):
    final_df[f'{side}_word_count'] = final_df[side].astype('str').str.split().str.len()

In [None]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
309088,Huumm . . .,Huumm . . .,11,11,4,4
42164,I was told that if the humidity is below 80 % ...,I was said that if the humidity is below 80 % ...,104,104,22,22
769239,I 'm sorry .,I 'm sorry .,12,12,4,4
1023801,Don ` t be shy !,Don ` t be shy !,16,16,6,6
309408,"While jogging along the Sakawa river path , I ...","While jogging along Sakawa river path , I caug...",170,162,37,35


## Preprocessing

### Removing missing/NA rows

In [None]:
# Per-column count of missing values, shown as a one-column table.
final_df.isna().sum().to_frame(name='missing_count')

Unnamed: 0,missing_count
correct,1
incorrect,1
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [None]:
# Show every row that has at least one missing value. The length features of
# such rows were computed on the stringified cell, hence counts of 3 / 1.
has_missing = final_df.isna().any(axis='columns')
final_df.loc[has_missing]

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
222211,,,3,3,1,1


In [None]:
# Drop any row containing a missing value and renumber the index from 0.
final_df = final_df.dropna(axis='index', how='any').reset_index(drop=True)

In [None]:
final_df.shape

(1037560, 6)

In [None]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
354851,It is already time to go to bed .,It is already time to go to bed .,33,33,9,9
456981,I had a bad cold for 2 weeks .,I had a bad cold for 2 weeks .,30,30,9,9
1037274,It is not unusual to keep ( retain ) the space...,It is not unusual to keep the place under the ...,80,115,17,23
120748,"Today , I started lang - 8 .","Today , I started lang - 8 .",28,28,8,8
351195,URL,URL,3,3,1,1


### Remove unchanged pairs (correct == incorrect)

In [None]:
# Count the rows whose 'correct' and 'incorrect' sentences are the same text.
identical_pairs = final_df['correct'].eq(final_df['incorrect']).sum()
print(f"total number of duplicate pairs: {identical_pairs}")

total number of duplicate pairs: 539201


In [None]:
# NOTE(review): this cell repeats the previous one verbatim, yet the recorded
# outputs differ (539201 vs 539202) — the two were presumably executed against
# different states of final_df; confirm and keep only one.
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 539202


In [None]:
# Peek at ten random rows whose two sentences are identical.
is_identical = final_df['correct'].eq(final_df['incorrect'])
final_df.loc[is_identical].sample(n=10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
289768,I am Japanese .,I am Japanese .,15,15,4,4
495494,Intransitive verb and transitive verb .,Intransitive verb and transitive verb .,39,39,6,6
552991,Besause there are more and more cars .,Besause there are more and more cars .,38,38,8,8
574158,start ! !,start ! !,9,9,3,3
904143,Shooting stars,Shooting stars,14,14,2,2
1023994,India 's Tigers !,India 's Tigers !,17,17,4,4
17865,We went Sagano to a bamboo forest .,We went Sagano to a bamboo forest .,35,35,8,8
555163,Phew !,Phew !,6,6,2,2
81329,"That 's why , I am fall in love with her at fi...","That 's why , I am fall in love with her at fi...",57,57,15,15
199201,I heard there are still earthquakes there . . .,I heard there are still earthquakes there . . .,47,47,10,10


In [None]:
# Keep only the rows where the 'incorrect' sentence actually differs from the
# 'correct' one. The index is not renumbered here.
is_changed = final_df['correct'].ne(final_df['incorrect'])
final_df = final_df.loc[is_changed]

In [None]:
final_df.shape

(498359, 6)

In [None]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
216176,We are waiting you guys to help and we hope we...,we are waiting you guys helps and hope we can ...,64,56,16,13
780441,in order to survive in the world .,in order to survive at the world .,34,34,8,8
1027514,"I think that is a long time , but I am not goo...",I think it is a long time but I am not goot at...,60,56,16,15
484994,"Last Monday , a lot of snow fell in Pohang .","Last Monday , It was a lot of snow falled in P...",44,53,11,13
210825,I 've been learning English for ten years now .,"I 've been learned English for ten years ,",47,42,10,9


### Remove Duplicates

In [None]:
# Rows that repeat an earlier row across all columns.
duplicate_rows = final_df.duplicated().sum()
print(f'total number of duplicates: {duplicate_rows}')

total number of duplicates: 2021


In [None]:
# List every member of each duplicate group (keep=False marks all copies),
# ordered by the 'correct' sentence so the copies sit next to each other.
in_duplicate_group = final_df.duplicated(keep=False)
final_df.loc[in_duplicate_group].sort_values(by='correct')

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
717379,"( I seriously want to escape , all the way , t...","( I seriously want to escape , all the way , t...",88,85,19,19
1027462,"( I seriously want to escape , all the way , t...","( I seriously want to escape , all the way , t...",88,85,19,19
802135,: - ),: - (,5,5,3,3
800388,: - ),: - (,5,5,3,3
161743,A : How much did it cost ?,A : How much does is cost ?,26,27,8,8
...,...,...,...,...,...,...
350827,to be continued . . .,to be continue . . .,21,20,6,6
17343,to be continued . . .,to be continue . . .,21,20,6,6
633235,to be continued . . .,to be continue . . .,21,20,6,6
767284,today was a bad day .,today is a bad day .,21,20,6,6


In [None]:
# Keep the first occurrence of every distinct row and renumber the index.
first_occurrence = ~final_df.duplicated()
final_df = final_df.loc[first_occurrence].reset_index(drop=True)

In [None]:
final_df.shape

(496338, 6)

In [None]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
136984,"If you click the address above , you can see a...","If you click the adress above , you can see on...",87,95,19,21
457701,I always share my happiness and sorrow with he...,I always share my happiness and sorrow with he...,79,81,16,17
236326,The only thing I could do was just readyour en...,The only thing I could do was just reading you...,53,57,11,12
441145,I should be born 10 years later .,I should be born after 10 years .,33,33,8,8
206214,I will think wrote about out my answers to the...,I will think of what I will wrote about the qu...,89,86,18,18


### Remove short sentences



In [None]:
# How many rows have an 'incorrect' sentence shorter than 2 characters?
too_short = final_df['incorrect_char_count'].lt(2)
final_df.loc[too_short].shape

(5, 6)

In [None]:
# Keep rows whose 'incorrect' sentence is longer than 2 characters.
# NOTE(review): the inspection cell before this one counts lengths < 2, while
# this filter also discards sentences of exactly 2 characters — confirm which
# threshold is intended.
long_enough = final_df['incorrect_char_count'].gt(2)
final_df = final_df.loc[long_enough].reset_index(drop=True)

In [None]:
final_df.shape

(496326, 6)

In [None]:
# How many rows have a 'correct' sentence shorter than 2 characters?
too_short = final_df['correct_char_count'].lt(2)
final_df.loc[too_short].shape

(27, 6)

In [None]:
# Peek at ten random rows whose 'correct' sentence is shorter than 2 characters.
too_short = final_df['correct_char_count'].lt(2)
final_df.loc[too_short].sample(n=10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
9786,.,daily life .,1,12,1,3
6073,-,- -,1,3,1,2
287866,.,o .,1,3,1,2
101957,),lol ),1,5,1,2
113360,.,M .,1,3,1,2
461049,.,had them .,1,10,1,3
439410,.,on face .,1,9,1,3
80604,.,to near park .,1,14,1,4
42876,.,At first .,1,10,1,3
34956,",","Maiko Nakai ,",1,13,1,3


In [None]:
# Keep rows whose 'correct' sentence is longer than 2 characters.
# NOTE(review): the inspection cells before this one look at lengths < 2,
# while this filter also discards sentences of exactly 2 characters — confirm
# which threshold is intended.
long_enough = final_df['correct_char_count'].gt(2)
final_df = final_df.loc[long_enough].reset_index(drop=True)

In [None]:
final_df.shape

(496287, 6)

### Clean text

In [None]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
# Maps a contraction (or, in the last two rows, a bare clitic such as "n't" or
# "'m") to its expansion. The clitic expansions start with a space, so on
# tokenised text like "I 'm" the result contains two consecutive spaces.
# NOTE(review): "'s" -> " is" also rewrites possessives ("India 's" -> "India is").
contractions_dict = { "ain't": "are not","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n't":" not","'re":" are","'s": " is","'d":" would",
                     "'ll": " will","'t":" not","'ve": " have","'m":" am"}

# Second reference to the default mapping: inside expand_contractions the
# parameter shadows the module-level name, so this is what lets the function
# recognise "caller kept the default" and reuse the precompiled regex.
_DEFAULT_CONTRACTIONS = contractions_dict


def _compile_contractions(mapping):
    """Compile one alternation regex over the keys of `mapping`.

    Keys are escaped and tried longest first, so a compound form such as
    "how'd'y" wins over its prefix "how'd" when both match at the same spot.
    """
    longest_first = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))


# Regular expression for finding contractions
contractions_re = _compile_contractions(contractions_dict)


# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in `text` with its expansion.

    Parameters
    ----------
    text : str
        Sentence to process.
    contractions_dict : dict, optional
        Mapping of contraction -> expansion. When a mapping other than the
        module default is passed, the search pattern is built from that
        mapping's own keys.

    Returns
    -------
    str
        `text` with each match substituted in a single left-to-right pass.
        Whitespace is not normalised afterwards.
    """
    if not contractions_dict:
        # Nothing to look for; an empty alternation would match everywhere.
        return text

    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern = contractions_re
    else:
        pattern = _compile_contractions(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

In [None]:
# https://stackoverflow.com/a/47091490/4084039
# Bracketed asides, applied in this order. Each pattern takes the whitespace
# before the opening bracket and exactly one whitespace character after the
# closing one, so a bracket pair that ends the string is NOT matched here.
_BRACKETED_SPAN_PATTERNS = (
    re.compile(r'\s*<.*?>\s'),
    re.compile(r'\s*\(.*?\)\s'),
    re.compile(r'\s*\[.*?\]\s'),
    re.compile(r'\s*\{.*?\}\s'),
)

# Single characters that are deleted outright: the listed punctuation,
# backslashes, square brackets and ASCII digits.
_UNWANTED_CHARS_RE = re.compile(r'[-+@#^/|*(){}$~<>=_%:;\\\[\]0-9]')


def clean(text):
    """Strip bracketed asides, selected punctuation and digits from `text`.

    Steps:
      1. Replace each `<...>`, `(...)`, `[...]`, `{...}` span that is followed
         by whitespace with a single space, so the words on either side stay
         separated ("keep ( retain ) the" -> "keep the").
      2. Delete the remaining unwanted characters and digits.
      3. Collapse all runs of whitespace to one space and trim the ends.

    Parameters
    ----------
    text : str
        Sentence to clean.

    Returns
    -------
    str
        The cleaned sentence; may be empty if nothing survives.
    """
    for pattern in _BRACKETED_SPAN_PATTERNS:
        # A space, not '', because the match swallowed the whitespace on both
        # sides of the bracketed span.
        text = pattern.sub(' ', text)
    text = _UNWANTED_CHARS_RE.sub('', text)
    return ' '.join(text.split())

In [None]:
# Clean, expand contractions, then collapse whitespace once more: the corpus is
# tokenised ("I 'm"), and the clitic expansions start with a space (" am"), so
# expansion alone would leave doubled spaces ("I  am").
final_df['correct'] = (
    final_df['correct']
    .progress_apply(clean)
    .progress_apply(expand_contractions)
    .str.split()
    .str.join(' ')
)

  0%|          | 0/496287 [00:00<?, ?it/s]

  0%|          | 0/496287 [00:00<?, ?it/s]

In [None]:
# Clean, expand contractions, then collapse whitespace once more: the corpus is
# tokenised ("ca n't"), and the clitic expansions start with a space (" not"),
# so expansion alone would leave doubled spaces ("ca  not").
final_df['incorrect'] = (
    final_df['incorrect']
    .progress_apply(clean)
    .progress_apply(expand_contractions)
    .str.split()
    .str.join(' ')
)

  0%|          | 0/496287 [00:00<?, ?it/s]

  0%|          | 0/496287 [00:00<?, ?it/s]

In [None]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
347971,I am curious .,I am qurious .,14,14,4,4
299394,with a light sweep it is done .,with a light sweep It is done .,31,31,8,8
449458,"After checking the blackboard , I found out th...","After checking the black board , I found out t...",110,126,22,26
379481,I am considering this surgery .,I do mind this surgery .,31,24,6,6
82752,She had been trying to apologize all day,She had been trying to apologize for all day,40,44,8,9


In [None]:
# Per-column count of missing values after cleaning.
# NOTE(review): clean() can return '' (e.g. for a sentence made only of digits
# or removed punctuation); isna() does not count empty strings, and the length
# columns still hold the pre-cleaning values — consider checking for '' as well.
final_df.isna().sum()

correct                 0
incorrect               0
correct_char_count      0
incorrect_char_count    0
correct_word_count      0
incorrect_word_count    0
dtype: int64

In [None]:
# final_df.to_csv('/content/drive/MyDrive/Self Case studies/CS02 Grammar Error Corrector/data/final_df_preprocessed_2021111201.csv',index=False)