# 1.6M Twitter Data Preprocessing

In [8]:
import pandas as pd
import numpy as np
import re

pd.options.display.max_colwidth = 200

In [9]:
col_names = ["target", "ids", "date", "flag", "user", "text"]
encode = "ISO-8859-1"
data = pd.read_csv('Data/training.1600000.processed.noemoticon.csv', encoding = encode, names = col_names)
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [10]:
data.shape

(1600000, 6)

In [11]:
data.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

### """There is no missing data"""

In [13]:
data['target'].value_counts()

4    800000
0    800000
Name: target, dtype: int64

### """Target has only two values 0 and 4, changing it to 0 = Negative and 4 = Positive"""

In [14]:
data['new_target'] = data['target'].apply(lambda x : 'Negative' if x == 0 else 'Positive')
data.head()

Unnamed: 0,target,ids,date,flag,user,text,new_target
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Negative
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,Negative
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


In [15]:
data['flag'].value_counts()

NO_QUERY    1600000
Name: flag, dtype: int64

In [41]:
data['user'].value_counts()[100000:]

machoo92           3
bonsaistudio       3
gondolin           3
officialjessie     3
SarahFaasse        3
                  ..
SNLindy            1
souulm             1
janekim1001        1
Mar_va_lous        1
themonkeymaster    1
Name: user, Length: 559775, dtype: int64

In [42]:
df = data[['text', 'new_target']].copy()
df.head()

Unnamed: 0,text,new_target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


In [66]:
df['text'][0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

### Text Preprocessing

In [44]:
import string
punc = list(string.punctuation)
print(punc)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


### Remove Urls

In [111]:
def remove_urls(text):
    text = re.sub(r'https?:\/\/\S*| www\.\S*', '', text, flags=re.MULTILINE)
    return text

In [112]:
sample = "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"


In [113]:
remove_urls(sample)

"@switchfoot  - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

- r The solution is to use Python’s raw string notation for regular expression patterns; backslashes are not handled in any special way in a string literal prefixed with 'r'
- ? Causes the resulting RE to match 0 or 1 repetitions of the preceding RE. https? will match either ‘http’ or ‘https’.
- https?:\/\/ will match any "http://" and "https://" in string
- \S Returns a match where the string DOES NOT contain a white space character
- * Zero or more occurrences

In [106]:
df['text'] = df['text'].apply(remove_urls)
df.head()

Unnamed: 0,text,new_target
0,"@switchfoot - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


### Remove all @Words and all #Words

In [107]:
def remove_sym_words(text):
    text = re.sub(r'@\w*\S| #\w*\S', ' ', text)
    return text

In [109]:
sample = "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. #Country You shoulda got David Carr of Third Day to do it. ;D"
remove_sym_words(sample)

"  http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [110]:
df['text'] = df['text'].apply(remove_sym_words)
df.head()

Unnamed: 0,text,new_target
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


### Identifying all Emojis

In [114]:
import re
tweets_text = df.text.str.cat()
emos = set(re.findall(r" ([xX:;][-']?.) ",tweets_text))
emos_count = []
for emo in emos:
    emos_count.append((tweets_text.count(emo), emo))
sorted(emos_count,reverse=True)

[(85428, '; '),
 (34979, 'x '),
 (32138, 'xt'),
 (29294, ': '),
 (18844, 'xc'),
 (15097, 'xa'),
 (14163, ';3'),
 (13751, 'xx'),
 (10904, 'xp'),
 (9724, 'xi'),
 (7697, ';)'),
 (7447, 'xo'),
 (5789, 'xe'),
 (4440, ':3'),
 (3732, ';.'),
 (3078, 'x.'),
 (2950, ':/'),
 (2778, 'xh'),
 (2687, 'xD'),
 (2676, 'X '),
 (2495, 'xy'),
 (2416, ';-'),
 (2340, ';t'),
 (2215, ';I'),
 (2164, ';s'),
 (2024, ':-'),
 (1936, ":'"),
 (1769, ':p'),
 (1720, ':O'),
 (1712, 'XD'),
 (1707, ':|'),
 (1665, ';D'),
 (1639, ':0'),
 (1513, ';S'),
 (1467, ":'("),
 (1445, ':S'),
 (1426, 'x!'),
 (1393, ';i'),
 (1327, 'xI'),
 (1233, ';m'),
 (1214, ';b'),
 (1190, ':1'),
 (1184, 'x,'),
 (1171, '::'),
 (1157, ';p'),
 (1123, ';-)'),
 (1068, ';a'),
 (1041, ':]'),
 (950, 'xb'),
 (950, ';d'),
 (875, ';P'),
 (865, 'Xx'),
 (800, 'XO'),
 (798, ';L'),
 (791, ';o'),
 (770, ':L'),
 (743, 'xs'),
 (725, ';r'),
 (669, ':o'),
 (623, 'XX'),
 (577, ';O'),
 (566, ';/'),
 (551, 'x-'),
 (548, 'XT'),
 (530, 'xm'),
 (516, ':s'),
 (515, ':-D'),
 (

In [115]:
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "
print("Happy emoticons:", set(re.findall(HAPPY_EMO, tweets_text)))
print("Sad emoticons:", set(re.findall(SAD_EMO, tweets_text)))

Happy emoticons: {':-D', ';p', ';P', 'x-D', ':d', ';-D', 'X)', ':p', 'x-)', 'xD', ';d', ';)', 'x)', 'xd', ';-)', ';D', 'XD'}
Sad emoticons: {':|', ":'/", ":'(", ":'|", ':/'}


In [132]:
happy_emo = [':-D', ';p', ';P', 'x-D', ':d', ';-D', 'X)', ':p', 'x-)', 'xD', ';d', ';)', 'x)', 'xd', ';-)', ';D', 'XD']
sad_emo = [':|', ":'/", ":'(", ":'|", ':/']

In [133]:
emo = happy_emo + sad_emo
def remove_emo(text):
    text = [i for i in text.split() if i not in emo ]
    text = ' '.join(text)
    return text

In [134]:
sample = "- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
remove_emo(sample)

"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it."

In [135]:
df['text'] = df['text'].apply(remove_emo)
df.head()

Unnamed: 0,text,new_target
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it.",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


### Remove Accented Text

In [117]:
import unicodedata

def remove_accented(text):
    text = unicodedata.normalize("NFKD", text ).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [122]:
sample = 'remove accented characters from text, e.g. café'
remove_accented(sample)

'remove accented characters from text, e.g. cafe'

In [123]:
df['text'] = df['text'].apply(remove_accented)
df.head()

Unnamed: 0,text,new_target
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


### Expand Contractions

In [228]:
slang = pd.read_csv('slang.csv', index_col = 'Unnamed: 0')
slang.head()

Unnamed: 0,a-town,a.o.e.,a/n,a3,a7x,aa,aaa,aaftddup,aanfctw,aapl,...,zex,zh,zig,zomfg,zomg,zomgzorrz,zoot,zot,zt,zup
a,"atlanta,ga",area of effect,authors note,"anytime, anywhere, anyplace",avenged sevenfold,allahu akbar,"anywhere, any place, any time",as a friend till death do us part,a**h**es are not f**king checking these words,apple computer inc.,...,,,,,,,,,,
b,,,,,,,,,,,...,,,,,,,,,,
c,,,,,,,,,,,...,,,,,,,,,,
d,,,,,,,,,,,...,,,,,,,,,,
e,,,,,,,,,,,...,,,,,,,,,,


### Replacing "I`d", with "I'd"

In [139]:

df['text'] = df['text'].str.replace("`", "'")
df.head()

Unnamed: 0,text,new_target
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it.",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,I dived many times for the ball. Managed to save 50% The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative


### Removing Numbers

In [159]:
def remove_num(text):
    text = re.sub(r'\d*', '', text)
    return text

In [160]:
sample = 'I dived many times for the ball. Managed to save 50% The rest go out of bounds'
remove_num(sample)

'I dived many times for the ball. Managed to save % The rest go out of bounds'

In [161]:
df['text'] = df['text'].apply(remove_num)
df.head()

Unnamed: 0,text,new_target
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it.",Negative
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative
2,I dived many times for the ball. Managed to save % The rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative



### Replace Slang Words

In [231]:
slang_columns = list(slang.columns)

In [232]:
def replace_slang(text):
    text = text.lower()
    mylis = text.split()
    new = []
    for i in mylis:
        if isinstance(i, str):
            if i in slang_columns:
                new.append(slang[i].loc[i[0]])
            else:
                new.append(i)
        else:
            continue
    text = ' '.join(new)
    return text

In [233]:
slang["that's"].loc['t']

'that is'

In [234]:
sample = "aww I can't be bothered with homework soooo fed up &gt;:| ilyt xxxx"
replace_slang(sample)

'aww i cannot be bothered with homework soooo fed up &gt;:| i love you too xxxx'

In [220]:
# for j, i in enumerate(df['text'].iloc[277882:]):
      
#     try:
#         replace_slang(i)
        
#     except TypeError as err:
#         print(j)
    
    

In [235]:
%%timeit
df['new_text'] = df['text'].iloc[:10000].apply(replace_slang)
df.head(15)

44 s ± 5.31 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [236]:
# 308 sec for 10000 rows for 16_00_000 it will take approx 13.68 hours
# have to search for some other options :-)


In [237]:
df.head()

Unnamed: 0,text,new_target,new_text
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it.",Negative,"- awww, that is a bummer. you shoulda got david carr of third day to do it."
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,Negative,is upset that he cannot update his facebook by texting it... and might cry as a result school today also. blah!
2,I dived many times for the ball. Managed to save % The rest go out of bounds,Negative,i dived many times for the ball. managed to save % the rest go out of bounds
3,my whole body feels itchy and like its on fire,Negative,my whole body feels itchy and like its on fire
4,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",Negative,"no, it is not behaving at all. i am mad. why am i here? because i cannot see you all over there."


In [238]:
df.drop('new_text', axis = 1, inplace = True)

### Clean Text Remove Punctuation ,Extra spaces and lowercase

In [240]:
import string

def clean(text):
    
    mylist = [' ' if alpha in list(string.punctuation) else alpha for alpha in text ]
    text = ''.join(mylist)
    text = text.replace('  ', ' ')
    text = text.replace('  ', ' ')
    text = text.lower().strip()
    return text

In [241]:
df['text'] = df['text'].apply(clean)
df.head()

Unnamed: 0,text,new_target
0,awww that s a bummer you shoulda got david carr of third day to do it,Negative
1,is upset that he can t update his facebook by texting it and might cry as a result school today also blah,Negative
2,i dived many times for the ball managed to save the rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,no it s not behaving at all i m mad why am i here because i can t see you all over there,Negative


In [4]:
mylis = ['agreed i saw the failwhale allllll day today', 'i m sooo sad they killed off kutner on house whyyyyyyyy', 'haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff', 'danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset','my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its','poor socks luvvvvv the golden retriever i want one sighhhh','oh did i mention it quot gooooood moooorniiiiiiing quot from germany im back in my cage or better my office','stupid movies we watched mirrors ugggggh stooopeeed rip off',]
for i in mylis:
    print(i)

agreed i saw the failwhale allllll day today
i m sooo sad they killed off kutner on house whyyyyyyyy
haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff
danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset
my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its
poor socks luvvvvv the golden retriever i want one sighhhh
oh did i mention it quot gooooood moooorniiiiiiing quot from germany im back in my cage or better my office
stupid movies we watched mirrors ugggggh stooopeeed rip off


In [283]:
def remove_repeated_characters(text):
    pattern = re.compile(r"(\w{2,})(\w{2,})\2(\w*)")
    substitution_pattern = r"\1\2\3"
    text = re.sub(pattern, substitution_pattern, text)
    return text

In [284]:
# Sample
remove_repeated_characters("agreed i saw the failwhale allllll day today")

'agreed i saw the failwhale allll day today'

In [None]:
"(.)\1{2,}"

In [300]:
def remove_repeated_characters(text):
    pattern = re.compile(r"(.)\1{2,}")
    substitution_pattern = r"\1"                       # r"\1\1" keeps good and moorniing
    text = re.sub(pattern, substitution_pattern, text)
    return text

In [301]:
for i in mylis:
    print(i)
    print(remove_repeated_characters(i))
    print()

agreed i saw the failwhale allllll day today
agreed i saw the failwhale al day today

i m sooo sad they killed off kutner on house whyyyyyyyy
i m so sad they killed off kutner on house why

haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff
haha its so cold in the d and no but you should still go to the show they do some incredible stuff

danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset
danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im so upset

my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its
my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its

poor socks luvvvvv the golden retriever i want one sighhhh
poor socks luv the golden retriever i want one sigh

oh did i mention it quot gooooood moooorniiiiiiing quot from germa

In [302]:
df['text'] = df['text'].apply(remove_repeated_characters)
df.head()

Unnamed: 0,text,new_target
0,aw that s a bummer you shoulda got david carr of third day to do it,Negative
1,is upset that he can t update his facebook by texting it and might cry as a result school today also blah,Negative
2,i dived many times for the ball managed to save the rest go out of bounds,Negative
3,my whole body feels itchy and like its on fire,Negative
4,no it s not behaving at all i m mad why am i here because i can t see you all over there,Negative


### Removing Stopwords

In [305]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [309]:
mystop= list(set(stop + list(string.ascii_letters)))
mystop.sort()
print(mystop)


['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'b', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'e', 'each', 'f', 'few', 'for', 'from', 'further', 'g', 'h', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'j', 'just', 'k', 'l', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'n', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only'

In [313]:
def remove_stop_words(text):
    text = [word  for word in text.split() if word not in mystop]
    text = ' '.join(text)
    return text

In [314]:
sample = "aw that s a bummer you shoulda got david carr of third day to do it"
remove_stop_words(sample)

'aw bummer shoulda got david carr third day'

In [315]:
df['text'] = df['text'].apply(remove_stop_words)
df.head()

Unnamed: 0,text,new_target
0,aw bummer shoulda got david carr third day,Negative
1,upset update facebook texting might cry result school today also blah,Negative
2,dived many times ball managed save rest go bounds,Negative
3,whole body feels itchy like fire,Negative
4,behaving mad see,Negative


### Shuffling the DataFrame

In [325]:
df.iloc[799995:800005]

Unnamed: 0,text,new_target
799995,sick spending day laying bed listening,Negative
799996,gmail,Negative
799997,rest peace farrah sad,Negative
799998,sounds like rival flagging ads much though,Negative
799999,resit exams summer wishes worked harder first year uni,Negative
800000,love guys best,Positive
800001,im meeting one besties tonight cant wait girl talk,Positive
800002,thanks twitter add sunisa got meet hin show dc area sweetheart,Positive
800003,sick really cheap hurts much eat real food plus friends make soup,Positive
800004,effect everyone,Positive


In [326]:
df2 = df.copy()

from sklearn.utils import shuffle

df2 = shuffle(df2)
df2.head()

Unnamed: 0,text,new_target
129188,eyes bloodshot enough sleep staring screen much flatmates frying onions every damn day without turning fan,Negative
1106,tried get earlier today work,Negative
1408483,watching eagle eye,Positive
1235348,eminem made part lyric like,Positive
1226627,excellent news crysis pc xbox ps,Positive


### Tokenizing Text

In [26]:
import nltk
from nltk.corpus import wordnet
import re

In [328]:
def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences] 
    return word_tokens

In [329]:
sample = "eyes bloodshot enough sleep staring screen much flatmates frying onions every damn day without turning fan"

tokenize_text(sample)

[['eyes',
  'bloodshot',
  'enough',
  'sleep',
  'staring',
  'screen',
  'much',
  'flatmates',
  'frying',
  'onions',
  'every',
  'damn',
  'day',
  'without',
  'turning',
  'fan']]

### Spelling Correction

### Gingerit

In [331]:
print(mylis)

['agreed i saw the failwhale allllll day today', 'i m sooo sad they killed off kutner on house whyyyyyyyy', 'haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff', 'danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset', 'my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its', 'poor socks luvvvvv the golden retriever i want one sighhhh', 'oh did i mention it quot gooooood moooorniiiiiiing quot from germany im back in my cage or better my office', 'stupid movies we watched mirrors ugggggh stooopeeed rip off']


In [330]:
from gingerit.gingerit import GingerIt

In [332]:
parser = GingerIt()

In [333]:
def spell_correct(text):    
    tweet = parser.parse(text)
    return tweet['result']

In [334]:
for i in mylis:
    print(i)
    print(spell_correct(i))
    print()

agreed i saw the failwhale allllll day today
Agreed, I saw the fail whale all day today

i m sooo sad they killed off kutner on house whyyyyyyyy
I am so sad they killed off Kutner on house whyyyyyyyy

haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff
Ha-ha its so cold in the d and no but you should still go to the show they do some incredible stuff

danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset
Danny, I'm upset that I wasn't here to watch the live chat I was in a car for hours on a trip I'm so upset

my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its
My home town my mammy called all depressed, please explain why a parent lets their yr old child walk alone hello its

poor socks luvvvvv the golden retriever i want one sighhhh
Poor socks luvvvvv the golden retriever I want one sigh

oh did i mention it quot gooooood mooo

### TextBlob spell Corrector

In [15]:
from textblob import TextBlob

In [17]:
def texblob_correct(text):
    new = []
    for j in text.split():
        new.append(str(TextBlob(j).correct()))
    text = " ".join(new)
    return text

In [18]:
for i in mylis:
    print(i)
    print(texblob_correct(i))
    print()

agreed i saw the failwhale allllll day today
agreed i saw the failwhale allllll day today

i m sooo sad they killed off kutner on house whyyyyyyyy
i m soon sad they killed off hunter on house whyyyyyyyy

haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff
hata its so cooooold in the d and no but you should still go to the show they do some incredible stuff

danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset
dandy in upset that i want here to watch the live chat i was in a car for hours on a trip in soon upset

my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its
my home town my mamma called all depressed pus explain y a parent let their or old child walk alone hello its

poor socks luvvvvv the golden retriever i want one sighhhh
poor socks luvvvvv the golden retrieved i want one sighhhh

oh did i mention it quot gooooood moooo

### PySpellchecker

In [19]:
from spellchecker import SpellChecker
spell = SpellChecker()

In [20]:
spell.correction('wrogn')

'wrong'

In [22]:
def pyspell_checker(text):
    new = []
    for j in text.split():
        new.append(spell.correction(j))
    text = " ".join(new)
    return text

In [23]:
for i in mylis:
    print(i)
    print(pyspell_checker(i))
    print()

agreed i saw the failwhale allllll day today
agreed i saw the failwhale allllll day today

i m sooo sad they killed off kutner on house whyyyyyyyy
i m soon sad they killed off outer on house whyyyyyyyy

haha its so cooooold in the d and no but you should still go to the show they do some incredible stuff
aha its so cooooold in the d and no but you should still go to the show they do some incredible stuff

danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soooo upset
danny im upset that i wasnt here to watch the live chat i was in a car for hours on a trip im soon upset

my home town my mammy called all depressd pls explain y a parent let their yr old child walk alone hello its
my home town my mammy called all depressed pls explain y a parent let their yr old child walk alone hello its

poor socks luvvvvv the golden retriever i want one sighhhh
poor socks luvvvvv the golden retriever i want one sighhhh

oh did i mention it quot gooooood moooor

### On comparing Gingerit, Textblob and Pyspellchecker we can see that Gingerit is much better than other two

### Remove Repeated words

In [27]:
def rem_repeated_word(text):
    text = re.sub(r'\b([a-z]+)\s+\1{1,}\b', r"\1", text)
    text = re.sub(r'\b([a-z]+)\s+\1{1,}\b', r"\1", text)
    return text

In [28]:
#Example
rem_repeated_word("i am going going to work from home home home")

'i am going to work from home'

### Remove Html Tags

In [79]:
from bs4 import BeautifulSoup

In [80]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    return text

In [None]:
Thanks