In [20]:
import pandas as pd
import re
import string
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
data = pd.read_csv('dirty_dataset.csv')
data.drop(columns=['id'], inplace = True)
data.sample(5)

Unnamed: 0,text
34,ikr!!! u always kno how 2 cheer me up 🥲❤️
69,that movie wuz da bomb!! <i>totally lit</i> 🔥🔥
56,ikr!!! u always kno how 2 cheer me up 🥲❤️
57,wanna go out 2nite?? hmu @ 8 😉💃
7,"Ths is amazin, can't w8 2 see more! visit www...."


In [3]:
data['text'] = data['text'].str.lower()
data.sample(5)

Unnamed: 0,text
83,"lol, u r the best!! 😂 <div>gr8 job!</div>"
66,dis app iz buggin out 😩 gonna uninstall... asap
5,dis app iz buggin out 😩 gonna uninstall... asap
58,"imho, this sux... totally not kewl!! 😑 #fail"
61,omg! i <b>luv</b> this!!! 😂😂 check this out: h...


In [29]:
full_forms = {
    "omg": "oh my god",
    "luv": "love",
    "idk": "I don't know",
    "rn": "right now",
    "w8": "wait",
    "lol": "laugh out loud",
    "u": "you",
    "r": "are",
    "gr8": "great",
    "imho": "in my humble opinion",
    "sux": "sucks",
    "kewl": "cool",
    "2nite": "tonight",
    "hmu": "hit me up",
    "wuz": "was",
    "da": "the",
    "lit": "amazing",
    "smh": "shaking my head",
    "y'all": "you all",
    "tbh": "to be honest",
    "rn": "right now",
    "btw": "by the way",
    "yo": "hey",
    "dat": "that",
    "iz": "is",
    "af": "as hell",
    "rly": "really",
    "thx": "thanks",
    "c": "see",
    "kno": "know",
    "dis": "this",
    "ikr": "I know right",
    "vacay": "vacation",
    "nite": "night",
    "asap": "as soon as possible",
    "prjct": "project",
    "gonna": "going to",
    "wrk": "work",
    "pp": "profile picture"
}
def correct_full(text):
    words = re.findall(r"\b\w+\b|\S", text) 
    corrected_words = [full_forms.get(word, word) for word in words]
    return ' '.join(corrected_words)
data['text'] = data['text'].apply(correct_full)
data.sample(5)

Unnamed: 0,text
30,i know right you always know how 2 cheer me up...
10,hey that profile picture is done as hell visit...
73,i don t know what 2 do soon bored right now
93,i see you got skill check this coolstuff com
99,shaking my head y all are crazy to be honest i...


In [10]:
def remove_html(text):
    pattern=re.compile(r'<[^>]+>')
    return pattern.sub('',text)

data['text']=data['text'].apply(remove_html)
data.sample(5)

Unnamed: 0,text
53,"ths is amazin , can ' t wait 2 see more ! visi..."
25,i see you got skillz 😎 check this : coolstuff ...
49,I don ' t know what 2 do . . . soooo bored rig...
12,"great work on ur project , by the way ! ! 🚀 🚀 ..."
15,"laugh out loud , you are the best ! ! 😂 great ..."


In [12]:
def remove_urls(text):
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('',text)

data['text']=data['text'].apply(remove_urls)
data.sample(5)

Unnamed: 0,text
32,this app is buggin out 😩 going to uninstall . ...
52,"hey , that app is dope as hell ! ! 🔥 🔥 🔥 visit..."
55,wanna go out tonight ? ? hit me up @ 8 😉 💃
78,this app is buggin out 😩 going to uninstall . ...
43,that movie was the bomb ! ! totally amazing 🔥 🔥


In [13]:
def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # Emojis
                               u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               u"\U0001F700-\U0001F77F"  # Alchemical Symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U0001FB00-\U0001FBFF"  # Symbols for Legacy Computing
                               u"\U0001F004-\U0001F0CF"  # Miscellaneous Symbols and Arrows
                               u"\U0001F10D-\U0001F10F"  # Enclosed Alphanumeric Supplement
                               u"\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


data['text'] = data['text'].apply(remove_emojis)
data.sample(5)

Unnamed: 0,text
59,wanna go out tonight ? ? hit me up @ 8
35,"ths is amazin , can ' t wait 2 see more ! visi..."
92,I don ' t know what 2 do . . . soooo bored rig...
66,this app is buggin out going to uninstall . ....
97,"really enjoyed the party last night , thanks 4..."


In [18]:
data['text'] = data['text'].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", "", x))
data.sample(5)

Unnamed: 0,text
13,I know right you always know how 2 cheer me...
64,great work on ur project by the way see y...
66,this app is buggin out going to uninstall ...
26,i see you got skillz check this coolstuff com
47,oh my god i love this check this out ht...


In [31]:
def correct_text(text):
    text = correct_spell(text)       # fix spelling
    text = text.lower()              # normalize case
    text = correct_full(text)        # expand slang
    return text

data['text'] = data['text'].apply(correct_text)
data.sample(5)

Unnamed: 0,text
2,in my humble opinion this sucks totally not co...
15,laugh out loud you are the best great job
35,the is amazing can t wait 2 see more visit www...
13,i know right you always know how 2 cheer me up...
56,i know right you always know how 2 cheer me up...
