In [None]:
import pandas as pd
import string 


from symspellpy import SymSpell

In [2]:
# Location of the raw sentiment dataset, relative to this notebook.
RAW_DATASET_PATH = '../artifacts/sentiment_Dataset.csv'

# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which looks like
# a previously saved index — confirm whether it should be read as the index.
df = pd.read_csv(RAW_DATASET_PATH)

In [3]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df1 = df.copy(deep=True)

In [4]:
# Display the working copy to eyeball its columns and a sample of rows.
df1

Unnamed: 0,review,sentiment
0,"Starts really well, nice intro and build up fo...",negative
1,"Terrific movie: If you did not watch yet, you ...",positive
2,I've seen hundreds of silent movies. Some will...,positive
3,i had been looking for this film for so long b...,positive
4,"Good: Engaging cinematic firefights, great pre...",positive
...,...,...
9995,I almost made a fool of myself when I was goin...,negative
9996,I feel it is my duty as a lover of horror film...,negative
9997,Why was this film made? What were the creators...,negative
9998,If it is true that sadomasochism is a two-side...,positive


# Handle duplicate values

In [5]:
# Count rows flagged as duplicates of an earlier row (called with default arguments).
df1.duplicated().sum()

12

In [6]:
# Drop the repeated rows; rebinding the name (instead of mutating in place)
# leaves `df1` holding the same de-duplicated content.
df1 = df1.drop_duplicates()

In [7]:
# Re-count duplicates to confirm the drop worked (expected result: 0).
df1.duplicated().sum()

0

# Lowercase conversion

In [8]:
# Lowercase every review so later steps see case-normalized text.
df1 = df1.assign(review=df1['review'].str.lower())

In [9]:
# Spot-check the first rows after lowercasing.
df1.head()

Unnamed: 0,review,sentiment
0,"starts really well, nice intro and build up fo...",negative
1,"terrific movie: if you did not watch yet, you ...",positive
2,i've seen hundreds of silent movies. some will...,positive
3,i had been looking for this film for so long b...,positive
4,"good: engaging cinematic firefights, great pre...",positive


# spelling check

In [25]:
# Spell-checker instance: allow corrections up to edit distance 2, using
# 7-character prefixes for the internal lookup index.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# load_dictionary reports a missing file by returning False instead of raising,
# so check the result explicitly; otherwise every later lookup would silently
# run against an empty dictionary.
dictionary_loaded = sym_spell.load_dictionary(
    '../artifacts/frequency_dictionary_en_82_765.txt', term_index=0, count_index=1
)
if not dictionary_loaded:
    raise FileNotFoundError(
        "SymSpell frequency dictionary could not be loaded from "
        "'../artifacts/frequency_dictionary_en_82_765.txt'"
    )

# Keep the load status as the cell's displayed value.
dictionary_loaded

True

In [11]:
def correct_spelling(text):
    """Return `text` with spelling corrected by SymSpell's compound lookup.

    Parameters
    ----------
    text : str
        The review text to correct. Non-string values (e.g. None or NaN from a
        missing cell) and empty strings are returned unchanged.

    Returns
    -------
    The top-ranked corrected phrase, or the input itself when there is nothing
    to correct or no suggestion is produced.

    Notes
    -----
    Relies on the module-level `sym_spell` instance. In this notebook's
    recorded output, commas and colons are gone after this step, so it also
    acts as a partial punctuation stripper.
    """
    # Missing values and empty strings have nothing to correct; passing them to
    # the lookup would either raise (non-strings) or do pointless work.
    if not isinstance(text, str) or not text:
        return text

    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    # Take the first returned suggestion; fall back to the input if none came back.
    return suggestions[0].term if suggestions else text

In [12]:
# Run the spelling corrector over every review, one row at a time.
df1 = df1.assign(review=df1['review'].apply(correct_spelling))

In [13]:
# Spot-check the first rows after spelling correction.
df1.head()

Unnamed: 0,review,sentiment
0,starts really well nice intro and build up for...,negative
1,terrific movie if you did not watch yet you mu...,positive
2,i've seen hundreds of silent movies some will ...,positive
3,i had been looking for this film for so long b...,positive
4,good engaging cinematic firefights great prese...,positive


# Punctuation removal

In [14]:
# Characters to delete outright: every ASCII punctuation mark plus newlines.
# They are removed, not replaced by spaces, so "i've" becomes "ive".
chars_to_drop = string.punctuation + '\n'
deletion_table = str.maketrans('', '', chars_to_drop)

df1['review'] = df1['review'].str.translate(deletion_table)

# Remove stop words

In [15]:
# Read the stop-word list: one word per line. Surrounding whitespace is
# stripped and blank lines are skipped so stray spaces cannot stop an entry
# from matching.
with open('../artifacts/stop.txt', 'r', encoding = 'utf-8') as file:
    stopWords = [line.strip() for line in file if line.strip()]

# Tokens are lowercased before the membership test, so lowercase the entries
# too; a frozenset gives constant-time lookups instead of scanning the list
# once per token.
stop_word_lookup = frozenset(word.lower() for word in stopWords)

# Rebuild each review from the whitespace-separated tokens that are not stop words.
df1['review'] = df1['review'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_word_lookup)
)

In [16]:
# Inspect the frame after stop-word removal.
df1

Unnamed: 0,review,sentiment
0,starts nice intro build main characters minute...,negative
1,terrific movie watch watch gena davis samuel j...,positive
2,ive hundreds silent movies classics nosferatu ...,positive
3,film long found younger loved viewing loved lo...,positive
4,good engaging cinematic firefights great prese...,positive
...,...,...
9995,made fool start review movie reminded billy el...,negative
9996,feel duty lover horror films warm people horri...,negative
9997,film made creators thinking hmm film plot made...,negative
9998,true sadomasochism sided coin diverse expressi...,positive


# Lemmatization (currently disabled — the code below is commented out)

In [None]:
# nlp = spacy.load('en_core_web_sm')

In [18]:
# df1['review'] = df1['review'].apply(lambda token: ' '.join([token.lemma_ for token in nlp(token)]))

# normalize text

In [19]:
import contractions
import re 
from  unidecode import unidecode  

In [20]:
def normalize_text(text):
    """Normalize a review: ASCII-transliterate, expand contractions, tidy spaces.

    Parameters
    ----------
    text : str
        The review text to normalize.

    Returns
    -------
    str
        The text after transliteration to ASCII, contraction expansion, and
        whitespace collapsing (single spaces, no leading/trailing whitespace).

    Notes
    -----
    Whitespace collapsing runs last on purpose: transliteration can itself
    emit spaces, so collapsing before it could leave stray or trailing spaces
    in the result.

    NOTE(review): in this notebook's recorded output, "ive" becomes "i have"
    after this step, so running it after stop-word removal can put stop words
    back — confirm that is intended.
    """
    # Transliterate first so later steps only ever see ASCII text.
    text = unidecode(text)
    text = contractions.fix(text)
    # Collapse every whitespace run to one space and trim the ends, as the
    # final step so nothing afterwards can reintroduce extra spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [21]:
# Apply the text normalizer to every review.
df1 = df1.assign(review=df1['review'].apply(normalize_text))

In [22]:
# Inspect the frame after text normalization.
df1

Unnamed: 0,review,sentiment
0,starts nice intro build main characters minute...,negative
1,terrific movie watch watch gena davis samuel j...,positive
2,i have hundreds silent movies classics nosfera...,positive
3,film long found younger loved viewing loved lo...,positive
4,good engaging cinematic firefights great prese...,positive
...,...,...
9995,made fool start review movie reminded billy el...,negative
9996,feel duty lover horror films warm people horri...,negative
9997,film made creators thinking hmm film plot made...,negative
9998,true sadomasochism sided coin diverse expressi...,positive


In [24]:
# Destination for the cleaned dataset, relative to this notebook.
CLEAN_DATASET_PATH = '../artifacts/cleanDataset.csv'

# index=False leaves out the DataFrame index only.
# NOTE(review): the 'Unnamed: 0' column shown in the displayed frame is a
# regular column and is still written — confirm that is wanted downstream.
df1.to_csv(CLEAN_DATASET_PATH, index=False)