# Importing necessary libraries

In [14]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score

# Reading the dataset

In [15]:
df= pd.read_csv('IMDB_Dataset.csv')

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Checking if there is any null values in the df

In [17]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

# Converting all words in review column to lower case

In [18]:
df['review']= df['review'].str.lower()

# Tokenization

In [19]:
# word_tokenize() is a function. So, we can use it directly
df['new_review']= df['review'].apply(word_tokenize)

In [20]:
df.head()

Unnamed: 0,review,sentiment,new_review
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production. <br /><br />the...,positive,"[a, wonderful, little, production, ., <, br, /..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ..."


# Stop words removal

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
stop_words=set (stopwords.words('english'))

In [25]:
def remove_stopwords(token_list):
    return[ word for word in token_list if word not in stop_words]

In [26]:
# applying the function
df['new_review']= df['new_review'].apply(remove_stopwords)

In [27]:
df.head()

Unnamed: 0,review,sentiment,new_review
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,a wonderful little production. <br /><br />the...,positive,"[wonderful, little, production, ., <, br, /, >..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there's a family where a little boy ...,negative,"[basically, 's, family, little, boy, (, jake, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, time, money, ''..."


# Punctuation Removal

In [28]:
punctuation= string.punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [29]:
def punc_removal(token_list):
    return [word for word in token_list if word not in punctuation]

In [30]:
# applying the funct
df['new_review']= df['new_review'].apply(punc_removal)

# Lemmatization 

In [31]:
lemma= WordNetLemmatizer()

In [32]:
def lemma_review(token_list):
    return[lemma.lemmatize(word) for word in token_list]

In [33]:
df['new_review']= df['new_review'].apply(lemma_review)

In [34]:
df.head()

Unnamed: 0,review,sentiment,new_review
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,a wonderful little production. <br /><br />the...,positive,"[wonderful, little, production, br, br, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there's a family where a little boy ...,negative,"[basically, 's, family, little, boy, jake, thi..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, time, money, ''..."


# Stemming

In [35]:
stemmer= PorterStemmer()

In [36]:
def stemming(token_lists):
    return[stemmer.stem(word) for word in token_lists]

In [37]:
df['new_review']= df['new_review'].apply(stemming)

In [38]:
df.head()

Unnamed: 0,review,sentiment,new_review
0,one of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, 1, oz, episod, '..."
1,a wonderful little production. <br /><br />the...,positive,"[wonder, littl, product, br, br, film, techniq..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe..."
3,basically there's a family where a little boy ...,negative,"[basic, 's, famili, littl, boy, jake, think, '..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, time, money, ''..."


# Creating a new dataframe

In [39]:
dfs= df[['new_review', 'sentiment']].copy()

In [40]:
dfs.head()

Unnamed: 0,new_review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, '...",positive
1,"[wonder, littl, product, br, br, film, techniq...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, 's, famili, littl, boy, jake, think, '...",negative
4,"[petter, mattei, 's, ``, love, time, money, ''...",positive


# TF-IDF Vectorizer

In [41]:
# initialize the vectorizer
tfidf_vectorizer= TfidfVectorizer()

In [42]:
dfs['reviews']= dfs['new_review'].apply(lambda x: ' '.join(x))

In [43]:
dfs.head()

Unnamed: 0,new_review,sentiment,reviews
0,"[one, review, mention, watch, 1, oz, episod, '...",positive,one review mention watch 1 oz episod 'll hook ...
1,"[wonder, littl, product, br, br, film, techniq...",positive,wonder littl product br br film techniqu unass...
2,"[thought, wonder, way, spend, time, hot, summe...",positive,thought wonder way spend time hot summer weeke...
3,"[basic, 's, famili, littl, boy, jake, think, '...",negative,basic 's famili littl boy jake think 's zombi ...
4,"[petter, mattei, 's, ``, love, time, money, ''...",positive,petter mattei 's `` love time money '' visual ...


In [44]:
# fit and transform the data
tfidf_matrix= tfidf_vectorizer.fit_transform(dfs['reviews'])

In [45]:
# converting the tf-idf matrix to an array
dense_array= tfidf_matrix.toarray()

In [46]:
# extracting the feature names
feature_names= tfidf_vectorizer.get_feature_names_out()

In [47]:
# creating a dataframe
tfidf_df= pd.DataFrame(dense_array, columns= feature_names)

In [48]:
tfidf_df.tail()

Unnamed: 0,00,000,00001,0069,007,00am,01,0126,01pm,02,...,être,ís,ísnt,île,ïn,óli,önsjön,über,überwoman,ünfaith
9994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train test split

In [49]:
x= tfidf_df
y= dfs['sentiment']

X_train, X_test, y_train, y_test= train_test_split(x,y,test_size=0.2, random_state=42)

# Model Training

In [50]:
# initialize the model
model= MultinomialNB()

In [51]:
# training the model
model.fit(X_train, y_train)

# Accuracy

In [52]:
predictions= model.predict(X_test)

In [53]:
accuracy= accuracy_score(y_test, predictions)
print("Accuracy"," : ", accuracy)

Accuracy  :  0.846


In [54]:
report= classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

    negative       0.81      0.89      0.85       960
    positive       0.89      0.81      0.85      1040

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



# User Data Preprocess

In [70]:


def preprocess(review):
    
#   converting the review to a lower case
    review=review.lower()
    
#     tokenize
    token_list= word_tokenize(review)
    
#     stop words removal
    after_stopwords= [word for word in token_list if word not in stop_words]
    
#     punctuation_removal
    after_punct= [ word for word in token_list if word not in punctuation]
    
#     stemming
    after_stemming = [stemmer.stem(word) for word in token_list]
    
#     lemmatization
    after_lemma= [lemma.lemmatize(word) for word in token_list]
    
    
    return after_lemma
        

# Predict sentiment

In [73]:
def predict(review):
    
#     calls the fucntion that we made that preprocesses the user defined review
    processed_review= preprocess(review)
    
#     transform the preprocessed review into the tf-idf representation
#     we donot have to fit the model because the fit is used for training the data

    tfidf_review= tfidf_vectorizer.transform(processed_review)
    
#     we can directly input to the trained model
    prediction= model.predict(tfidf_review)
    
    return prediction[0]
    

# Taking user input

In [75]:
user_review= input("Enter the review whose sentiment is to be extracted : ")
prediction= predict(user_review)
print("The predicted sentiment for ", user_review, " is ", prediction)

Enter the review whose sentiment is to be extracted : It wasn't very good but they tried.
The predicted sentiment for  It wasn't very good but they tried.  is  positive
