In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
np.random.seed(2)

In [4]:
def accuracy_metric(ytrue,ypred):
    score = sum(ypred==ytrue)/len(ypred)
    return score

In [5]:
def clean_text(text,remove_stopwords=False):
    text_new = re.sub('<.*?>','',text) # Removing HTML Tags
    text_new = re.sub("'\w{1}",'',text_new) # Removing Apostrophe and 1 letter after that
    text_new = re.sub('[^a-zA-Z]',' ',text_new) # Removing Everything Except Alphabets
    text_new = text_new.lower() # Making Everything lower case
    text_new = ' '.join(text_new.split())# For removing duplicate whitespaces
    
    # Removing Stopwords
    stopwords_list = stopwords.words('english')
    if remove_stopwords:
        text_new_list = text_new.split()
        text_new_list = [i for i in text_new_list if i not in stopwords_list]
        text_new = ' '.join(text_new_list)
    
        # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text_new_list = text_new.split()
    text_new_list = [lemmatizer.lemmatize(word) for word in text_new_list]
    text_new = ' '.join(text_new_list)
    
    return text_new

In [6]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [7]:
df['review_stop'] = df['review'].apply(lambda x:clean_text(x))
df['review_wostop'] = df['review'].apply(lambda x:clean_text(x,remove_stopwords=True))
df = df[['review','review_stop','review_wostop','sentiment']]

In [8]:
#Label Encoding the output
laben = LabelEncoder()
df['sentiment_one_hot'] = laben.fit_transform(df['sentiment'])

In [9]:
df

Unnamed: 0,review,review_stop,review_wostop,sentiment,sentiment_one_hot
0,One of the other reviewers has mentioned that ...,one of the other reviewer ha mentioned that af...,one reviewer mentioned watching oz episode you...,positive,1
1,A wonderful little production. <br /><br />The...,a wonderful little production the filming tech...,wonderful little production filming technique ...,positive,1
2,I thought this was a wonderful way to spend ti...,i thought this wa a wonderful way to spend tim...,thought wonderful way spend time hot summer we...,positive,1
3,Basically there's a family where a little boy ...,basically there a family where a little boy ja...,basically family little boy jake think zombie ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei love in the time of money is a v...,petter mattei love time money visually stunnin...,positive,1
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,i thought this movie did a down right good job...,thought movie right good job creative original...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",bad plot bad dialogue bad acting idiotic direc...,bad plot bad dialogue bad acting idiotic direc...,negative,0
49997,I am a Catholic taught in parochial elementary...,i am a catholic taught in parochial elementary...,catholic taught parochial elementary school nu...,negative,0
49998,I'm going to have to disagree with the previou...,i going to have to disagree with the previous ...,going disagree previous comment side maltin on...,negative,0


Splitting into train and test data.

In [10]:
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]

In [11]:
df_train['sentiment'].value_counts()

positive    19951
negative    19945
Name: sentiment, dtype: int64

In [12]:
df_test['sentiment'].value_counts()

negative    5055
positive    5049
Name: sentiment, dtype: int64

The train set seems balanced and the same case is with the test set.

# **Using Tf-Idf Vectorizer without removing stopwords**

Here, we apply TfIdf Vectorizer(ngram=1) to sentences. And then it is used to train the Logistic regression model. 
The matrix is large in dimension. So, we need to delete the training matrices that were being created.

In [13]:
sentence_list = list(df_train['review_stop'].values)
tfidf_vectorizer = TfidfVectorizer()
train_sent = tfidf_vectorizer.fit_transform(sentence_list)

In [14]:
del sentence_list

In [15]:
y_train = df_train['sentiment_one_hot'].values
X_train = train_sent.todense()

In [16]:
X_train.shape

(39896, 86525)

In [17]:
lr = LogisticRegression(random_state=0)
lr_res = lr.fit(X_train,y_train)

In [18]:
del X_train,y_train

In [19]:
sentence_list_test = list(df_test['review_stop'].values)
y_test = df_test['sentiment_one_hot'].values
test_sent = tfidf_vectorizer.transform(sentence_list_test)
X_test = test_sent.todense()
y_pred = lr_res.predict(X_test)

In [20]:
y_pred.shape

(10104,)

In [21]:
y_test.shape

(10104,)

In [22]:
accuracy_metric(y_test,y_pred)

0.8936064924782264

In [23]:
del X_test,y_test

# **Using Tf-Idf Vectorizer with removing stopwords**

Here, the model is trained on text without stopwords to analyze the effect of stopwords.

In [24]:
sentence_list = list(df_train['review_wostop'].values)
tfidf_vectorizer_wo = TfidfVectorizer()
train_sent = tfidf_vectorizer_wo.fit_transform(sentence_list)

In [25]:
del sentence_list

In [26]:
y_train = df_train['sentiment_one_hot'].values
X_train = train_sent.todense()

In [27]:
X_train.shape

(39896, 86412)

In [28]:
lr_wo = LogisticRegression(random_state=0)
lr_res_wo = lr_wo.fit(X_train,y_train)

In [29]:
del X_train,y_train

In [30]:
sentence_list_test = list(df_test['review_wostop'].values)
y_test = df_test['sentiment_one_hot'].values
test_sent = tfidf_vectorizer_wo.transform(sentence_list_test)
X_test = test_sent.todense()

In [31]:
y_pred = lr_res_wo.predict(X_test)

In [32]:
y_pred.shape

(10104,)

In [33]:
accuracy_metric(y_test,y_pred)

0.896575613618369

In [34]:
del X_test,y_test

Without stopwords,the accuracy is still the same. 

# Testing models on sample sentence

In [38]:
def sentiment_of_sentence(sent,model,vectorizer,le):
    sent_new = clean_text(sent,remove_stopwords=True)
    test_sent = vectorizer.transform([sent_new])
    xtest = test_sent.todense()
    pred = model.predict(xtest)
    text = le.inverse_transform(pred)
    return text[0]

In [39]:
def sentiment_of_sentence_with_stopwords(sent,model,vectorizer,le):
    sent_new = clean_text(sent)
    test_sent = vectorizer.transform([sent_new])
    xtest = test_sent.todense()
    pred = model.predict(xtest)
    text = le.inverse_transform(pred)
    return text[0]

In [40]:
# Without Stopwords model
sample_review = "This was the best sci-fi movie I have ever seen in a long time. It was a mix of military/war combat with alien sci-fi and the two mixed perfectly.I have very very few complaints about the movie, and despite some of the goofs listed, I don't believe they were substantial enough to change anyone's opinion about the movie. I could have done without some of the corny lines, but they did not deter me at all from the movie. Soon after watching the movie for the first time, I bought it and have already watched it four or five times. The combat and action are really exhilarating and Eckhart is a bad ass actor. Great movie, worth the watch. What I enjoyed most about the movie was the amazing effects with the aliens and the nonstop, in your face combat. There was a constant blaze of gunfire and explosions, the perfect Guy movie but even my girlfriend found the movie to be enjoyable(in moderation of course and she probably likes Eckhart). Once again, an amazing movie, watch it now you will not regret it."
sentiment_of_sentence(sample_review,lr_res_wo,tfidf_vectorizer_wo,laben)

'positive'

In [41]:
#With Stopwords model
sentiment_of_sentence_with_stopwords(sample_review,lr_res,tfidf_vectorizer,laben)

'positive'

Using Coefficients of Logistic regression model, we can obtain the polarity scores for different words.

In [62]:
#with stopwords coefficient
feats = tfidf_vectorizer.get_feature_names()
vals = lr_res.coef_
vals = vals.T
vals = vals.reshape(-1)

df_dict = {'words':feats,'polarity':vals}
df_pol = pd.DataFrame(df_dict)

In [63]:
df_pol = df_pol.sort_values(by='polarity').reset_index(drop=True)

In [88]:
#without stopwords coefficient
feats_wo = tfidf_vectorizer_wo.get_feature_names()
vals_wo = lr_res_wo.coef_
vals_wo = vals_wo.T
vals_wo = vals_wo.reshape(-1)

df_dict_wo = {'words':feats_wo,'polarity':vals_wo}
df_pol_wo = pd.DataFrame(df_dict_wo)

In [89]:
df_pol_wo = df_pol_wo.sort_values(by='polarity').reset_index(drop=True)

Most Negative 10 words

In [98]:
df_pol_wo.head(10)

Unnamed: 0,words,polarity
0,worst,-10.79051
1,bad,-8.218434
2,waste,-8.115565
3,awful,-7.942224
4,boring,-6.806302
5,poor,-6.485548
6,terrible,-6.445667
7,nothing,-5.738112
8,worse,-5.148191
9,poorly,-5.134969


Most positive 10 words

In [99]:
df_pol_wo.tail(10)

Unnamed: 0,words,polarity
86402,favorite,4.555936
86403,enjoyed,4.600914
86404,loved,4.634686
86405,brilliant,4.702478
86406,amazing,5.325924
86407,wonderful,5.491446
86408,perfect,5.499082
86409,best,5.742026
86410,excellent,7.423399
86411,great,8.292119


For this model(Tfidf + Logistic Regression), removing stopwords had a minimal impact on the accuracy of the model.

In [97]:
# import os
# os.chdir(r'../working')
# from IPython.display import FileLink
# FileLink(r'polarity_wo_stopwords.csv')