In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
# Seed NumPy's global RNG so the random train/test mask drawn with np.random.rand is reproducible.
np.random.seed(2)

In [None]:
def accuracy_metric(ytrue,ypred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    ytrue : array-like
        Ground-truth labels.
    ypred : array-like
        Predicted labels, same shape as ``ytrue``.

    Returns
    -------
    numpy.float64
        Share of positions where ``ypred == ytrue`` (between 0 and 1).

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    ytrue = np.asarray(ytrue)
    ypred = np.asarray(ypred)
    if ytrue.shape != ypred.shape:
        raise ValueError(
            "ytrue and ypred must have the same shape, got %s and %s"
            % (ytrue.shape, ypred.shape)
        )
    if ypred.size == 0:
        raise ValueError("cannot compute accuracy on empty input")
    # Vectorised mean of the boolean matches instead of a Python-level sum.
    return np.mean(ypred == ytrue)

In [None]:
# Patterns are compiled once; raw strings avoid the invalid "\w" escape in a normal string literal.
_HTML_TAG_RE = re.compile(r'<.*?>')
_APOSTROPHE_SUFFIX_RE = re.compile(r"'\w")
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the cached (stopword set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # A frozenset gives constant-time membership tests instead of scanning a list per word.
        _CLEAN_TEXT_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stopwords'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text,remove_stopwords=False):
    """Normalise a raw review into lower-case, lemmatized, space-separated words.

    Steps, in order: strip HTML tags, drop an apostrophe together with the one
    character following it, replace every non-letter with a space, lower-case,
    optionally drop English stopwords, then lemmatize each remaining word.

    Parameters
    ----------
    text : str
        Raw review text.
    remove_stopwords : bool, default False
        When True, words found in NLTK's English stopword list are removed
        before lemmatization.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    stopword_set, lemmatizer = _get_clean_text_resources()

    # Replace tags with a space (not ''), so "word<br />word" does not fuse into one token.
    text_new = _HTML_TAG_RE.sub(' ', text)
    text_new = _APOSTROPHE_SUFFIX_RE.sub('', text_new)  # Removing apostrophe and 1 character after it
    text_new = _NON_LETTER_RE.sub(' ', text_new)  # Removing everything except letters
    words = text_new.lower().split()  # split() also collapses repeated whitespace

    # Removing stopwords (optional)
    if remove_stopwords:
        words = [word for word in words if word not in stopword_set]

    # Lemmatization always runs, whether or not stopwords were removed.
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

In [None]:
# Load the IMDB reviews CSV; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [None]:
# 'review_stop' keeps stopwords; 'review_wostop' has them removed.
df['review_stop'] = df['review'].apply(clean_text)
df['review_wostop'] = df['review'].apply(clean_text, remove_stopwords=True)
# Keep only the columns used from here on, in a fixed order.
df = df[['review','review_stop','review_wostop','sentiment']]

In [None]:
#Label Encoding the output
# NOTE: despite the column name 'sentiment_one_hot', LabelEncoder produces one integer
# code per class, not a one-hot vector. The same fitted encoder (laben) is reused later
# to map predictions back to the original labels.
laben = LabelEncoder()
df['sentiment_one_hot'] = laben.fit_transform(df['sentiment'])

In [None]:
# Inspect the frame with the cleaned-text and encoded-label columns added.
df

Splitting into train and test data.

In [None]:
# One uniform draw per row: rows whose draw is below 0.8 go to train, the rest to test.
mask = np.random.rand(len(df)) < 0.8
df_train, df_test = df.loc[mask], df.loc[~mask]

In [None]:
# Class distribution of the training split.
df_train['sentiment'].value_counts()

In [None]:
# Class distribution of the test split.
df_test['sentiment'].value_counts()

Both the train set and the test set appear to be balanced.

# **Using Tf-Idf Vectorizer without removing stopwords**

Here, we apply a TF-IDF vectorizer (unigrams only, i.e. n-gram size 1) to the cleaned reviews and use the resulting features to train a logistic regression model. 
The resulting matrices are very large, so we delete the training matrices once they are no longer needed.

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only (stopwords kept).
tfidf_vectorizer = TfidfVectorizer()
sentence_list = df_train['review_stop'].tolist()
train_sent = tfidf_vectorizer.fit_transform(sentence_list)

In [None]:
# The raw training sentences are no longer needed once they have been vectorized.
del sentence_list

In [None]:
y_train = df_train['sentiment_one_hot'].values
# Keep the TF-IDF features sparse. Calling .todense() would allocate a full
# (n_documents x vocabulary_size) np.matrix that is almost entirely zeros, and
# recent scikit-learn releases reject np.matrix input. LogisticRegression
# accepts the sparse matrix directly.
X_train = train_sent

In [None]:
# (number of training reviews, vocabulary size)
X_train.shape

In [None]:
# Train the with-stopwords classifier; random_state is fixed for reproducibility.
# lr_res holds whatever fit() returns (in scikit-learn, the fitted estimator itself).
lr = LogisticRegression(random_state=0)
lr_res = lr.fit(X_train,y_train)

In [None]:
# Drop the training arrays now that the model is fitted.
del X_train,y_train

In [None]:
sentence_list_test = list(df_test['review_stop'].values)
y_test = df_test['sentiment_one_hot'].values
# transform (not fit_transform): reuse the vocabulary and IDF weights learned on the training split.
test_sent = tfidf_vectorizer.transform(sentence_list_test)
# Keep the features sparse: .todense() would build a huge, mostly zero np.matrix,
# which recent scikit-learn releases reject. predict() accepts sparse input.
X_test = test_sent
y_pred = lr_res.predict(X_test)

In [None]:
# Sanity check: one prediction per test review.
y_pred.shape

In [None]:
# Should match y_pred.shape.
y_test.shape

In [None]:
# Test accuracy of the with-stopwords model.
accuracy_metric(y_test,y_pred)

In [None]:
# Drop the test arrays before building the second model's features.
del X_test,y_test

# **Using Tf-Idf Vectorizer with removing stopwords**

Here, the model is trained on text without stopwords to analyze the effect of stopwords.

In [None]:
# Fit a separate TF-IDF vectorizer on the stopword-free training reviews.
tfidf_vectorizer_wo = TfidfVectorizer()
sentence_list = df_train['review_wostop'].tolist()
train_sent = tfidf_vectorizer_wo.fit_transform(sentence_list)

In [None]:
# The raw training sentences are no longer needed once they have been vectorized.
del sentence_list

In [None]:
y_train = df_train['sentiment_one_hot'].values
# Keep the TF-IDF features sparse. Calling .todense() would allocate a full
# (n_documents x vocabulary_size) np.matrix that is almost entirely zeros, and
# recent scikit-learn releases reject np.matrix input. LogisticRegression
# accepts the sparse matrix directly.
X_train = train_sent

In [None]:
# (number of training reviews, vocabulary size without stopwords)
X_train.shape

In [None]:
# Train the stopword-free classifier with the same settings as the with-stopwords one.
# lr_res_wo holds whatever fit() returns (in scikit-learn, the fitted estimator itself).
lr_wo = LogisticRegression(random_state=0)
lr_res_wo = lr_wo.fit(X_train,y_train)

In [None]:
# Drop the training arrays now that the model is fitted.
del X_train,y_train

In [None]:
sentence_list_test = list(df_test['review_wostop'].values)
y_test = df_test['sentiment_one_hot'].values
# transform (not fit_transform): reuse the vocabulary and IDF weights learned on the training split.
test_sent = tfidf_vectorizer_wo.transform(sentence_list_test)
# Keep the features sparse: .todense() would build a huge, mostly zero np.matrix,
# which recent scikit-learn releases reject. predict() accepts sparse input.
X_test = test_sent

In [None]:
# Predict test labels with the stopword-free model (overwrites the earlier y_pred).
y_pred = lr_res_wo.predict(X_test)

In [None]:
# Sanity check: one prediction per test review.
y_pred.shape

In [None]:
# Test accuracy of the stopword-free model.
accuracy_metric(y_test,y_pred)

In [None]:
# Drop the test arrays; they are not used again.
del X_test,y_test

Without stopwords, the accuracy is still essentially the same.

# Testing models on sample sentence

In [None]:
def sentiment_of_sentence(sent,model,vectorizer,le):
    """Predict the sentiment label of one raw review, removing stopwords first.

    Parameters
    ----------
    sent : str
        Raw review text.
    model : fitted classifier exposing ``predict``
        Should be the model trained on stopword-free text.
    vectorizer : fitted vectorizer exposing ``transform``
        Should be the vectorizer fitted on stopword-free text.
    le : fitted label encoder exposing ``inverse_transform``
        Maps the predicted integer code back to the original label.

    Returns
    -------
    The decoded label for ``sent``.
    """
    cleaned = clean_text(sent,remove_stopwords=True)
    features = vectorizer.transform([cleaned])
    # Pass the sparse matrix straight to predict(): .todense() returns an
    # np.matrix, which recent scikit-learn releases reject.
    pred = model.predict(features)
    return le.inverse_transform(pred)[0]

In [None]:
def sentiment_of_sentence_with_stopwords(sent,model,vectorizer,le):
    """Predict the sentiment label of one raw review, keeping stopwords.

    Parameters
    ----------
    sent : str
        Raw review text.
    model : fitted classifier exposing ``predict``
        Should be the model trained on text that still contains stopwords.
    vectorizer : fitted vectorizer exposing ``transform``
        Should be the vectorizer fitted on text that still contains stopwords.
    le : fitted label encoder exposing ``inverse_transform``
        Maps the predicted integer code back to the original label.

    Returns
    -------
    The decoded label for ``sent``.
    """
    cleaned = clean_text(sent)
    features = vectorizer.transform([cleaned])
    # Pass the sparse matrix straight to predict(): .todense() returns an
    # np.matrix, which recent scikit-learn releases reject.
    pred = model.predict(features)
    return le.inverse_transform(pred)[0]

In [None]:
# Without Stopwords model
# The model, vectorizer and cleaning mode must match: all three here are the stopword-free variants.
sample_review = "This was the best sci-fi movie I have ever seen in a long time. It was a mix of military/war combat with alien sci-fi and the two mixed perfectly.I have very very few complaints about the movie, and despite some of the goofs listed, I don't believe they were substantial enough to change anyone's opinion about the movie. I could have done without some of the corny lines, but they did not deter me at all from the movie. Soon after watching the movie for the first time, I bought it and have already watched it four or five times. The combat and action are really exhilarating and Eckhart is a bad ass actor. Great movie, worth the watch. What I enjoyed most about the movie was the amazing effects with the aliens and the nonstop, in your face combat. There was a constant blaze of gunfire and explosions, the perfect Guy movie but even my girlfriend found the movie to be enjoyable(in moderation of course and she probably likes Eckhart). Once again, an amazing movie, watch it now you will not regret it."
sentiment_of_sentence(sample_review,lr_res_wo,tfidf_vectorizer_wo,laben)

In [None]:
#With Stopwords model
# Same sample review, scored by the model and vectorizer trained on text that keeps stopwords.
sentiment_of_sentence_with_stopwords(sample_review,lr_res,tfidf_vectorizer,laben)

Using the coefficients of the logistic regression model, we can obtain polarity scores for individual words.

In [None]:
#with stopwords coefficient
# get_feature_names() has been removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name only when the new one is absent.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feats = tfidf_vectorizer.get_feature_names_out()
else:
    feats = tfidf_vectorizer.get_feature_names()
# Flatten the coefficient matrix to one weight per vocabulary term
# (for this two-class model coef_ is expected to be a single row).
vals = lr_res.coef_.T.reshape(-1)

df_dict = {'words':feats,'polarity':vals}
df_pol = pd.DataFrame(df_dict)

In [None]:
# Sort ascending by coefficient: most negative words first, most positive last.
df_pol = df_pol.sort_values(by='polarity').reset_index(drop=True)

In [None]:
#without stopwords coefficient
# get_feature_names() has been removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name only when the new one is absent.
if hasattr(tfidf_vectorizer_wo, 'get_feature_names_out'):
    feats_wo = tfidf_vectorizer_wo.get_feature_names_out()
else:
    feats_wo = tfidf_vectorizer_wo.get_feature_names()
# Flatten the coefficient matrix to one weight per vocabulary term
# (for this two-class model coef_ is expected to be a single row).
vals_wo = lr_res_wo.coef_.T.reshape(-1)

# Use the stopword-free vocabulary and coefficients here. The earlier version built
# this table from feats/vals, i.e. it silently duplicated the with-stopwords table.
df_dict_wo = {'words':feats_wo,'polarity':vals_wo}
df_pol_wo = pd.DataFrame(df_dict_wo)

In [None]:
# Sort ascending by coefficient: most negative words first, most positive last.
df_pol_wo = df_pol_wo.sort_values(by='polarity').reset_index(drop=True)

The 10 most negative words

In [None]:
# The table is sorted ascending by polarity, so the first rows hold the lowest coefficients.
df_pol_wo.head(10)

The 10 most positive words

In [None]:
# The table is sorted ascending by polarity, so the last rows hold the highest coefficients.
df_pol_wo.tail(10)

For this model (TF-IDF + Logistic Regression), removing stopwords had minimal impact on accuracy.

In [None]:
# import os
# os.chdir(r'../working')
# from IPython.display import FileLink
# FileLink(r'polarity_wo_stopwords.csv')