#  Importing Important Packages

In [None]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re
import spacy
from nltk.corpus import sentiwordnet as swn
from IPython.display import clear_output
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly
plotly.offline.init_notebook_mode (connected = True)
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk import ngrams
# The following code creates a word-document matrix.
from sklearn.feature_extraction.text import CountVectorizer
# Modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Reading Data

In [None]:
# Load the movie-review CSV into a DataFrame
data=pd.read_csv('../input/imdb-movie-reviews-dataset/movie_data.csv')

In [None]:
# Dimensions of the loaded data as (number of rows, number of columns)
data.shape

In [None]:
# Preview the first 5 rows
data.head(5)

# Making a copy of the reviews to edit

In [None]:
# Independent copy of the raw 'review' column; it later seeds the
# 'Review_without_stopwords' column so that data['review'] stays untouched
Edited_Review = data['review'].copy()

# Having a look at the first ten reviews in the data

In [None]:
# Preview the first 10 rows
data.head(10)

# Preprocessing Function

In [None]:
# Function to preprocess the Reviews data
def preprocess_Reviews_data(data,name):
    """Normalise the text column ``data[name]`` in place.

    Every value of the column goes through these steps:

    1. lower-case the text;
    2. drop links (``http...``), twitter handles (``@user``) and
       hashtags (``#tag``);
    3. keep only runs of word characters, which strips punctuation and
       other special characters;
    4. drop every token that is a single ASCII letter;
    5. re-join the remaining tokens with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    name : str
        Label of the column to clean. Every value must be a string.

    Returns
    -------
    None
    """
    def clean(text):
        # Links, handles and hashtags have to be removed while '@', '#' and
        # ':' are still in the text; once special characters are stripped
        # their remains are indistinguishable from ordinary words.
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'@\S+', '', text)
        text = re.sub(r'\B#\S+', '', text)
        # Splitting on non-word characters removes the special characters
        tokens = re.findall(r'\w+', text)
        # Filtering tokens (instead of substituting ' x ' with '') keeps the
        # neighbouring words apart and also catches single letters that are
        # consecutive or sit at the start/end of the text.
        kept = [tok for tok in tokens if not re.fullmatch(r'[a-zA-Z]', tok)]
        return ' '.join(kept)

    data[name] = data[name].str.lower().apply(clean)

# Function to tokenize and remove the stopwords
def rem_stopwords_tokenize(data,name):
    """Tokenize every text in ``data[name]`` and drop English stopwords, in place.

    Each value of the column is split with ``word_tokenize`` and the tokens
    found in NLTK's English stopword list are discarded. The column is then
    overwritten with one list of remaining tokens per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    name : str
        Label of the column to process. Every value must be a string.

    Returns
    -------
    None

    Notes
    -----
    Tokens are compared with the stopword list case-sensitively, so the text
    should already be lower-cased (``preprocess_Reviews_data`` does this).
    """
    # The stopword set does not depend on the row, so build it once per call
    # rather than once per review.
    stop_words = set(stopwords.words('english'))

    data[name] = [
        [token for token in word_tokenize(text) if token not in stop_words]
        for text in data[name].values
    ]


# Lemmatization Function

In [None]:
lemmatizer = WordNetLemmatizer()
def Lemmatization(data,name):
    """Tokenize and lemmatize every text in ``data[name]``, in place.

    Each value of the column is split with ``word_tokenize`` and every token
    is passed through the module-level ``lemmatizer``. Lemmas that are two
    characters long or shorter, and lemmas that are purely numeric, are then
    discarded. The column is overwritten with one list of lemmas per row.
    """
    def lemmatize_text(text):
        lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(text))
        # Keep only lemmas longer than two characters that are not numbers
        return [
            lemma for lemma in lemmas
            if len(lemma) > 2 and not lemma.isnumeric()
        ]

    data[name] = [lemmatize_text(text) for text in data[name].values]

# Converting all the texts back to sentences

In [None]:
def make_sentences(data,name):
    """Join each row's tokens in ``data[name]`` back into one sentence, in place.

    Tokens are joined with a single space; any run of whitespace that still
    ends up in the result is collapsed to one space and surrounding
    whitespace is trimmed, so a sentence never carries a trailing space and
    an empty token list becomes the empty string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    name : str
        Label of the column whose values are iterables of string tokens.

    Returns
    -------
    None
    """
    def to_sentence(tokens):
        # Joining with ' ' directly (rather than padding every token with an
        # extra space first) avoids creating double and trailing spaces.
        return re.sub(r'\s+', ' ', ' '.join(tokens)).strip()

    data[name] = data[name].apply(to_sentence)

In [None]:
# Seed the working column with the untouched copy of the raw reviews
data['Review_without_stopwords'] = Edited_Review

# Using the preprocessing function to preprocess the movie-review text
preprocess_Reviews_data(data,'Review_without_stopwords')
# Using tokenizer and removing the stopwords
rem_stopwords_tokenize(data,'Review_without_stopwords')
# Converting all the texts back to sentences
make_sentences(data,'Review_without_stopwords')


#Edits After Lemmatization
# Lemmatization works on its own copy so the stopword-free column is kept as is
final_Edit = data['Review_without_stopwords'].copy()
data["After_lemmatization"] = final_Edit

Lemmatization(data,'After_lemmatization')
# Converting all the texts back to sentences
make_sentences(data,'After_lemmatization')

# Results of Preprocessing data (Removing stopwords & Lemmatization)

In [None]:
# Preview the first 6 rows, now including the two cleaned-text columns
data.head(6)

In [None]:
# Show the review with index label 3 at each stage of the cleaning:
# raw text, after stopword removal, and after lemmatization
print("- Old Review -")
print(data['review'][3])
print("\n- New Review -")
print(data['Review_without_stopwords'][3])
print("\n- Last Edit Review -")
print(data['After_lemmatization'][3])

In [None]:
# Copy of the lemmatized text; this is the column the vectorizer is fitted on
data['reviews_text_new'] = data['After_lemmatization'].copy()

In [None]:
# Replacing Positive -> 1 and Negative -> 0
# The replacement is limited to the 'sentiment' column so the review-text
# columns are neither scanned nor at risk of having a cell rewritten.
data['sentiment'] = data['sentiment'].replace({"positive":1,"negative":0})

In [None]:
# Preview the model's input text next to its sentiment label
data[['reviews_text_new','sentiment']].head(5)

# Building a machine learning model

# Bag-of-words and n-grams

# Divide into training and test sets:

# Applying logistic regression

In [None]:
### Changes with respect to the previous code
### 1. Increasing the n-grams from just 1-grams to 1-, 2- and 3-grams (ngram_range=(1,3))
### 2. No stop_words filter is passed to CountVectorizer. Note, however, that
###    reviews_text_new is derived from text whose stopwords were already
###    removed by rem_stopwords_tokenize, so they cannot show up as features.

bow_counts = CountVectorizer(tokenizer= word_tokenize,
                             ngram_range=(1,3))

# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split below, so its vocabulary also contains test-set n-grams.
bow_data = bow_counts.fit_transform(data.reviews_text_new)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow_data,
                                                                    data['sentiment'],
                                                                    test_size = 0.2,
                                                                    random_state = 0)

In [None]:
# Defining and training the model
# max_iter=200 caps the solver at 200 iterations
lr_model_all_new = LogisticRegression(max_iter = 200)
lr_model_all_new.fit(X_train_bow, y_train_bow)

# Predicting the results on the held-out test features
test_pred_lr_all = lr_model_all_new.predict(X_test_bow)


## Calculate key performance metrics

# Print a classification report comparing the true test labels with the predictions
print(classification_report(y_test_bow,test_pred_lr_all))

In [None]:
from joblib import dump, load 

# save model to file 
# NOTE(review): only the classifier is persisted; the fitted vectorizer
# (bow_counts) is not saved, yet it is needed to turn new text into the
# feature matrix this model expects.
dump(lr_model_all_new, filename="Sentiment_Analysis.joblib")

In [None]:
# Load the classifier back from the joblib file written above
loaded_joblib_model = load(filename="Sentiment_Analysis.joblib")

In [None]:
# Make predictions with the reloaded model on the same test features
joblib_y_preds = loaded_joblib_model.predict(X_test_bow)

In [None]:
# Report the reloaded model's metrics, for comparison with the report
# printed for the in-memory model
print(classification_report(y_test_bow,joblib_y_preds ))