#  Importing Important Packages

In [None]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re
import spacy
from nltk.corpus import sentiwordnet as swn
from IPython.display import clear_output
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly
plotly.offline.init_notebook_mode (connected = True)
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk import ngrams
# The following code creates a word-document matrix.
from sklearn.feature_extraction.text import CountVectorizer
# Modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Reading Data

In [None]:
# Load the movie-review CSV; later cells read its 'review' and 'sentiment' columns.
data=pd.read_csv('../input/imdb-movie-reviews-dataset/movie_data.csv')

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Preview the first five rows.
data.head(5)

# Looking at the first ten reviews in the data

In [None]:
# Preview the first ten rows.
data.head(10)

# Preprocessing Function

In [None]:
# Work on a lower-cased copy so the raw 'review' column stays untouched.
data['reviews_text_new'] = data['review'].str.lower()

# removing special character
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would leave the text as-is.
data['reviews_text_new'] = data['reviews_text_new'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

In [None]:
# removing stop words
# English stop-word list, held as a set for fast membership tests.
eng_stop_words = stopwords.words('english')
stop_words = set(eng_stop_words)


def stopwords_removal(stop_words, sentence):
    """Return `sentence` re-joined with single spaces after dropping every
    token that appears in `stop_words`."""
    kept = []
    for token in nltk.word_tokenize(sentence):
        if token in stop_words:
            continue
        kept.append(str(token))
    return ' '.join(kept)


# Strip stop words from every cleaned review.
data['reviews_text_new'] = data['reviews_text_new'].apply(
    lambda row: stopwords_removal(stop_words, row)
)

# Lemmatization Function

In [None]:
lemmatizer = WordNetLemmatizer()
# First letter of a POS tag -> WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}


def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of `text` using its POS tag
    and return the lemmas joined by single spaces.

    Tags whose first letter is not in `wordnet_map` are treated as nouns.
    """
    lemmas = []
    for word, pos in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(pos[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)


data["reviews_text_new"] = data['reviews_text_new'].apply(lemmatize_words)

# Results of Preprocessing data (Removing stopwords & Lemmatization)

In [None]:
# Preview the frame now that 'reviews_text_new' has been added.
data.head(6)

In [None]:
# Show the raw and the preprocessed text of the row labelled 3 side by side.
print("- Old Review -")
print(data['review'][3])
print("\n- Last Edit Review -")
print(data['reviews_text_new'][3])

In [None]:
# Replacing Positive -> 1 and Negative -> 0

# Map the labels on the 'sentiment' column only. A frame-wide replace would
# also turn any text cell that is exactly "positive"/"negative" (e.g. a cleaned
# review reduced to one word) into an integer and break later tokenization.
data['sentiment'] = data['sentiment'].replace({"positive":1,"negative":0})

In [None]:
# Preview the two columns the models use: cleaned text and label.
data[['reviews_text_new','sentiment']].head(5)

# Building a machine learning model

# Bag-of-words and n-grams

# Divide into training and test sets:

# Applying logistic regression

In [None]:
# Unigram bag-of-words vectorizer that tokenizes with NLTK's word_tokenize.
bow_counts = CountVectorizer(tokenizer= word_tokenize,
                             lowercase=True,
                             ngram_range=(1,1))

# NOTE(review): the vocabulary is fitted on every row, i.e. before the
# train/test split below, so test-set words are part of the feature space.
bow_data = bow_counts.fit_transform(data.reviews_text_new)

In [None]:
# Hold out 20% of the bag-of-words rows for testing.
# shuffle=False keeps the rows in their original order for the split.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow_data,
                                                                    data['sentiment'],
                                                                    test_size = 0.2,
                                                                    random_state = 0,
                                                                    shuffle=False,
                                                                   stratify=None)

In [None]:
# Define the logistic-regression classifier and fit it on the bag-of-words training rows
lr_model_all_new = LogisticRegression(max_iter = 200)
lr_model_all_new.fit(X_train_bow, y_train_bow)

# Predict labels for the held-out rows; later cells reuse `test_pred_lr_all`
test_pred_lr_all = lr_model_all_new.predict(X_test_bow)


## Calculate key performance metrics

# Print the classification report for the held-out rows
print(classification_report(y_test_bow,test_pred_lr_all))

In [None]:
# Split the cleaned text with the same parameters as the bag-of-words split
# (same test_size, shuffle=False), so the held-out text is taken from the same
# frame in the same order.
X_train_senti, X_test_senti, y_train_senti, y_test_senti = train_test_split(data['reviews_text_new'],
                                                                            data['sentiment'],
                                                                            test_size = 0.2,
                                                                            random_state = 0,
                                                                           shuffle=False,
                                                                           stratify=None)

In [None]:
# Create a new DataFrame for testing and analysing.
# It holds, per held-out review: cleaned text, true label and the LR prediction.
# 'swn_score' is declared here but only filled in by a later cell.

df_test = pd.DataFrame(columns = ['review_test','actual_score', 'lr_score','swn_score'])
df_test['review_test'] = X_test_senti
df_test['actual_score'] = y_test_senti
df_test['lr_score'] = test_pred_lr_all
df_test.head(5)

In [None]:
import nltk
import ssl
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Running totals; `pos` and `neg` are read by the scoring loop later in this cell.
pos=neg=obj=count=0

# POS-tag every held-out review. The token list is named `tokens` so the
# built-in `list` is not rebound for the rest of the notebook session.
postagging = []

for review in X_test_senti:
    tokens = word_tokenize(review)
    postagging.append(nltk.pos_tag(tokens))

df_test['pos_tags'] = postagging

def penn_to_wn(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with 'J', 'N', 'R' or 'V' map to the adjective, noun, adverb
    and verb constants of `wn`; any other tag yields None.
    """
    prefix_to_attr = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolved only on a match, exactly like the original branches.
            return getattr(wn, attr)
    return None


# Returns list of pos-neg and objective score. But returns empty list if not present in senti wordnet.
def get_sentiment(word,tag):
    """Look up SentiWordNet scores for one POS-tagged token.

    Parameters
    ----------
    word : str
        The token to score.
    tag : str
        Its POS tag, as passed to `penn_to_wn`.

    Returns
    -------
    list
        `[synset_name, pos_score, neg_score, obj_score]` for the first WordNet
        synset of the token's lemma, or `[]` when the tag is not a noun,
        adjective or adverb, or when WordNet has no synset for it.
    """
    wn_tag = penn_to_wn(tag)

    # Only nouns, adjectives and adverbs are scored; verbs and unmapped tags are skipped.
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    # Lemmatization
    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    # A synset groups synonymous words that express the same concept; a word
    # may have one or several. Query with the lemma: the previous version
    # computed it and then looked up the raw word instead.
    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    # (An unreachable counter reset that followed this return was removed.)
    return [synset.name(), swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]
    
    ###################################################################################
# Per-review sentiment: sum of positive scores minus sum of negative scores
# over all tokens that SentiWordNet can score.
senti_score = []

for pos_val in df_test['pos_tags']:
    pos = neg = 0
    for word, tag in pos_val:
        score = get_sentiment(word, tag)
        # get_sentiment returns [] for unscorable tokens; test for that
        # explicitly instead of catching every exception with a bare except.
        if not score:
            continue
        pos = pos + score[1]  # positive score is stored at 2nd position
        neg = neg + score[2]  # negative score is stored at 3rd position
    senti_score.append(pos - neg)

# Leave the module-level counters reset, as the original loop did.
pos=neg=0

df_test['senti_score'] = senti_score

In [None]:
# Number of held-out reviews.
len(df_test)

In [None]:
# Preview the frame with the new 'pos_tags' and 'senti_score' columns.
df_test.head(5)

In [None]:
# Turn each SentiWordNet score into a label: non-negative -> 1, negative -> 0.
# Iterating the column itself (instead of the hard-coded labels 40000..49999)
# keeps this correct for any dataset size or split.
overall = [1 if score >= 0 else 0 for score in df_test['senti_score']]

df_test['swn_score']=overall

In [None]:
# Preview the frame with 'swn_score' filled in.
df_test.head(10)

In [None]:
# Count how the two predictors agree with each other and with the truth.
# Boolean masks over whole columns replace the per-row loop over the
# hard-coded labels 40000..49999, so any test-set size works.
lr_pred = df_test['lr_score']
swn_pred = df_test['swn_score']
actual = df_test['actual_score']
agree = lr_pred == swn_pred
differ = lr_pred != swn_pred

case1 = int((agree & (swn_pred == actual)).sum())   # both agree and are right
case2 = int((agree & (swn_pred != actual)).sum())   # both agree and are wrong
case3 = int((differ & (lr_pred == actual)).sum())   # they differ, LR is right
case4 = int((differ & (swn_pred == actual)).sum())  # they differ, SWN is right
case5 = int((differ & (actual == 0)).sum())         # they differ, truth is 0
case6 = int((differ & (actual == 1)).sum())         # they differ, truth is 1

print("case 1",case1) # 58%
print("case 2",case2) # 5%
print("case 3",case3) # 30%
print("case 4",case4) # 5%
print("case 5",case5) # 25%
print("case 6",case6) # 10%


In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the logistic-regression and SentiWordNet labels on the held-out rows.
lr = accuracy_score(y_test_bow, test_pred_lr_all)
# Stored as `swn_accuracy`, not `swn`: rebinding `swn` would replace the
# SentiWordNet corpus reader that get_sentiment() calls.
swn_accuracy = accuracy_score(y_test_bow, overall)
print("lr_accuracy",lr)
print("swn_accuracy",swn_accuracy)

In [None]:
# NOTE(review): the previous loop tested
#     0.58*(lr == swn) or 0.3*(lr != swn)
# as its first condition. One of the two comparisons is always True, so that
# expression is always truthy: every row took the LR prediction and the
# remaining `elif` branches (weights 0.05 / 0.25 / 0.1) could never run.
# The effective behaviour is written out directly below; if a real weighted
# vote was intended it still has to be designed.
final_score = df_test['lr_score'].tolist()

df_test['final_score_upt']=final_score

In [None]:
# Accuracy of `final_score` against the held-out labels.
final_upt = accuracy_score(y_test_bow, final_score)
print("final_hyprid_accuracy",final_upt)

In [None]:
# Voting rule: if both predictors agree take that label, otherwise take the
# LR prediction. In every one of those cases the result equals the LR
# prediction, so the column is copied directly; this also removes the
# hard-coded index range 40000..49999.
final_score1 = df_test['lr_score'].tolist()
# Overwrites the 'final_score_upt' column written by the previous experiment.
df_test['final_score_upt']=final_score1

In [None]:
# Accuracy of `final_score1` against the held-out labels.
final_upt1 = accuracy_score(y_test_bow, final_score1)
print("final_hyprid_accuracy",final_upt1)

In [None]:
# Feature frame for the stacked model: one column per base predictor.
df_train = pd.DataFrame({
    'lr_score': test_pred_lr_all,
    'swn_score': overall,
})
df_train.head(4)

In [None]:
# Split the two-column prediction frame (features) and the true labels from
# df_test (target) 80/20, without shuffling, for the stacked model.
X_trainf, X_testf, y_trainf, y_testf = train_test_split(df_train,
                                                                    df_test['actual_score'],
                                                                    test_size = 0.2,
                                                                    random_state = 0,
                                                                    shuffle=False,
                                                                   stratify=None)



In [None]:
# Keep the fitted meta-classifier under its own name so it can be reused or
# saved; `hyprid` still holds its predictions for X_testf, as before.
hyprid_model = LogisticRegression(max_iter = 200).fit(X_trainf,y_trainf)
hyprid = hyprid_model.predict(X_testf)

In [None]:
# Accuracy of the stacked model's predictions on its own held-out part.
acc = accuracy_score(y_testf, hyprid)
print("accuracy",acc)

In [None]:
# Label distribution of the LR predictions.
df_test['lr_score'].value_counts()

In [None]:
# Label distribution of the SentiWordNet predictions.
df_test['swn_score'].value_counts()

In [None]:
# NOTE(review): with 0/1 labels, floor-dividing the sum by 2 gives 1 only when
# both predictors say 1 (0+1 -> 0), so this behaves like a logical AND rather
# than an average.
averaged_preds = (df_test['lr_score'] + df_test['swn_score'])//2
acc = accuracy_score(y_test_bow, averaged_preds)
print(acc)

In [None]:
# Attach the vocabulary to the model so it travels with the saved file.
# `get_feature_names()` was removed from scikit-learn in 1.2; prefer
# `get_feature_names_out()` and fall back only on old versions. The result is
# stored as a list in both cases.
if hasattr(bow_counts, "get_feature_names_out"):
    lr_model_all_new.feature_names = list(bow_counts.get_feature_names_out())
else:
    lr_model_all_new.feature_names = list(bow_counts.get_feature_names())

In [None]:
from joblib import dump, load 

# save model to file 
dump(lr_model_all_new, filename="Sentiment_Analysis_unigram1.joblib")
# NOTE(review): `hyprid` is assigned from `.predict(X_testf)` in an earlier
# cell, so this file stores an array of predictions, not a fitted estimator.
dump(hyprid, filename="Stacking_Voting.joblib")

In [None]:
# Load the two joblib files written by the previous cell.
loaded_model_lr = load(filename="Sentiment_Analysis_unigram1.joblib")
# NOTE(review): this file was written from `hyprid`, which holds predictions;
# despite its name, `loaded_model_hyprid` is presumably not a model — confirm.
loaded_model_hyprid = load(filename="Stacking_Voting.joblib")

In [None]:
# Vocabulary stored on the reloaded LR model, and its size.
feats = loaded_model_lr.feature_names
feats_len = len(feats)

In [None]:
import string

# Clean one hand-written sentence: lower-case it and strip punctuation.
sent = 'the actor was ugly'
sent = sent.lower().translate(str.maketrans('', '', string.punctuation))

# Tokenize and drop English stop words.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(sent)
filtered_sentence = list(filter(lambda w: w not in stop_words, word_tokens))
listToStr = ' '.join(str(w) for w in filtered_sentence)

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}


def lemmatize_words(text):
    """Return the lemmas of `text` as a list, one entry per token.

    This redefinition tokenizes with word_tokenize and returns the list itself
    rather than a space-joined string.
    """
    lemmas = []
    for word, pos in nltk.pos_tag(word_tokenize(text)):
        wordnet_pos = wordnet_map.get(pos[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmas


lemmatized_output = lemmatize_words(listToStr)
    
    

In [None]:
# Show the list of lemmas produced for the sample sentence.
lemmatized_output

In [None]:
# Rebind `df_test` to a fresh frame with one row per lemma of the sample
# sentence (the earlier evaluation frame is no longer reachable by this name).
df_test = pd.DataFrame(columns = ['review_test'])
df_test['review_test'] = lemmatized_output
df_test.head()

In [None]:
import nltk
import ssl
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Running totals; `pos` and `neg` are read by the scoring loop later in this cell.
pos=neg=obj=count=0

# POS-tag each entry of `lemmatized_output` (each entry is one lemma of the
# sample sentence). The token list is named `tokens` so the built-in `list`
# is not rebound.
postagging = []

for review in lemmatized_output:
    tokens = word_tokenize(review)
    postagging.append(nltk.pos_tag(tokens))

df_test['pos_tags'] = postagging

def penn_to_wn(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    'J' -> adjective, 'N' -> noun, 'R' -> adverb, 'V' -> verb; any other tag
    (including an empty one) returns None.
    """
    attr_by_initial = {'J': 'ADJ', 'N': 'NOUN', 'R': 'ADV', 'V': 'VERB'}
    attr = attr_by_initial.get(tag[:1])
    if attr is None:
        return None
    # Resolved only on a match, exactly like the original branches.
    return getattr(wn, attr)


# Returns list of pos-neg and objective score. But returns empty list if not present in senti wordnet.
def get_sentiment(word,tag):
    """Look up SentiWordNet scores for one POS-tagged token.

    Parameters
    ----------
    word : str
        The token to score.
    tag : str
        Its POS tag, as passed to `penn_to_wn`.

    Returns
    -------
    list
        `[synset_name, pos_score, neg_score, obj_score]` for the first WordNet
        synset of the token's lemma, or `[]` when the tag is not a noun,
        adjective or adverb, or when WordNet has no synset for it.
    """
    wn_tag = penn_to_wn(tag)

    # Only nouns, adjectives and adverbs are scored; verbs and unmapped tags are skipped.
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    # Lemmatization
    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    # A synset groups synonymous words that express the same concept; a word
    # may have one or several. Query with the lemma: the previous version
    # computed it and then looked up the raw word instead.
    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    # (An unreachable counter reset that followed this return was removed.)
    return [synset.name(), swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]
    
    ###################################################################################
# Per-row sentiment: sum of positive scores minus sum of negative scores over
# all tokens that SentiWordNet can score.
senti_score = []

for pos_val in df_test['pos_tags']:
    pos = neg = 0
    for word, tag in pos_val:
        score = get_sentiment(word, tag)
        # get_sentiment returns [] for unscorable tokens; test for that
        # explicitly instead of catching every exception with a bare except.
        if not score:
            continue
        pos = pos + score[1]  # positive score is stored at 2nd position
        neg = neg + score[2]  # negative score is stored at 3rd position
    senti_score.append(pos - neg)

# Leave the module-level counters reset, as the original loop did.
pos=neg=0

df_test['senti_score'] = senti_score

In [None]:
# Preview the per-lemma tags and scores of the sample sentence.
df_test.head()

In [None]:
# Collapse the per-lemma scores into one label: 1 when their total is >= 0,
# otherwise 0.
# NOTE: the result is stored in `sum` (shadowing the built-in) because the
# next cell prints that name.
senti_score_output = senti_score
sum = 0
for token_score in senti_score_output:
    sum = sum + token_score
sum = 1 if sum >= 0 else 0

In [None]:
# NOTE(review): by this point `sum` has been overwritten with the 0/1 label in
# the previous cell, so this prints the predicted label, not an actual sum.
print("Sum of all the elements of an array: " + str(sum))

In [None]:
#len(feats)

In [None]:
#len(sent_features)

In [None]:
#joblib_y_preds = loaded_joblib_model.predict([sent_features])

In [None]:
#print(joblib_y_preds)