In [None]:
import os

# Print the full path of every file available under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Introduction
The main goal of this notebook is to run an NLP sentiment analysis. The idea is to compare two models capable of differentiating good from bad opinions on a specific topic: in this case, positive vs. negative movie reviews.<br>
The first one is a “Naïve Bayes Classifier”, which uses Bayes’ Theorem and conditional probability to classify reviews. The second is a classic “Logistic Regression”, based on the logit function. For further details check the APPENDIX and SKLearn documentation.<br>
In addition, this notebook compares two ways of vectorizing sentences. One is based only on word frequency, building a "bag of words". The other, “Term Frequency – Inverse Document Frequency” (TF-IDF), incorporates the idea that very frequent terms (words) that are present in both classes have “low differentiating power” and therefore receive less weight. On the other hand, terms that appear more frequently in only one of the classes have “high differentiating power” and receive a higher weight.

# Libraries and Dataset
## Libraries

In [None]:
# classic libraries
import pandas as pd
import numpy as np

# Charts
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns

# cleaning text 
import nltk
from string import punctuation
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from nltk import tokenize

# vectorizer for model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# performance
from sklearn.metrics import confusion_matrix,accuracy_score



## Dataset
The dataset contains 50k movie reviews labeled as positive (50%) or negative (50%).


In [None]:
# Load the IMDB reviews, report the frame's dimensions and preview the first rows.
dataset_path = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
reviews = pd.read_csv(dataset_path)
print(reviews.shape)
reviews.head()

In [None]:
# Share of positive and negative reviews, shown as a pie chart
rev_sent = (
    reviews
    .groupby(['sentiment'])
    .sentiment
    .count()
    .to_frame('Count')
    .reset_index()
)
pie_colors = ('#e64040', '#40a1e6')
plt.pie(
    rev_sent['Count'],
    labels=rev_sent['sentiment'],
    autopct='%1.1f%%',
    colors=pie_colors,
)
plt.title('Reviews %')
plt.show()

In [None]:
# Adding a column with binary classification: positive (1) and negative (0).
# An explicit mapping produces an integer column directly. Replacing strings with
# integers through Series.replace relies on implicit downcasting of the resulting
# object column, which recent pandas versions deprecate.
sentiment_to_bin = {'positive': 1, 'negative': 0}
reviews['sentiment_bin'] = reviews['sentiment'].map(sentiment_to_bin)

# Any label outside the mapping becomes NaN; fail early instead of feeding
# missing targets to the models further down.
assert reviews['sentiment_bin'].notna().all(), "unexpected value in 'sentiment' column"

# Cleaning
I'm making only a few changes. <br>
First part:<br>
(i) all words lowercase <br>
(ii) removing "stopwords" (e.g. pronouns, prepositions and other very common words) <br>
(iii) removing punctuation<br>

Second part:<br>
(iv) Stemmer - The idea is to reduce the word inflection to its root or origin. For instance, reviewers => review<br>

Check the sentence comparison<br>

In [None]:
# Cleaning text - 1
# Lower-cases every review and drops stopwords, punctuation and a few
# HTML / number leftovers. The result is stored in reviews['clean_text_1'].

# stopwords
irrelevant_stuff=nltk.corpus.stopwords.words("english")
# stopwords + punctuation (plus markup fragments and numbers seen in the reviews)
punct=['br','/><','/>','10','15','20','30','80','.<']
punct.extend(punctuation)  # one entry per punctuation character
irrelevant_stuff=irrelevant_stuff+punct
# Membership is tested once per token over the whole corpus, so use a set
# (constant-time lookup) instead of scanning the list every time.
irrelevant_set=frozenset(irrelevant_stuff)

# function to split a sentence into a list of words
split_token=tokenize.WordPunctTokenizer()

# changes are: lower case / remove punctuation and stopwords
# NOTE(review): only the exact tokens listed above are removed; runs of several
# punctuation characters emitted as one token are presumably kept - confirm
# whether that is intended.
clean_text=[]
for opinion in reviews['review']:
    # lower-case each token once, then filter
    lowered_words=(word.lower() for word in split_token.tokenize(opinion))
    clean_text.append(' '.join(word for word in lowered_words if word not in irrelevant_set))
reviews['clean_text_1']=clean_text
        

In [None]:
# Cleaning text - 2
# Stemming: every token of the already cleaned text is reduced to its stem,
# and the stems are joined back into one string per review.
stemmer=SnowballStemmer("english")

clean_text=[
    ' '.join(stemmer.stem(token) for token in split_token.tokenize(cleaned_review))
    for cleaned_review in reviews['clean_text_1']
]
reviews['clean_text_2']=clean_text

In [None]:
# Show the first ten whitespace-separated words of the first review at each
# stage: raw text, after cleaning step 1, after stemming.
for stage_column in ('review', 'clean_text_1', 'clean_text_2'):
    first_words = reviews[stage_column][0].split()[0:10]
    print(' '.join(first_words))

# Preparing the data (split train test/ vectorizer)
Before running any actual model, we first need to split the data into training and testing samples. This way it is possible to develop a model and later validate it. SKLearn provides an easy-to-use function, “train_test_split”.
After splitting the dataset, it is time to vectorize the sentences. This step is necessary because we cannot directly process “sentences/words”, but we can access the frequency of “words” in “sentences” for each classification (positive or negative review). The "bag of words" transforms the dataset into a matrix where each row represents a sentence and each column a word. The number in each row x column represents the frequency of a specific “word” in a specific “sentence” (there is an example below).
As for "Term Frequency - Inverse Document Frequency" it inversely weights the words based on their frequency (details are provided in the APPENDIX).

In [None]:
# Hold out part of the stemmed reviews for testing: 60% of the rows go to
# training, with a fixed random_state so the split is reproducible.
features = reviews['clean_text_2']
labels_bin = reviews['sentiment_bin']
train, test, class_train, class_test = train_test_split(
    features,
    labels_bin,
    random_state=100,
    train_size=0.6,
)

In [None]:
# Transforming reviews into vectors; each result is a sparse matrix with one row
# per review and one column per word. Both vectorizers are fitted on the training
# reviews only and then applied to the test reviews.
# Bag of words
vectorize=CountVectorizer()
bag_of_words_train=vectorize.fit_transform(train)
bag_of_words_test=vectorize.transform(test)

# Term Frequency * Inverse Document Frequency model (TF-IDF)
tfidf=TfidfVectorizer()
tf_train=tfidf.fit_transform(train)
tf_test=tfidf.transform(test)


# Just to have a rough idea what the vectorizer is doing.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorize,"get_feature_names_out"):
    feature_names=vectorize.get_feature_names_out()
else:
    feature_names=vectorize.get_feature_names()
sparse_matrix=pd.DataFrame.sparse.from_spmatrix(bag_of_words_train,columns=feature_names)
sparse_matrix



# Models
## Naive Bayes Classifier

In [None]:
# model_1_1 Multinomial Naive Bayes Classifier on bag-of-words counts:
# fit on the training matrix, predict the test matrix, report the accuracy.
NBC=MultinomialNB()
model_NBC=NBC.fit(bag_of_words_train,class_train)
NBC_predict=model_NBC.predict(bag_of_words_test)
acc_NBC=accuracy_score(class_test,NBC_predict)
# (spelling of "accuracy" fixed in the printed message)
print(f'The accuracy for NBC was {acc_NBC*100:.2f}%')

In [None]:
# Confusion matrix for NBC on bag of words (rows: true class, columns: predicted class)
cm = confusion_matrix(class_test, NBC_predict)
print(cm)
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]
print(f'There are {true_neg} true negatives,{false_neg} false negatives, {true_pos} true positives and {false_pos} false positives,')

In [None]:
# model_1_2 Multinomial Naive Bayes Classifier + tfidf
# A fresh estimator is created here: fit() returns the estimator itself, so
# refitting the shared `NBC` instance would make `model_NBC` and `model_tf` the
# same object and silently overwrite the bag-of-words model.
NBC_tf=MultinomialNB()
model_tf=NBC_tf.fit(tf_train,class_train)
tf_predict=model_tf.predict(tf_test)
acc_NBC_tf=accuracy_score(class_test,tf_predict)
# (spelling of "accuracy" fixed in the printed message)
print(f'The accuracy for NBC with TFIDF was {acc_NBC_tf*100:.2f}%')

In [None]:
# Confusion matrix for NBC on TF-IDF (rows: true class, columns: predicted class)
cm = confusion_matrix(class_test, tf_predict)
print(cm)
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]
print(f'There are {true_neg} true negatives,{false_neg} false negatives, {true_pos} true positives and {false_pos} false positives,')

## Logistic Regression

In [None]:
# model_2_1 Logistic Regression on bag-of-words counts
# NOTE(review): max_iter is left at the library default; if lbfgs reports a
# ConvergenceWarning on this data, consider raising it - TODO confirm.
reg_log=LogisticRegression(solver="lbfgs")
model=reg_log.fit(bag_of_words_train, class_train)
predict_lr=model.predict(bag_of_words_test)
# Score the predictions already computed (same metric as the NBC cells)
# instead of running a second prediction pass through reg_log.score().
acc_lr=accuracy_score(class_test,predict_lr)
# (spelling of "accuracy" fixed in the printed message)
print(f'The accuracy for Log Reg was {acc_lr*100:.2f}%')

In [None]:
# Confusion matrix for Logistic Regression on bag of words (rows: true class, columns: predicted class)
cm = confusion_matrix(class_test, predict_lr)
print(cm)
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]
print(f'There are {true_neg} true negatives,{false_neg} false negatives, {true_pos} true positives and {false_pos} false positives,')

In [None]:
# model_2_2 Logistic Regression + tfidf
# NOTE(review): max_iter is left at the library default; if lbfgs reports a
# ConvergenceWarning on this data, consider raising it - TODO confirm.
reg_log=LogisticRegression(solver="lbfgs")
model=reg_log.fit(tf_train, class_train)
predict_lr_tf=model.predict(tf_test)
# Score the predictions already computed (same metric as the NBC cells)
# instead of running a second prediction pass through reg_log.score().
acc_lr_tf=accuracy_score(class_test,predict_lr_tf)
# (spelling of "accuracy" fixed in the printed message)
print(f'The accuracy for Log Reg with TFIDF was {acc_lr_tf*100:.2f}%')

In [None]:
# Confusion matrix for Logistic Regression on TF-IDF (rows: true class, columns: predicted class)
cm = confusion_matrix(class_test, predict_lr_tf)
print(cm)
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]
print(f'There are {true_neg} true negatives,{false_neg} false negatives, {true_pos} true positives and {false_pos} false positives,')

## Comparison Chart
Two interesting points to note:<br>
(i) TF-IDF performed slightly better with NBC (almost the same) but significantly better with LR<br>
(ii) Overall, Logistic Regression performed better than NBC

In [None]:
# Grouped bar chart (pattern taken from the matplotlib documentation):
# one group per model, one bar per vectorizer type.
labels = ['NBC', 'LR']

# accuracies in percent, ordered like `labels`
vec = [acc_NBC * 100, acc_lr * 100]
tf = [acc_NBC_tf * 100, acc_lr_tf * 100]

x = np.arange(len(labels))  # label locations
width = 0.35                # bar width

fig, axs = plt.subplots()
axs.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

half_width = width / 2
m1 = axs.bar(x - half_width, vec, width, label='Vec')
m2 = axs.bar(x + half_width, tf, width, label='TF IDF')

axs.set_title('Accuracy by Model and Vector Type')
axs.set_ylabel('Accuracy (%)')
axs.set_ylim([40, 100])
axs.set_xticks(x)
axs.set_xticklabels(labels)
axs.legend(loc=8)


def autolabel(ms):
    """Write each bar's height, as a percentage, just above the bar.

    ms: iterable of bar patches (here, the container returned by ``axs.bar``).
    The annotations are drawn on the module-level ``axs``.
    """
    for bar in ms:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        axs.annotate(
            f'{bar_top:.3f}%',
            xy=(bar_center, bar_top),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


for bar_group in (m1, m2):
    autolabel(bar_group)

fig.tight_layout()
plt.show()

# Charts
## Word Clouds

In [None]:
# Build one word cloud per sentiment from the stemmed reviews and show them side by side.
cloud_options = dict(width=800, height=500, max_font_size=110, collocations=False)

# positive reviews: all stemmed texts concatenated into one string
pos_rev = reviews[reviews['sentiment'] == 'positive']
pos_all_words = " ".join(pos_rev['clean_text_2'])
pos_rev_word_cloud = WordCloud(**cloud_options).generate(pos_all_words)

# negative reviews: same treatment
neg_rev = reviews[reviews['sentiment'] == 'negative']
neg_all_words = " ".join(neg_rev['clean_text_2'])
neg_rev_word_cloud = WordCloud(**cloud_options).generate(neg_all_words)

# Plotting: positive on the left, negative on the right
fig, axs = plt.subplots(1, 2, figsize=(20, 7))
panels = (
    (pos_rev_word_cloud, 'Positive Reviews'),
    (neg_rev_word_cloud, 'Negative Reviews'),
)
for ax, (cloud, panel_title) in zip(axs, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(panel_title, size=15)

fig.suptitle('Wordcloud by Sentiment', size=20)
plt.show()

## Word Frequency

In [None]:
# Tokenizer and n most frequent words.
# The whitespace tokenizer gets its own name so that `split_token` (the
# WordPunct tokenizer used by the cleaning cells) is not rebound here;
# otherwise re-running the cleaning cells after this one would tokenize differently.
whitespace_token=tokenize.WhitespaceTokenizer()
n=10

# Positive review: count every token, keep the n most frequent
pos_token=whitespace_token.tokenize(pos_all_words)
pos_freq=nltk.FreqDist(pos_token)
pos_df_freq=pd.DataFrame({"Words":list(pos_freq.keys()),"Frequency":list(pos_freq.values())})
pos_df_freq=pos_df_freq.nlargest(n,'Frequency')

# Negative review: same treatment
neg_token=whitespace_token.tokenize(neg_all_words)
neg_freq=nltk.FreqDist(neg_token)
neg_df_freq=pd.DataFrame({"Words":list(neg_freq.keys()),"Frequency":list(neg_freq.values())})
neg_df_freq=neg_df_freq.nlargest(n,'Frequency')

# Plotting charts
fig, axs=plt.subplots(1,2,figsize=(10,5))

# pos
axs[0].bar(pos_df_freq['Words'],pos_df_freq['Frequency'])
# Rotate the existing tick labels via tick_params; set_xticklabels() without
# fixed tick positions triggers a matplotlib warning.
axs[0].tick_params(axis='x', labelrotation=45)
axs[0].set_ylabel('Frequency')
axs[0].set_title('Positive Reviews',size=10)

# neg
axs[1].bar(neg_df_freq['Words'],neg_df_freq['Frequency'])
axs[1].tick_params(axis='x', labelrotation=45)
axs[1].set_ylabel('Frequency')
axs[1].set_title('Negative Reviews',size=10)


fig.suptitle('Word Frequency by Sentiment',size=15)
plt.show()

# APPENDIX

In [None]:
# Imports for displaying the equation images in this appendix.
# NOTE(review): both imports bind the name `Image`; the second one
# (IPython.display) rebinds it, so the PIL import is shadowed from here on.
from PIL import Image
from IPython.display import Image

## Naive Bayes Classifier
ps is positive sentiment<br>
ns is negative sentiment<br>
reviews is the complete sentence<br>
words are the parts of a review<br>



In [None]:
# Display the Naive Bayes equation image. The path is passed explicitly as
# `filename` so a missing file raises instead of being treated as raw image data.
Image(filename='../input/nlp-equations/NBC.JPG')

## Logistic Regression
p is the probability of the positive review and 1 - p, the negative <br>
β are the coefficients for each X (words/features)<br>
e is the error term<br>

p/(1-p) is also known as odds


In [None]:
# Display the Logistic Regression equation image. The path is passed explicitly as
# `filename` so a missing file raises instead of being treated as raw image data.
Image(filename='../input/nlp-equations/Logistic_Regression.JPG')

## Term Frequency * Inverse Document Frequency


In [None]:
# Display the TF-IDF equation image. The path is passed explicitly as
# `filename` so a missing file raises instead of being treated as raw image data.
Image(filename='../input/nlp-equations/TFIDF.JPG')