### IMDB Review Sentiment Classification

#### Imports

In [7]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K     |████████████████████████████████| 645kB 9.7MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import glob

#### Load Postive Data to Dataframe

-  Iterate the pos folder
- Read files one by one to a list
- Create a df from the list
- Add new column with 'postive' value
-  rename columns

In [12]:
pos_path = r'/home/dl/imdb/pos/pos/' # use your path
pos_files = glob.glob(pos_path + "/*.txt")

pos_li = []
for filename in pos_files:
    with open(filename, "r") as f:
       pos_li.append(f.read())

df_pos = pd.DataFrame(pos_li)
df_pos.loc[:, 'newcol'] = 'postive'
df_pos.columns = ['review', 'sentiment']
df_pos.head()

Unnamed: 0,review,sentiment
0,I first saw Rob Roy twelve years ago. With lit...,postive
1,"""Perhaps we can arrange a meet. "" ""Where are y...",postive
2,"In Northeastern of Brazil, the father of the t...",postive
3,"Yes, I loved this movie when I was a kid. When...",postive
4,"Of all the football films I have watched, this...",postive


In [13]:
neg_path = r'/home/dl/imdb/neg/neg/' # use your path
neg_files = glob.glob(neg_path + "/*.txt")

neg_li = []
for filename in neg_files:
    with open(filename, "r") as f:
       neg_li.append(f.read())

df_neg = pd.DataFrame(neg_li)
df_neg.loc[:, 'newcol'] = 'negative'
df_neg.columns = ['review', 'sentiment']
df_neg.head()

Unnamed: 0,review,sentiment
0,Loony Tunes have ventured (at least) twice int...,negative
1,"in this movie, joe pesci slams dunks a basketb...",negative
2,We went to the movie with a group because the ...,negative
3,"I didn't expect a movie as good as ""In The Lin...",negative
4,I gave this a four purely out of its historica...,negative


- create a new df by appending df_pos and df_neg
- Shuffle all the rows, this important the result will be skewed without this
- describe dataset

In [14]:
df = df_pos.append(df_neg)
df = df.sample(frac=1)
df.describe()

Unnamed: 0,review,sentiment
count,25000,25000
unique,24904,2
top,You do realize that you've been watching the E...,postive
freq,3,12500


- Pretty balanced dataset
- no addition preprocessing required to upsample or downsample
- Accuracy/ confusion matrix should be good evaluation metrics for balanced dataset

In [15]:
df['sentiment'].value_counts()

postive     12500
negative    12500
Name: sentiment, dtype: int64

#### Split data
- Split Test and train dataset
- 80/20 split is good to start, try 75/25 if needed

In [16]:
#split the dataset  
#train dataset
train_reviews=df.review[:20000]
train_sentiments=df.sentiment[:20000]
#test dataset
test_reviews=df.review[20000:]
test_sentiments=df.sentiment[20000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(20000,) (20000,)
(5000,) (5000,)


- TokTok is very efficient in tokenizing the text
- Can try other tokenizers

#### Preporcessing

In [17]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [19]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
df['review']=df['review'].apply(denoise_text)

In [20]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
df['review']=df['review'].apply(remove_special_characters)

#### Stemming

In [22]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
df['review']=df['review'].apply(simple_stemmer)

#### Remove stop words

In [23]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
df['review']=df['review'].apply(remove_stopwords)

{'do', 'through', 'above', "you'll", 'themselves', 'those', 'just', 'their', 'had', 'yourself', 'has', 'a', 'should', 'off', 'herself', 'ours', "you've", 'who', 'were', 'hadn', 'is', 'her', 'won', 'isn', 'out', 'about', 'more', "you're", 'its', 'at', 'doing', 'because', 'how', 'your', "she's", 'not', 'few', 'yours', "hasn't", 'as', 'i', "shouldn't", 't', 'wouldn', 'yourselves', 'will', 'against', 'it', 'from', 'his', 'both', 'why', 'of', 'which', 'you', "that'll", 'mightn', 'we', 'him', 'be', 'own', 'myself', 'ourselves', 'been', 'to', "haven't", 'other', 'some', 'between', 'if', 'no', 'into', 'this', 'me', 'so', 's', "should've", 'does', "it's", "weren't", 'll', 'during', 'here', 'that', "wouldn't", 'theirs', 'these', 'but', 'having', 'didn', 'an', "wasn't", 'down', 'on', "didn't", 'wasn', 'again', "mustn't", "hadn't", 'aren', 'being', 'once', 'only', "shan't", 'under', "you'd", "mightn't", 'what', 'any', 'they', 'and', 'very', "don't", 'now', 're', "aren't", "won't", 'with', 'y', 'up

In [27]:
norm_train_reviews=df.review[:20000]
norm_test_reviews=df.review[20000:]

### Model

#### Bags of words model

In [28]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (20000, 3417150)
BOW_cv_test: (5000, 3417150)


####  TFIDF Model

In [30]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (20000, 3417150)
Tfidf_test: (5000, 3417150)


#### Transform Labels

In [32]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(df['sentiment'])
print(sentiment_data.shape)

(25000, 1)


In [34]:
#Spliting the sentiment data
train_sentiments=sentiment_data[:20000]
test_sentiments=sentiment_data[20000:]

#### Baseline Logistic regression

In [35]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [36]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 0 1 ... 1 0 1]
[0 0 1 ... 1 0 1]


In [37]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.7498
lr_tfidf_score : 0.7446


In [38]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.74      0.76      0.75      2478
    Negative       0.76      0.74      0.75      2522

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.75      0.75      0.75      5000

              precision    recall  f1-score   support

    Positive       0.72      0.79      0.75      2478
    Negative       0.77      0.70      0.73      2522

    accuracy                           0.74      5000
   macro avg       0.75      0.75      0.74      5000
weighted avg       0.75      0.74      0.74      5000



In [39]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[1868  654]
 [ 597 1881]]
[[1764  758]
 [ 519 1959]]


#### Stochastic gradient descent

In [41]:
#training the linear svm
svm=SGDClassifier(loss='hinge',random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [42]:
#Predicting the model for bag of words
svm_bow_predict=svm.predict(cv_test_reviews)
print(svm_bow_predict)
#Predicting the model for tfidf features
svm_tfidf_predict=svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [43]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(test_sentiments,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)
#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(test_sentiments,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

svm_bow_score : 0.5356
svm_tfidf_score : 0.498


In [44]:
#Classification report for bag of words 
svm_bow_report=classification_report(test_sentiments,svm_bow_predict,target_names=['Positive','Negative'])
print(svm_bow_report)
#Classification report for tfidf features
svm_tfidf_report=classification_report(test_sentiments,svm_tfidf_predict,target_names=['Positive','Negative'])
print(svm_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.52      1.00      0.68      2478
    Negative       0.95      0.08      0.15      2522

    accuracy                           0.54      5000
   macro avg       0.73      0.54      0.42      5000
weighted avg       0.73      0.54      0.41      5000

              precision    recall  f1-score   support

    Positive       0.50      1.00      0.66      2478
    Negative       1.00      0.00      0.01      2522

    accuracy                           0.50      5000
   macro avg       0.75      0.50      0.34      5000
weighted avg       0.75      0.50      0.33      5000



In [45]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,svm_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,svm_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[ 212 2310]
 [  12 2466]]
[[  12 2510]
 [   0 2478]]


#### Multinomial Naive Bayes

In [46]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_reviews,train_sentiments)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments)
print(mnb_tfidf)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [47]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

[0 0 1 ... 1 0 1]
[0 0 1 ... 1 0 1]


In [48]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.7518
mnb_tfidf_score : 0.7498


In [49]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)
#Classification report for tfidf features
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.74      0.76      0.75      2478
    Negative       0.76      0.74      0.75      2522

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.75      0.75      0.75      5000

              precision    recall  f1-score   support

    Positive       0.74      0.77      0.75      2478
    Negative       0.76      0.73      0.75      2522

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.75      0.75      0.75      5000



In [50]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,mnb_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,mnb_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[1874  648]
 [ 593 1885]]
[[1842  680]
 [ 571 1907]]
