In [0]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
#print(os.listdir("../input"))
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Load the IMDB review dataset into a DataFrame.
# NOTE(review): the path is an absolute Colab location; the file must exist there for this cell to run.
# pandas is re-imported here although the first cell already imports it.
import pandas as pd
data = pd.read_csv("/content/IMDB Dataset.csv")

In [6]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(50000, 2)

In [7]:
#Take a glance at the first rows to see the two columns: review text and sentiment label
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
#Exploratory analysis: per-column count, number of unique values, most frequent value and its frequency
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [9]:
#Checking for missing values: number of nulls in each column
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [11]:
# Class balance: how many reviews carry each sentiment label
data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [0]:
# Positional hold-out split of the raw (not yet normalized) columns:
# the first 40,000 rows are for training, the remaining rows for testing.
# No shuffling is applied. These four names are not referenced again in the
# visible modelling cells, which slice the cleaned column instead.
train_rev, test_rev = data['review'].iloc[:40000], data['review'].iloc[40000:]
train_sent, test_sent = data['sentiment'].iloc[:40000], data['sentiment'].iloc[40000:]

In [15]:
# Sanity check: sizes of the train pair and the test pair
(train_rev.shape,train_sent.shape),(test_rev.shape,test_sent.shape)

(((40000,), (40000,)), ((10000,), (10000,)))

In [19]:
#Text Normalization: shared tokenizer and English stop-word list used by the cleaning helpers
import nltk
# The stop-word corpus has to be present locally before it can be read
nltk.download('stopwords')
stopword_list = stopwords.words('english')
tokenizer = ToktokTokenizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
#Removing html scripts
def strip_html(text):
  """Parse ``text`` with BeautifulSoup's "html.parser" and return only its text content."""
  return BeautifulSoup(text, "html.parser").get_text()

In [0]:
#Removing the square brackets
def remove_square_brackets(text):
    """Delete every square-bracketed span, brackets included, from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each "[...]" span replaced by the empty string. Text
        outside brackets, including surrounding spaces, is left as is.
    """
    # Raw string: in a normal literal, backslash-bracket is an invalid escape
    # sequence that newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)

In [0]:
#Removing the noisy text
def denoise_text(text):
  """Strip HTML markup from ``text`` first, then drop square-bracketed spans from the result."""
  return remove_square_brackets(strip_html(text))

In [0]:
# Run the HTML / square-bracket cleanup on every review, overwriting the column in place
data['review'] = data['review'].apply(denoise_text)

In [0]:
#Removing special characters
def remove_special_characters(text, removing_digits=True):
  """Replace every character that is not an ASCII letter, a digit or whitespace with a space.

  ``removing_digits`` is accepted but never read: digits are always kept,
  whatever value is passed.
  """
  return re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

In [0]:
# Replace punctuation and other special characters with spaces in every review (column overwritten in place)
data['review'] = data['review'].apply(remove_special_characters)

In [0]:
#Text Stemming
def simple_stemmer(text):
  """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

  Args:
    text: The string to stem.

  Returns:
    The stemmed words joined by single spaces. Runs of whitespace in the
    input collapse, because the text is split on whitespace first.
  """
  # Use the PorterStemmer class imported at the top of the notebook rather than
  # reaching it through the `nltk.porter` attribute; nltk.stem.porter is the
  # module the class is imported from.
  ps = PorterStemmer()
  return ' '.join(ps.stem(word) for word in text.split())

In [0]:
#Alternative: Lemmatization
#resource: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python
# Stand-alone demo on one sentence; it does not touch the review data.
# NOTE(review): word_tokenize and WordNetLemmatizer need NLTK data packages that
# this notebook never downloads (only 'stopwords' is fetched) - confirm they are installed.
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
# Build a filtered list instead of calling remove() on the list being iterated:
# removing during iteration shifts the remaining items and skips the token that
# follows each removed one.
sentence_words = [word for word in nltk.word_tokenize(sentence) if word not in punctuations]

# Two left-aligned columns: the token and its WordNet lemma
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

In [0]:
# Stem every word of every review (column overwritten in place)
data['review'] = data['review'].apply(simple_stemmer)

In [40]:
#Removing Stopwords: build and show the English stop-word set
# NOTE(review): remove_stopwords() below reads `stopword_list`, not this `stop` set.
stop = set(stopwords.words("english"))
print(stop)

{'while', 'needn', 'or', 'ourselves', 'they', 'why', 'them', "couldn't", 'some', 'over', 'been', 'ours', 'any', "aren't", "shan't", 'd', 'him', 'whom', 'having', 'when', 'don', "should've", "don't", 'be', 'no', "shouldn't", 'was', 'up', 'should', 'mightn', 'ain', 'y', 'with', 'has', 'such', 're', 'off', 'who', 'until', "you've", 'yourself', 'these', 'her', 'doesn', "that'll", "didn't", 'theirs', 'through', 'yourselves', 'hadn', 'from', 'out', 'can', "you'll", "she's", 'that', "wouldn't", 'before', 'then', 'by', 'their', 'isn', "mightn't", 's', 'me', 'into', "you'd", 'its', 'are', "wasn't", 'because', 'both', 'if', 'shouldn', 'his', 'had', 'hasn', 'doing', 'and', 'being', 'couldn', 'as', 'mustn', "isn't", 'didn', 'myself', 'nor', 'just', 'how', 'each', 'did', 'won', 'a', 'there', 'below', 'now', 'were', 'above', "weren't", "you're", 'other', 'haven', "won't", 'he', 've', 'have', 'on', "haven't", 'under', 'after', 'own', 'too', 'yours', 'to', 'against', 'our', 'itself', 'herself', 'o', '

In [0]:
def remove_stopwords(text, is_lower_case=False):
  """Drop English stop words from ``text``.

  The text is tokenized with the module-level ``tokenizer`` and every token
  found in the module-level ``stopword_list`` is discarded.

  Args:
    text: The string to filter.
    is_lower_case: When true, tokens are compared as they are; otherwise each
      token is lower-cased for the comparison only (kept tokens retain their
      original casing).

  Returns:
    The surviving tokens joined by single spaces.
  """
  # Set membership is constant-time; the original scanned the whole list per token.
  stopword_set = set(stopword_list)
  tokens = [token.strip() for token in tokenizer.tokenize(text)]
  if is_lower_case:
    kept_tokens = [token for token in tokens if token not in stopword_set]
  else:
    kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
  return ' '.join(kept_tokens)

In [0]:
# Remove stop words from every review (column overwritten in place).
# NOTE(review): this runs after stemming and after punctuation was replaced by spaces,
# so stemmed or split forms may no longer match the stop-word list - confirm the order is intended.
data['review'] = data['review'].apply(remove_stopwords)

In [0]:
#Normalized train reviews
norm_train_reviews=data.review[:40000]
# Join the review strings themselves. Series.to_string() renders the Series as a
# table, so the index labels and alignment padding would become part of the text.
norm_train_string=' '.join(norm_train_reviews.astype(str))
#Spelling correction using Textblob
# correct() returns a new blob rather than modifying the one it is called on,
# so the result must be kept. NOTE(review): correcting the whole training text is slow.
norm_train_spelling=TextBlob(norm_train_string).correct()
#Tokenization using Textblob
norm_train_words=norm_train_spelling.words
# Show a short sample instead of the full word list
norm_train_words[:20]

In [0]:
#Normalized test reviews
norm_test_reviews=data.review[40000:]
# Join the review strings themselves. Series.to_string() renders the Series as a
# table, so the index labels and alignment padding would become part of the text.
norm_test_string=' '.join(norm_test_reviews.astype(str))
#spelling correction using Textblob
# correct() returns a new blob rather than modifying the one it is called on,
# so the result must be kept. NOTE(review): correcting the whole test text is slow.
norm_test_spelling=TextBlob(norm_test_string).correct()
# Print only the beginning of the corrected text rather than the full text
print(str(norm_test_spelling)[:1000])
#Tokenization using Textblob
norm_test_words=norm_test_spelling.words
# Show a short sample instead of the full word list
norm_test_words[:20]

In [0]:
#Bag-of-word model
#Count vectorizer for bag of words (unigrams to trigrams)
# max_df must be the float 1.0 to mean "no upper limit": the integer 1 is an
# absolute document count and would keep only terms found in a single review.
# min_df=1 keeps every term, as the previous integer 0 intended, and is a valid
# integer value for the parameter.
cv=CountVectorizer(min_df=1,max_df=1.0,binary=False,ngram_range=(1,3))
#transformed train reviews (the vocabulary is learned from the training split only)
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
# get_feature_names() is gone in recent scikit-learn; use its replacement when present.
cv_feature_names=cv.get_feature_names_out() if hasattr(cv,'get_feature_names_out') else cv.get_feature_names()
# Show a sample only; the full vocabulary is far too large to display
cv_feature_names[:20]

In [0]:
#TF-IDF model
#Tfidf vectorizer (unigrams to trigrams)
# max_df must be the float 1.0 to mean "no upper limit": the integer 1 is an
# absolute document count and would keep only terms found in a single review.
# min_df=1 keeps every term, as the previous integer 0 intended, and is a valid
# integer value for the parameter.
tv=TfidfVectorizer(min_df=1,max_df=1.0,use_idf=True,ngram_range=(1,3))
#transformed train reviews (vocabulary and idf weights are learned from the training split only)
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

In [0]:
#Labeling the sentiment text
# Turn the sentiment strings into a numeric 0/1 column.
# LabelBinarizer orders classes by sorted label, so with the two labels seen in
# this dataset 'negative' becomes 0 and 'positive' becomes 1.
lb=LabelBinarizer()
#learn the classes, then encode the same column
sentiment_data=lb.fit(data['sentiment']).transform(data['sentiment'])
print(sentiment_data.shape)

In [0]:
#Splitting the sentiment data
# Cut the encoded labels at row 40000 so they line up with the review split:
# rows before the cut are training labels, the rest are test labels.
train_sentiments,test_sentiments=np.split(sentiment_data,[40000])
print(train_sentiments)
print(test_sentiments)

In [0]:
#Modeling: LogisticRegression
# fit() trains the estimator in place and returns that same object, so fitting one
# shared instance on both feature sets would leave lr_bow and lr_tfidf bound to a
# single model trained on TF-IDF only. Each feature space gets its own estimator.
#Fitting the model for Bag of words
# ravel() passes the labels as a flat vector rather than a one-column matrix
lr_bow=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42).fit(cv_train_reviews,train_sentiments.ravel())
print(lr_bow)
#Fitting the model for tfidf features
# `lr` stays bound to the TF-IDF model, which is what it held after the original
# double fit, for any cell that still refers to it.
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments.ravel())
print(lr_tfidf)

In [0]:
#Predicting the model for bag of words
# Predict through the handle returned by the fit on the matching feature space,
# not through `lr`, which only reflects whichever fit ran last.
lr_bow_predict=lr_bow.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr_tfidf.predict(tv_test_reviews)
print(lr_tfidf_predict)


In [0]:
#Accuracy score for bag of words: fraction of test reviews whose predicted label matches the true one
lr_bow_score=accuracy_score(y_true=test_sentiments,y_pred=lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_true=test_sentiments,y_pred=lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)


In [0]:
#Classification report for bag of words
# target_names are matched to the labels in the order given by `labels`.
# The labels come from LabelBinarizer, which encodes 'negative' as 0 and
# 'positive' as 1, so the names must be listed Negative first. The previous
# ['Positive','Negative'] order swapped the two rows of the report.
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,labels=[0,1],target_names=['Negative','Positive'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,labels=[0,1],target_names=['Negative','Positive'])
print(lr_tfidf_report)

In [0]:
#confusion matrix for bag of words
# labels=[1,0] puts class 1 in the first row and column, class 0 in the second
# (class 1 is presumably 'positive' given the LabelBinarizer encoding - confirm).
cm_bow=confusion_matrix(y_true=test_sentiments,y_pred=lr_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(y_true=test_sentiments,y_pred=lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

In [0]:
#training the linear svm
# SGDClassifier with hinge loss is a linear SVM trained by stochastic gradient descent.
# max_iter replaces the removed n_iter argument.
# fit() trains in place and returns the same object, so each feature space gets its
# own estimator; fitting one shared instance twice would leave both names bound to
# the TF-IDF model.
#fitting the svm for bag of words
# ravel() passes the labels as a flat vector rather than a one-column matrix
svm_bow=SGDClassifier(loss='hinge',max_iter=500,random_state=42).fit(cv_train_reviews,train_sentiments.ravel())
print(svm_bow)
#fitting the svm for tfidf features
# `svm` stays bound to the TF-IDF model, which is what it held after the original double fit.
svm=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments.ravel())
print(svm_tfidf)

In [0]:
#TODO: repeat the logistic-regression evaluation for the SVM models: predict on the test data, then report accuracy, the classification report and the confusion matrix.

In [0]:
#Multinomial Naive-Bayes
# fit() trains in place and returns the same object, so each feature space gets its
# own estimator; fitting one shared instance twice would leave both names bound to
# the TF-IDF model.
#fitting the naive Bayes model for bag of words
# ravel() passes the labels as a flat vector rather than a one-column matrix
mnb_bow=MultinomialNB().fit(cv_train_reviews,train_sentiments.ravel())
print(mnb_bow)
#fitting the naive Bayes model for tfidf features
# `mnb` stays bound to the TF-IDF model, which is what it held after the original double fit.
mnb=MultinomialNB()
mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments.ravel())
print(mnb_tfidf)

In [0]:
#TODO: repeat the same prediction and evaluation steps for the Multinomial Naive Bayes models.

In [0]:
#Word Cloud of Positive reviews
# Word cloud built from the words of a single positive review.
# (The line below the heading was bare prose, which is a syntax error, so it is now a comment.)
plt.figure(figsize=(10,10))
# Row 1 is labelled positive in the data.head() output
positive_text=norm_train_reviews[1]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
positive_words=WC.generate(positive_text)
plt.imshow(positive_words,interpolation='bilinear')
# show must be called; the bare attribute `plt.show` does nothing
plt.show()

In [0]:
#WordCloud of negative reviews
# Word cloud built from the words of a single negative review.
# (The line below the heading was bare prose, which is a syntax error, and said "positive".)
plt.figure(figsize=(10,10))
# NOTE(review): row 8 is presumably a negative review - confirm against data['sentiment'][8]
negative_text=norm_train_reviews[8]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
negative_words=WC.generate(negative_text)
plt.imshow(negative_words,interpolation='bilinear')
# show must be called; the bare attribute `plt.show` does nothing
plt.show()