In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("/content/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df.shape

(50000, 2)

In [None]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

**Text Normalization**

Tokenization

In [None]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS

In [None]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from bs4 import BeautifulSoup

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#Tokenization of text
tokenizer = ToktokTokenizer()

#Setting stopwords: English
stop_word = nltk.corpus.stopwords.words('english')

In [None]:
def noise_removal(text):
  soup = BeautifulSoup(text,"html.parser")
  text = soup.get_text()
  pattern = '\[[^]]*\]' 
  text = re.sub(pattern,'',text)
  return text

#Apply function on review column
df['review'] = df['review'].apply(noise_removal)

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Stemming

In [None]:
def stemmer(text):
  ps = nltk.porter.PorterStemmer()
  text = ' '.join([ps.stem(word) for word in text.split()])
  return text

#Applying function on review column
df['review'] = df['review'].apply(stemmer)

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


Removing StopWords

In [None]:
def removing_stopwords(text,is_lower_case = False):
  tokenizer = ToktokTokenizer()
  tokens = tokenizer.tokenize(text)
  tokens = [i.strip() for i in tokens]

  if is_lower_case:
    filtoken = [i for i in tokens if tokens not in stop_word]
  else:
    filtoken = [i for i in tokens if i.lower() not in stop_word]
  filtered_text = ' '.join(filtoken)
  return filtered_text

In [None]:
#Apply function on review column
df['review'] = df['review'].apply(removing_stopwords)

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


**Train Test Split**

In [None]:
train_review_df = df.review[:30000]
test_review_df = df.review[30000:]

Bag Of Words

In [None]:
#CountVectorize for Bag of Words
cv = CountVectorizer(min_df = 0, max_df = 1, binary = False, ngram_range = (1,3))
#Transform train review
cv_train = cv.fit_transform(train_review_df)
#Transform test review data
cv_test = cv.transform(test_review_df)

print('BOW train:', cv_train.shape)
print('BOw test:', cv_test.shape)

BOW train: (30000, 4954557)
BOw test: (20000, 4954557)


TF-IDF

In [None]:
#TfidfVectorizer for Bag of Words
tf = TfidfVectorizer(min_df = 0, max_df = 1,use_idf = True, ngram_range = (1,3))
#Transform train review
tf_train = tf.fit_transform(train_review_df)
#Transform test review data
tf_test = tf.transform(test_review_df)

print('Tfidf train:', cv_train.shape)
print('Tfidf test:', cv_test.shape)

Tfidf train: (30000, 4954557)
Tfidf test: (20000, 4954557)


Label Encoding

In [None]:
label = LabelBinarizer()
sentiment_df = label.fit_transform(df['sentiment'])

print(sentiment_df.shape)

(50000, 1)


Train Test Split

In [None]:
train_df = df.sentiment[:30000]
test_df = df.sentiment[30000:]

**Training Model**

In [35]:
logistic = LogisticRegression(max_iter = 500,C=1,random_state=42)

In [37]:
#Fitting odel for bag of words
lr_bow = logistic.fit(cv_train,train_df)
print(lr_bow)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [39]:
#prediction
bow_prediction = logistic.predict(cv_test)
print(bow_prediction)

['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']


In [42]:
#Accuracy
bow_acc = accuracy_score(test_df,bow_prediction)
print("Bow accuracy:",bow_acc)

Bow accuracy: 0.59215


In [40]:
#fitting model for tfidf feature
lr_tfidf = logistic.fit(tf_train,train_df)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [41]:
#prediction
tfidf_prediction = logistic.predict(tf_test)
print(tfidf_prediction)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [44]:
#Accuracy
tfidf_acc = accuracy_score(test_df,tfidf_prediction)
print("Tf_Idf accuracy:",tfidf_acc)

Tf_Idf accuracy: 0.7426
