In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import re
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the two review splits from CSV files located in the working directory.
train = pd.read_csv(filepath_or_buffer='reviews_train.csv')
test = pd.read_csv(filepath_or_buffer='reviews_test.csv')

In [3]:
# Preview the first rows of the training frame (review text, label, source file).
train.head()

Unnamed: 0,review,label,file
0,The film attempts to be a mockumentary--shot i...,0,6514_2.txt
1,It does seem like this film is polarizing us. ...,1,7463_10.txt
2,I'm both amused and disgusted by the people wh...,0,5051_1.txt
3,I can't stand most reality shows and this one ...,0,388_1.txt
4,"Also known as ""Water Lilies"" this film tells t...",1,1384_7.txt


In [4]:
# (rows, columns) of the training frame.
train.shape

(25000, 3)

In [5]:
# Distinct values present in the label column.
train['label'].unique()

array([0, 1])

In [6]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,review,label,file
0,This is possibility the worst and most disappo...,0,10855_1.txt
1,This film is absolute gold. If you haven't see...,1,325_10.txt
2,"Talk about your wild life. Barely a B-movie, b...",0,4476_4.txt
3,"putting aside the ""i'm so sure""s and ""totally ...",1,1816_10.txt
4,I remember really liking BATMAN RETURNS when i...,1,7991_10.txt


In [7]:
# Two independent, initially empty lists that will hold the cleaned review
# strings for the train and test splits respectively.
corpus_train, corpus_test = [], []

# Text Preprocessing for the Train Dataset

In [8]:
# Clean every training review: keep letters only, lowercase, drop English
# stop words and stem the remaining tokens.

# Loop-invariant helpers are built once. Previously the stemmer was recreated
# per review and the stop-word list was re-read for every single token; a set
# also makes each membership test constant-time.
stemmer = SnowballStemmer('english')
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell cannot append the
# corpus a second time.
corpus_train = []

# Iterate over the column itself rather than a hard-coded row count of 25000.
for raw_review in train['review']:
    # Every run of non-letter characters (whitespace included) becomes one
    # space, so no separate whitespace-collapsing pass is required.
    review = re.sub('[^a-zA-Z]+', ' ', raw_review).lower()

    review = [stemmer.stem(w) for w in word_tokenize(review) if w not in english_stopwords]
    review = ' '.join(review)

    corpus_train.append(review)


In [9]:
# Number of cleaned training reviews collected so far.
len(corpus_train)

25000

# Text Preprocessing for the Test Dataset

In [10]:
# Clean every test review exactly like the training reviews: keep letters
# only, lowercase, drop English stop words and stem the remaining tokens.

# Loop-invariant helpers are built once. Previously the stemmer was recreated
# per review and the stop-word list was re-read for every single token; a set
# also makes each membership test constant-time.
stemmer = SnowballStemmer('english')
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell cannot append the
# corpus a second time.
corpus_test = []

# Iterate over the column itself rather than a hard-coded row count of 25000.
for raw_review in test['review']:
    # Every run of non-letter characters (whitespace included) becomes one
    # space, so no separate whitespace-collapsing pass is required.
    review = re.sub('[^a-zA-Z]+', ' ', raw_review).lower()

    review = [stemmer.stem(w) for w in word_tokenize(review) if w not in english_stopwords]
    review = ' '.join(review)

    corpus_test.append(review)


In [11]:
# Number of cleaned test reviews collected so far.
len(corpus_test)

25000

# Bag of Words

In [12]:
# Bag-of-words vectorizer configuration; see the scikit-learn CountVectorizer
# documentation for the exact meaning of each threshold.
cv = CountVectorizer(
    max_df=0.90,
    min_df=2,
    max_features=2000,
)

In [13]:
# Fit the vectorizer on the cleaned training reviews, then convert the
# resulting term-count matrix into a regular array.
train_term_counts = cv.fit_transform(corpus_train)
X = train_term_counts.toarray()

In [14]:
# (documents, vocabulary terms) of the training feature matrix.
X.shape

(25000, 2000)

In [15]:
# Select positional column 1 with a slice so the result stays two-dimensional,
# then take the underlying array as the target.
train_label_column = train.iloc[:, 1:2]
y = train_label_column.values

In [16]:
# Shape of the target array (a single column, one row per review).
y.shape

(25000, 1)

In [17]:
# Hold out a quarter of the labelled training data for validation; the fixed
# random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Using Naive Bayes Classifier

In [18]:
# Train a Gaussian Naive Bayes model on the training split and predict the
# held-out validation split.
classifier = GaussianNB()

# y_train is a single-column 2-D array; flatten it to the 1-D label vector the
# estimator expects. Passing the column as-is only worked through an implicit
# conversion whose warning was hidden by the global warnings filter.
classifier.fit(x_train, y_train.ravel())

y_pred = classifier.predict(x_test)

In [19]:
# F1 score of the validation predictions against the held-out labels.
score = f1_score(y_true=y_test, y_pred=y_pred)
score

0.6808193668528865

In [20]:
# Confusion matrix for the validation split (rows: true label, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2708,  463],
       [1251, 1828]])

In [21]:
# Fraction of validation reviews classified correctly.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
acc

0.72576

# Predicting for the Test Dataset

In [22]:
# Encode the cleaned test reviews with the already-fitted vectorizer (no
# re-fitting), then convert the term-count matrix into a regular array.
test_term_counts = cv.transform(corpus_test)
x = test_term_counts.toarray()

In [23]:
# (documents, vocabulary terms) of the test feature matrix.
x.shape

(25000, 2000)

In [27]:
# Predict a label for every test review with the fitted classifier.
y_pred_test = classifier.predict(x)

In [28]:
# Show the predicted labels for the test set.
y_pred_test

array([0, 1, 0, ..., 0, 1, 0])

In [32]:
# Preview the test frame again before extracting its label column.
test.head()

Unnamed: 0,review,label,file
0,This is possibility the worst and most disappo...,0,10855_1.txt
1,This film is absolute gold. If you haven't see...,1,325_10.txt
2,"Talk about your wild life. Barely a B-movie, b...",0,4476_4.txt
3,"putting aside the ""i'm so sure""s and ""totally ...",1,1816_10.txt
4,I remember really liking BATMAN RETURNS when i...,1,7991_10.txt


In [35]:
# Select positional column 1 of the test frame with a slice (kept
# two-dimensional) and take the underlying array as the ground truth.
test_label_column = test.iloc[:, 1:2]
Y = test_label_column.values

In [36]:
# F1 score of the test-set predictions against the test labels.
score_test = f1_score(y_true=Y, y_pred=y_pred_test)
score_test

0.6544990306870301

In [39]:
# Confusion matrix for the test set (rows: true label, columns: predicted).
matrix = confusion_matrix(y_true=Y, y_pred=y_pred_test)
matrix

array([[10772,  1728],
       [ 5579,  6921]])

In [41]:
# Fraction of test reviews classified correctly.
accuracy = accuracy_score(y_true=Y, y_pred=y_pred_test)
accuracy

0.70772