In [27]:
import pandas as pd
import nltk, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
imdb_df = pd.read_csv('dataset/imdb_labelled.txt', sep='\t', header=None)

In [3]:
imdb_df.shape

(1000, 2)

In [4]:
yelp_df = pd.read_csv('dataset/yelp_labelled.txt', sep='\t', header=None)

In [5]:
yelp_df.shape

(1000, 2)

In [6]:
amazon_df = pd.read_csv('dataset/amazon_cells_labelled.txt', sep='\t', header=None)

In [7]:
amazon_df.shape

(1000, 2)

In [9]:
df = imdb_df.append(yelp_df).append(amazon_df)

  df = imdb_df.append(yelp_df).append(amazon_df)


In [10]:
df.shape

(3000, 2)

In [11]:
df.head()

Unnamed: 0,0,1
0,A very very very slowmoving aimless movie abou...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [12]:
df.columns = ['Review', 'Sentiment']

In [13]:
df.head()

Unnamed: 0,Review,Sentiment
0,A very very very slowmoving aimless movie abou...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [17]:
def textProcessing(documents):
    table = str.maketrans('','',string.punctuation)
    for i in range(len(documents)):
        documents[i] = documents[i].translate(table).lower()
        
    tokens = []
    for i in range(len(documents)):
        tokens.append(word_tokenize(documents[i]))
        
    eng_stopwords = stopwords.words("english")

    wordsList = []
    for tokenList in tokens:
        words = []
        for token in tokenList:
            if token not in eng_stopwords:
                words.append(token)
        wordsList.append(words)
        
    wnet = WordNetLemmatizer()
    for i in range(len(wordsList)):
        for j in range(len(wordsList[i])):
            wordsList[i][j] = wnet.lemmatize(wordsList[i][j], pos='v')
            
    final_list = []
    for i in range(len(wordsList)):
        final_list.append(" ".join(wordsList[i]))
        
    return final_list

In [18]:
finalList = textProcessing(df['Review'].values)

In [19]:
finalList[0]

'slowmoving aimless movie distress drift young man'

In [20]:
finalList[-1]

'answer call unit never work'

In [21]:
len(finalList)

3000

In [22]:
tfidf = TfidfVectorizer()
vector = tfidf.fit_transform(finalList).toarray()

In [23]:
vector.shape

(3000, 4532)

In [24]:
x_train, x_test, y_train, y_test = train_test_split(vector, df['Sentiment'], test_size=0.25)

In [25]:
x_train.shape

(2250, 4532)

In [26]:
y_train.shape

(2250,)

In [28]:
# BernoulliNB - 0s and 1s, if features are in binary form
# MultinomialNB - discrete data, word frequency is discrete in text data
# GaussianNB - Normal Distribution, features are continuous

In [29]:
gaussian_nb = GaussianNB()
gaussian_nb.fit(x_train, y_train)

GaussianNB()

In [30]:
y_pred = gaussian_nb.predict(x_test)
print("Accuracy Using Gaussian NB ::",accuracy_score(y_pred, y_test))

Accuracy Using Gaussian NB :: 0.664


In [31]:
multinomial_nb = MultinomialNB()
multinomial_nb.fit(x_train, y_train)

MultinomialNB()

In [32]:
y_pred = multinomial_nb.predict(x_test)
print("Accuracy Using Multinomial NB ::",accuracy_score(y_pred, y_test))

Accuracy Using Multinomial NB :: 0.808


In [33]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression()

In [34]:
y_pred = logistic.predict(x_test)
print("Accuracy Using Logistic Regression ::",accuracy_score(y_pred, y_test))

Accuracy Using Logistic Regression :: 0.8026666666666666


In [35]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [36]:
y_pred = knn.predict(x_test)
print("Accuracy Using KNN ::",accuracy_score(y_pred, y_test))

Accuracy Using KNN :: 0.7386666666666667


In [37]:
y_pred = multinomial_nb.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.80      0.81       374
           1       0.81      0.81      0.81       376

    accuracy                           0.81       750
   macro avg       0.81      0.81      0.81       750
weighted avg       0.81      0.81      0.81       750



In [38]:
from sklearn.metrics import confusion_matrix

In [39]:
confusion_matrix(y_pred, y_test)

array([[301,  71],
       [ 73, 305]], dtype=int64)

In [40]:
y_test.shape

(750,)