Apply the classification algorithms Naive Bayes, Logistic Regression and K-Nearest Neighbours to the attached IMDb dataset of review texts and review sentiments.

Convert the review text to a Bag-of-Words (BoW) model with TF-IDF weights (text preprocessing should be applied first) and predict the review sentiment (positive or negative). Use label encoding to convert the sentiment feature to numerical values. The training/test split for the dataset should be 80/20.

Print the accuracy score for each algorithm on the test dataset to find the most accurate model among the three created models.

In [None]:
import pandas as p

# Work on a 5,000-row window (rows 20000-24999) of the 50k-review CSV.
# The explicit copy detaches the window from the full frame that read_csv
# returned, so the column assignments made during preprocessing act on an
# independent DataFrame rather than on a slice of a larger one.
ds = p.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')[20000:25000].copy()

In [None]:
# Text preprocessing

import string
from nltk.corpus import stopwords
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Label-encode the target column: 'positive' -> 1, 'negative' -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on ds['sentiment']: an in-place method on a
# chained selection is deprecated in pandas and leaves ds untouched when
# Copy-on-Write is enabled.
ds['sentiment'] = ds['sentiment'].replace({'positive': 1, 'negative': 0})
def conlow(txt):
    """Return ``txt`` converted to lower case."""
    lowered = txt.lower()
    return lowered
# Apply conlow to every review and store the result back in the 'review' column.
ds['review'] = ds['review'].apply(conlow)

def delspec(txt):
    """Replace every non-alphanumeric character of ``txt`` with a space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept
    unchanged; everything else, including punctuation and whitespace,
    becomes a single space, so the result has the same length as ``txt``.

    The string is assembled with one ``str.join`` rather than by repeated
    concatenation, which avoids building a new string per character.
    """
    return ''.join(c if c.isalnum() else ' ' for c in txt)
# Apply delspec to every review (non-alphanumeric characters become spaces)
# and store the result back in the 'review' column.
ds['review'] = ds['review'].apply(delspec)

# Load the English stop-word list once and keep it as a frozenset: the
# previous code called stopwords.words('english') again for every single
# word and searched the returned list linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def delsw(txt):
    """Split ``txt`` on whitespace and drop English stop words.

    Args:
        txt: The review text as a single string.

    Returns:
        A new list holding, in their original order, the whitespace-separated
        words of ``txt`` that are not in NLTK's English stop-word list.
    """
    return [word for word in txt.split() if word not in _ENGLISH_STOPWORDS]


# From here on each entry of the 'review' column is a list of words.
ds['review'] = ds['review'].apply(delsw)

porstem = PorterStemmer()


def stemming(txt):
    """Return a new list with the Porter stem of every word in ``txt``.

    Args:
        txt: An iterable of word strings (one review after stop-word removal).

    Returns:
        A fresh list of stems in the same order as the input words. No
        module-level scratch list is used, so calls do not share state.
    """
    return [porstem.stem(word) for word in txt]


ds['review'] = ds['review'].apply(stemming)
def jb(li):
    """Join the strings in ``li`` into one string separated by single spaces."""
    separator = " "
    return separator.join(li)
# Apply jb to every review: each list of words is joined back into one
# space-separated string.
ds['review'] = ds['review'].apply(jb)

wordlem = WordNetLemmatizer()


def tokenize(str_input):
    """Split ``str_input`` into lemmatized, lower-case word tokens.

    Every character outside A-Z / a-z is first turned into a space, the
    text is lower-cased and split on single spaces, and only pieces longer
    than two characters are kept. Each kept piece is passed through the
    WordNet lemmatizer. The resulting list is returned.
    """
    letters_only = re.sub(r"(?u)[^A-Za-z]", " ", str_input)
    pieces = letters_only.lower().split(" ")
    lemmas = []
    for piece in pieces:
        # Splitting on a single space yields empty strings between
        # consecutive spaces; the length filter discards those too.
        if len(piece) > 2:
            lemmas.append(wordlem.lemmatize(piece))
    return lemmas

In [None]:
# Convert the review text to Bag-of-Word (BOW) model with TF-IDF weights

from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF weighted bag-of-words matrix using the custom tokenize
# function. token_pattern=None states explicitly that the default regex
# tokenizer is not used, which avoids scikit-learn's "token_pattern will not
# be used since tokenizer is not None" warning.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the IDF weights also see the test documents. Consider fitting on
# the training rows only.
vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
vectors = vectorizer.fit_transform(ds['review'])
# Convert the sparse TF-IDF matrix to a dense DataFrame. This is
# memory-hungry; presumably it is done because GaussianNB, used in the Naive
# Bayes cell, needs dense input.
X = p.DataFrame(vectors.toarray())

In [None]:
# Splitting the dataset into training/test sets (80/20)

from sklearn.model_selection import train_test_split

# Target vector: the label-encoded sentiment column, selected by name rather
# than by position so the choice does not depend on column order.
y = ds['sentiment'].values
# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Naive Bayes 

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Multinomial Naive Bayes: fit on the training split, score on the test split.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
print('Accuracy (Multinomial): ', accuracy_score(y_test, y_pred_mnb), sep='\n')

# Gaussian Naive Bayes on the same split, for comparison.
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print('Accuracy (Gaussian): ', accuracy_score(y_test, y_pred_gnb), sep='\n')

# Note: MultinomialNB works best with text data and gives better accuracy than GaussianNB

In [None]:
# Logistic regression

from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline

# Values of C (inverse regularization strength) to sweep.
C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]
LRtrainAcc = []
LRtestAcc = []

# Fit one model per C and record its accuracy on both splits.
for param in C:
    clf = linear_model.LogisticRegression(C=param).fit(X_train, y_train)
    Y_predTrain = clf.predict(X_train)
    Y_predTest = clf.predict(X_test)
    LRtrainAcc.append(accuracy_score(y_train, Y_predTrain))
    LRtestAcc.append(accuracy_score(y_test, Y_predTest))

# Reported score: a fresh model with C=1.0, evaluated on the test split.
clf = linear_model.LogisticRegression(C=1.0)
print('Accuracy (Logistic regression): ')
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Training vs. test accuracy across the C sweep, on a log-scaled x axis.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 6))
ax1.plot(C, LRtrainAcc, 'ro-')
ax1.plot(C, LRtestAcc, 'bv--')
ax1.legend(['Training Accuracy', 'Test Accuracy'])
ax1.set_xscale('log')
ax1.set_xlabel('C')
ax1.set_ylabel('Accuracy')

In [None]:
# K-Nearest neighbours

from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to sweep.
numNeighbors = [1, 5, 10, 15, 20, 25, 30]
trainAcc = []
testAcc = []

# Fit one classifier per k (Minkowski metric with p=2) and record its
# accuracy on both splits.
for k in numNeighbors:
    clf = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2).fit(X_train, y_train)
    Y_predTrain = clf.predict(X_train)
    Y_predTest = clf.predict(X_test)
    trainAcc.append(accuracy_score(y_train, Y_predTrain))
    testAcc.append(accuracy_score(y_test, Y_predTest))

# Reported score: a fresh model with k=5, evaluated on the test split.
clf = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)

print('Accuracy (K-Nearest neighbors): ')
print(clf.score(X_test, y_test))

# Training vs. test accuracy across the k sweep.
plt.plot(numNeighbors, trainAcc, 'ro-')
plt.plot(numNeighbors, testAcc, 'bv--')
plt.legend(['Training Accuracy', 'Test Accuracy'])
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

Conclusion:

Based on the accuracy score for each algorithm, the most accurate model among the three created models is the Logistic regression algorithm model (accuracy = 0.853).