Apply the classification algorithms Naive Bayes, Logistic regression and K-Nearest neighbours on the attached imdb dataset of review texts and review sentiment.

Convert the review text to a Bag-of-Words (BoW) model with TF-IDF weights (text preprocessing should be applied first) and predict the review sentiment (positive or negative). Use label encoding to convert the sentiment feature to numerical values. The training/test split for the dataset should be 80/20.

Print the accuracy score of each algorithm on the test dataset to find the most accurate of the three models.

In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import re
import string
# Download the WordNet corpus data used by the WordNetLemmatizer imported above.
nltk.download('wordnet')

In [None]:
# Load the IMDB reviews CSV into a DataFrame (the path is relative to the working directory).
imdb_df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
# Print the (rows, columns) dimensions, then preview the first rows.
print(imdb_df.shape)
imdb_df.head()

### Sampling the data, because the ML models would take a long time on the full dataset

In [None]:
# Keep a random 10% of the rows; the fixed random_state makes the sample reproducible.
imdb_df = imdb_df.sample(frac=0.1, random_state=1)
imdb_df.shape

### Applying noise removal to the review column

In [None]:
# Characters treated as noise: line breaks plus every ASCII punctuation mark.
punct = "\n\r"+string.punctuation

# Built once: maps each noise character to a single space. Replacing (rather
# than deleting) keeps neighbouring words apart, e.g. "good.bad" -> "good bad"
# instead of "goodbad", and a line break no longer merges two words.
_NOISE_TABLE = str.maketrans(punct, " " * len(punct))


def noise_removal(value):
    """Replace punctuation and line-break characters in *value* with spaces.

    Args:
        value: The text to clean.

    Returns:
        A string of the same length as *value* in which every character
        listed in ``punct`` has been replaced by a space; all other
        characters are unchanged.
    """
    return value.translate(_NOISE_TABLE)

Before noise removal:

In [None]:
# Preview the raw review text before any cleaning.
imdb_df['review'].head()

After noise removal:

In [None]:
# Preview the effect of noise_removal on the first rows without modifying the DataFrame.
imdb_df['review'].head().apply(noise_removal)

In [None]:
# Apply noise_removal to every review and store the cleaned text back in the column.
imdb_df['review'] = imdb_df['review'].apply(noise_removal)

### Stop words removal on the review column & TF-IDF Term Weighting

In [None]:
def tokenize(str_input):
    """Split text into lowercase tokens that are stemmed and then lemmatized.

    Every character that is not an ASCII letter acts as a separator. Pieces
    of two characters or fewer are discarded before stemming, and stems of
    two characters or fewer are discarded before lemmatizing.

    Relies on the module-level ``stemmer`` and ``wordnet_lemmatizer`` objects.

    Args:
        str_input: The text to tokenize.

    Returns:
        A list of processed tokens, in their original order.
    """
    letters_only = re.sub(r"(?u)[^A-Za-z]", " ", str_input)
    tokens = []
    for piece in letters_only.lower().split(" "):
        if len(piece) <= 2:
            continue
        stem = stemmer.stem(piece)
        if len(stem) > 2:
            tokens.append(wordnet_lemmatizer.lemmatize(stem))
    return tokens

In [None]:
# Stemmer and lemmatizer instances that tokenize() looks up as module-level names.
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# TF-IDF weighted bag-of-words built with the custom tokenize() function.
# NOTE(review): the built-in English stop-word list is matched against tokens
# after tokenize() has stemmed them, so some stop words may not be filtered —
# confirm this is acceptable.
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

# Learn the vocabulary and IDF weights and build the document-term matrix.
vectors = vectorizer.fit_transform(imdb_df['review'])

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back only on releases that predate it. tolist()
# keeps feature_names a plain list of str, as it was before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Show the first 100 vocabulary terms as a sanity check on the tokenizer output.
print(feature_names[:100])

In [None]:
# Vocabulary size, i.e. the number of TF-IDF feature columns.
print("number of words = ", len(feature_names))

In [None]:
# Convert the TF-IDF matrix to a dense DataFrame with one column per vocabulary term.
# NOTE(review): toarray() materialises the full dense matrix, which can be
# memory-hungry when the vocabulary is large.
imdb_tfidf = pd.DataFrame(vectors.toarray(),columns=feature_names)
imdb_tfidf.head()

In [None]:
# (number of reviews, number of vocabulary terms)
imdb_tfidf.shape

### Converting sentiment column to numerical value

In [None]:
# Count how many reviews carry each sentiment label.
imdb_df['sentiment'].value_counts()

In [None]:
def sentiment_to_numerical(value):
    """Encode a sentiment label as an integer.

    Args:
        value: The sentiment label.

    Returns:
        1 when *value* equals the string "positive" exactly, otherwise 0.
    """
    if value == "positive":
        return 1
    return 0

In [None]:
# Encode the sentiment column as integers (1 for "positive", 0 otherwise) to use as the target.
sentiment = imdb_df['sentiment'].apply(sentiment_to_numerical)
sentiment.head()

In [None]:
# Check the target's length and how many rows fall in each encoded class.
print(sentiment.shape)
sentiment.value_counts()

### Multinomial Naive Bayes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible.
# NOTE(review): imdb_tfidf comes from a vectorizer that was fitted on every
# review before this split, so the vocabulary and IDF weights also reflect the
# test rows. Fitting the vectorizer on the training rows only would avoid that.
x_train, x_test, y_train, y_test = train_test_split(imdb_tfidf, sentiment, test_size=0.2, random_state=0)

print("size of dataset: ",imdb_tfidf.shape)
print("size of training dataset: ",x_train.shape)
print("size of test dataset: ",x_test.shape)

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB()
bayes_clf = clf.fit(x_train, y_train)

In [None]:
# Predict on the held-out rows; y_pred is kept for inspection and is not used
# by the score() call below.
y_pred = bayes_clf.predict(x_test)
# Report accuracy on the test set.
print("Accuracy = ",bayes_clf.score(x_test,y_test))

### Logistic Regression

In [None]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Values of the C parameter to sweep, with the accuracies recorded for each.
C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]
LRtrainAcc = []
LRtestAcc = []

# Fit one model per C value and record its accuracy on both splits.
for param in C:
    clf = linear_model.LogisticRegression(C=param).fit(x_train, y_train)
    y_pred_train, y_pred_test = clf.predict(x_train), clf.predict(x_test)
    LRtrainAcc.append(accuracy_score(y_train, y_pred_train))
    LRtestAcc.append(accuracy_score(y_test, y_pred_test))

# Headline figure: test accuracy of a model fitted with C=1.0.
clf = linear_model.LogisticRegression(C=1.0)
clf.fit(x_train, y_train)
print("Accuracy= ", clf.score(x_test, y_test))

# Plot training and test accuracy against C on a logarithmic x axis.
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.plot(C, LRtrainAcc, 'ro-')
ax1.plot(C, LRtestAcc, 'bv--')
ax1.legend(['Training Accuracy', 'Test Accuracy'])
ax1.set_xscale('log')
ax1.set_xlabel('C')
ax1.set_ylabel('Accuracy')

### K-Nearest neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 5-nearest-neighbours classifier using the Minkowski metric with p=2.
clf = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
clf.fit(x_train, y_train)
Y_predTrain = clf.predict(x_train)
Y_predTest = clf.predict(x_test)

# A classifier's score() returns the accuracy of its own predictions, so the
# value is computed once from Y_predTest and reused instead of triggering a
# third prediction pass over the test set.
knn_test_accuracy = accuracy_score(y_test, Y_predTest)

print(accuracy_score(y_train, Y_predTrain))
print(knn_test_accuracy)
print("accuracy= ", knn_test_accuracy)

# Final Results

- Multinomial Naive Bayes: 81.3%
- Logistic regression: 85%
- K-Nearest neighbor classifier: 70%