# Import Libraries 

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import string
string.punctuation
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import Dataset

In [None]:
# Load the training CSV into a DataFrame. Later cells read its 'text' and
# 'label' columns.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input
# mount - confirm/adjust when running elsewhere.
train_df = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv')

In [None]:
# Preview the first few rows to sanity-check the loaded columns.
train_df.head()

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

In [None]:
# Class balance: one bar per distinct label value, bar height = row count.
counts = train_df['label'].value_counts()
ax = sns.barplot(x=counts.index, y=counts)
# Label the axes through the Axes object that barplot drew on.
ax.set_xlabel('Label')
ax.set_ylabel('Count')

In [None]:
def text_cleaning(message, stop_words=None):
    """Normalize one document into a lowercase, space-separated token string.

    Every character in ``string.punctuation`` is deleted, the text is
    lowercased and split on whitespace, and stopwords are dropped.

    Args:
        message: Raw text of a single document.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stopword list is used. Pass a prebuilt
            set to avoid rebuilding that list on every call.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        # Build the lookup once per call and keep it in a set; the previous
        # code called stopwords.words('english') again for every word.
        stop_words = set(stopwords.words('english'))
    # Single pass that deletes all punctuation characters.
    no_punctuation = message.translate(str.maketrans('', '', string.punctuation))
    tokens = [word for word in no_punctuation.lower().split() if word not in stop_words]
    # Join the tokens instead of returning the repr of a Python list
    # ("['great', 'movie']"), which put brackets, quotes and commas in the text.
    return ' '.join(tokens)

In [None]:
# Clean every document in the 'text' column and pull the labels out as an array.
# Despite the "train" in the names, these cover all rows of train_df; the
# train/test split happens in a later cell.
X_train_df = train_df['text'].apply(text_cleaning)
y_train_df = train_df['label'].values

In [None]:
# Bag-of-words features: learn the vocabulary from the cleaned texts and build
# the document-term count matrix in one step.
# NOTE(review): the vocabulary is fit on all rows before the train/test split,
# so held-out documents influence the feature space. Consider splitting the
# text first and calling fit_transform on the training part only.
vectorizer = CountVectorizer()
data_vectorizer = vectorizer.fit_transform(X_train_df)

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible, so all models are scored on the same held-out rows on every
# run instead of a different random subset each time.
X_train, X_test, y_train, y_test = train_test_split(
    data_vectorizer, y_train_df, test_size=0.2, random_state=42
)

# MultinomialNB

In [None]:
# Fit a multinomial Naive Bayes classifier with default hyperparameters on the
# training split.
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows.
y_pred_MNB = MNB.predict(X_test)

In [None]:
# Mean accuracy on both splits: Acc_* is the training score, acc_* the
# held-out score.
Acc_MNB = MNB.score(X_train, y_train)
acc_MNB = MNB.score(X_test, y_test)
for template, score in (
    ('Train Accuracy : {:.2f}%', Acc_MNB),
    ('Test Accuracy : {:.2f}%', acc_MNB),
):
    # Scores are fractions in [0, 1]; print them as percentages.
    print(template.format(score * 100))

In [None]:
# Per-class precision, recall and F1 for Naive Bayes on the held-out split.
print(classification_report(y_test,y_pred_MNB))

# Random Forest Classifier

In [None]:
# Random forest with otherwise default hyperparameters. random_state pins the
# bootstrap and feature sampling so the reported scores are reproducible;
# n_jobs=-1 builds the trees on all available cores without changing the
# fitted model.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows.
y_pred_rfc = rfc.predict(X_test)

In [None]:
# Mean accuracy on both splits: Acc_* is the training score, acc_* the
# held-out score.
Acc_rfc = rfc.score(X_train, y_train)
acc_rfc = rfc.score(X_test, y_test)
for template, score in (
    ('Train Accuracy : {:.2f}%', Acc_rfc),
    ('Test Accuracy : {:.2f}%', acc_rfc),
):
    # Scores are fractions in [0, 1]; print them as percentages.
    print(template.format(score * 100))

In [None]:
# Per-class precision, recall and F1 for the random forest on the held-out split.
print(classification_report(y_test,y_pred_rfc))

# Linear SVC

In [None]:
# Linear support-vector classifier with otherwise default hyperparameters.
# random_state seeds the solver's data shuffling (used by the dual solver) so
# repeated runs give the same model.
Lsvc = LinearSVC(random_state=42)
Lsvc.fit(X_train, y_train)

In [None]:
# Mean accuracy on both splits: Acc_* is the training score, acc_* the
# held-out score.
Acc_Lsvc = Lsvc.score(X_train, y_train)
acc_Lsvc = Lsvc.score(X_test, y_test)
for template, score in (
    ('Train Accuracy : {:.2f}%', Acc_Lsvc),
    ('Test Accuracy : {:.2f}%', acc_Lsvc),
):
    # Scores are fractions in [0, 1]; print them as percentages.
    print(template.format(score * 100))

In [None]:
# Predict the held-out rows and print per-class precision, recall and F1.
y_pred_Lsvc = Lsvc.predict(X_test)
print(classification_report(y_test, y_pred_Lsvc))

# Final Report

In [None]:
# Summary table: one row per model with its held-out (test) accuracy.
model_scores = [
    ('MultinomialNB', acc_MNB),
    ('Random Forest Classifier', acc_rfc),
    ('Linear SVC', acc_Lsvc),
]
output = pd.DataFrame(model_scores, columns=["Model", "Accuracy"])
output

In [None]:
# Horizontal bar chart comparing the held-out accuracy of the three models.
sns.barplot(x='Accuracy', y='Model', data=output)