In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Relevant imports

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re

from collections import defaultdict

# Tokenizer imports
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import regexp_tokenize

# NLTK corpus and stemming/lemmatizer imports
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Scikit-learn packages
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, linear_model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Gensim imports
import gensim

In [None]:
# Load the IMDB movie-review CSV from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Distinct label values present in the `sentiment` column.
data.sentiment.unique()

In [None]:
# Row count per sentiment label (class-balance check).
data.sentiment.value_counts()

In [None]:
# Column dtypes.
data.dtypes

In [None]:
# Number of rows whose review text repeats an earlier row.
data.review.duplicated().sum()

In [None]:
data.drop_duplicates(keep = "first", inplace = True)
data.shape

In [None]:
data.isna().sum()

In [None]:
# Convert reviews to lowercase

data.review = data.review.apply(lambda x: str(x).lower())

In [None]:
data.reset_index(inplace = True)

In [None]:
data = data.drop("index", axis = 1)

In [None]:
data

In [None]:
def strip_html(raw_text):
    """Remove HTML tags from ``raw_text``.

    Every ``<...>`` span (matched non-greedily, on a single line) is replaced
    by one space rather than by nothing, so the words on either side of a tag
    such as ``<br />`` stay separate tokens instead of being glued together.

    Args:
        raw_text: The string to clean.

    Returns:
        The text with each tag replaced by a single space. Surrounding
        whitespace is not collapsed or trimmed.
    """
    find_html = re.compile('<.*?>')
    # Substitute a space, not '', to avoid "good<br />movie" -> "goodmovie".
    return find_html.sub(' ', raw_text)

In [None]:
data.review = data.review.apply(lambda x: strip_html(x))

In [None]:
data

In [None]:
# Running WhiteSpace tokenizer 
wpTokenizer = WordPunctTokenizer()
data["review_tokenized"] = [wpTokenizer.tokenize(text) for text in data["review"]]

In [None]:
data

In [None]:
# Stopwords removal & WordNet lemmatization

# Lookup from the first letter of a POS tag to a WordNet POS constant.
# Any letter that is not listed falls back to NOUN via the default factory.
tag_map = defaultdict(
    lambda: wordnet.NOUN,
    {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV},
)

In [None]:
# Clean every tokenised review: keep alphabetic, non-stopword tokens and
# lemmatise each one using the WordNet POS looked up from its tag's first letter.

# Loop-invariant setup, built once instead of per token / per review.
# A set gives constant-time membership tests for the stopword check.
english_stopwords = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()

cleaned_reviews = []
for index, text in enumerate(data.review_tokenized):
    if index % 1000 == 0:
        print(index)  # coarse progress indicator
    word_list = [
        wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(text)
        if word not in english_stopwords and word.isalpha()
    ]
    # Stored as the string form of the list, which is what the TF-IDF step reads.
    cleaned_reviews.append(str(word_list))

# Single positional column assignment instead of one .loc write per row;
# this also does not depend on the index labels being exactly 0..n-1.
data["review_tokenized_cleaned"] = cleaned_reviews

In [None]:
data

In [None]:
data.review_tokenized_cleaned.isna().sum()

In [None]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)

In [None]:
print(train_X.shape)
print(test_X.shape)
print(train_y.shape)
print(test_y.shape)

In [None]:
test_y.value_counts()

In [None]:
train_y.value_counts()

In [None]:
label_enc = LabelEncoder()
train_y = label_enc.fit_transform(train_y)
test_y = label_enc.transform(test_y)

In [None]:
print(np.unique(test_y, return_counts = True))
print(np.unique(train_y, return_counts = True))

In [None]:
tfidf_vect = TfidfVectorizer(max_features = 5000)
tfidf_vect.fit(data.review_tokenized_cleaned)

In [None]:
train_X_tfidf = tfidf_vect.transform(train_X)
test_X_tfidf = tfidf_vect.transform(test_X)

### Modelling Gaussian Naive Bayes

In [None]:
train_X_tfidf_dense = train_X_tfidf.todense()
test_X_tfidf_dense = test_X_tfidf.todense()

In [None]:
nb_model = naive_bayes.GaussianNB()
nb_model.fit(train_X_tfidf_dense, train_y)

In [None]:
preds_nb = nb_model.predict(test_X_tfidf_dense)

In [None]:
preds_nb.shape

In [None]:
accuracy_score(preds_nb, test_y)

In [None]:
confusion_matrix(test_y, preds_nb)

In [None]:
print(classification_report(test_y, preds_nb))

### Support Vector Machine Classifier

Training can take some time; grab a coffee in the meantime :)

In [None]:
svm = svm.SVC(C = 1.0, kernel = "linear", degree = 3, gamma = "auto")
svm.fit(train_X_tfidf, train_y)

In [None]:
preds_svm = svm.predict(test_X_tfidf)
print(preds_svm.shape)

In [None]:
accuracy_score(preds_svm, test_y)

In [None]:
print(classification_report(test_y, preds_svm))

### Logistic Regression

In [None]:
log_reg = linear_model.LogisticRegression(solver = "lbfgs")
log_reg.fit(train_X_tfidf, train_y)

In [None]:
preds_log_reg = log_reg.predict(test_X_tfidf)
preds_log_reg.shape

In [None]:
accuracy_score(preds_log_reg, test_y)

In [None]:
print(classification_report(test_y, preds_log_reg))