# This notebook is written as simply as possible and is aimed mainly at beginners in NLP.
# I am a firm believer in simplicity and low-code work. I have explained my code here; feel free to ask if you don't understand something, I am happy to help. The explanations are inside the comments, so don't overlook them.
# This notebook uses XGBoost for classification, with stratified k-fold cross-validation.

In [None]:
import pandas as pd
import numpy as np

# Read the IMDB reviews CSV into a DataFrame, then preview the first rows
df = pd.read_csv(filepath_or_buffer="../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df.head(n=5)

In [None]:
# Frame dimensions, a divider line, then the number of rows per sentiment label
print(df.shape, "-----------------------", df['sentiment'].value_counts(), sep="\n")

In [None]:
# Count missing values in each column (isna is the canonical name for isnull)
df.isna().sum()

In [None]:
#map positive to 1 and negative to 0
#Already-encoded 1s are also accepted, so re-running this cell keeps the labels
#intact instead of turning every row into 0 (1 != 'positive').
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x in ('positive', 1) else 0)

In [None]:
#create a new column called kfold and fill it with -1 (-1 = not yet assigned to a fold)
df['kfold'] = -1
#randomize rows of the data; the fixed random_state makes the shuffle, and
#therefore the fold membership and the reported scores, reproducible across runs
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)
y = df.sentiment.values #labels, in the shuffled row order

In [None]:
#clean text
import re
import string
from nltk.corpus import stopwords
import xgboost as xgb
from nltk.tokenize import word_tokenize

from sklearn import metrics
from sklearn import model_selection

def clean_text(text):
    """Normalise a raw review string.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and all runs of whitespace are collapsed to single spaces
    (leading/trailing whitespace is dropped).

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    #lowercase every letter
    text = text.lower()

    #removes all punctuation using regex and string module
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)

    #split by all white spaces and join tokens by single space, this will
    #remove all kinds of weird spaces; done after the punctuation removal so
    #that a standalone punctuation mark ("a - b") does not leave a double space
    return " ".join(text.split())

# Run every review through clean_text and store the result back in the same column
df['review'] = df['review'].map(clean_text)

In [None]:
# Preview the first rows of the frame after cleaning
df.head(n=5)

In [None]:
# Five stratified folds: each fold keeps the class proportions of y
kf = model_selection.StratifiedKFold(n_splits = 5)

# split() yields (train indices, validation indices) per fold; each row is
# tagged with the number of the fold in which it is held out for validation
for fold_number, (train_rows, held_out_rows) in enumerate(kf.split(X = df, y = y)):
    df.loc[held_out_rows, 'kfold'] = fold_number
    

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
fold_accuracies = []  #one validation accuracy per fold
for fold_ in range(5):
    #temporary dataframes for train and test: rows tagged with this fold
    #number are held out, all remaining rows are used for training
    train_df = df[df.kfold != fold_].reset_index(drop = True)
    test_df = df[df.kfold == fold_].reset_index(drop = True)

    #initialize TF-IDF vectorizer; token_pattern is None because a custom
    #tokenizer (NLTK word_tokenize) is supplied
    vec = TfidfVectorizer(tokenizer = word_tokenize, token_pattern = None)

    #fit the vectorizer on the training reviews only, then transform both sets,
    #so nothing from the held-out fold leaks into the vocabulary / IDF weights
    X_train = vec.fit_transform(train_df.review)
    X_test = vec.transform(test_df.review)

    #initialize XGBClassifier
    #max_depth: maximum depth of a tree. Increasing this value will make the
    #model more complex and more likely to overfit.
    #Beware that XGBoost aggressively consumes memory when training a deep tree.
    model = xgb.XGBClassifier(max_depth = 8, eta = 0.7, objective= 'binary:logistic', n_estimators = 200,
                              use_label_encoder=False, eval_metric = 'auc')
    #fit the model
    model.fit(X_train, train_df.sentiment)
    preds = model.predict(X_test)

    #calculate accuracy on the held-out fold and remember it
    fold_accuracy = metrics.accuracy_score(test_df.sentiment, preds)
    fold_accuracies.append(fold_accuracy)

    print(f"Fold: {fold_}")
    print(f"Accuracy: {fold_accuracy}")
    print("")

#Keep every fold's score under the name `accuracy`, so that code averaging it
#gets the mean over all folds. Previously `accuracy` was overwritten on each
#iteration and only the last fold's score survived the loop.
accuracy = np.array(fold_accuracies)

In [None]:
#np.mean accepts a single score as well as a sequence of scores; calling
#.mean() directly fails when `accuracy` is a built-in float, which has no such method
print(f"The mean accuracy is: {np.mean(accuracy)}")

86.2% accuracy!

# Accuracy can be increased by setting eta to a much lower value and increasing the number of estimators. 90% accuracy can easily be achieved if you are willing to wait; with only 100 estimators it took 1 hour to train.
## So feel free to play with the parameters.
## Upvote if you like it, or fork it; this gives us motivation to produce more notebooks for the community.