### Importing important libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### Loading the dataset

In [None]:
# Read the training data from Train.csv (path is relative to the working directory)
df = pd.read_csv('Train.csv')
df.head()  # preview the first rows

In [None]:
# (rows, columns) of the training frame
print(df.shape)

In [None]:
# Convert to a NumPy array under a new name instead of rebinding `df`:
# the original `df = df.values` made this cell fail on a second run
# (an ndarray has no `.values`) and broke re-running the earlier `df.head()` cell.
train_arr = df.to_numpy()
X = train_arr[:,0]    # first column: the review texts
Y = train_arr[:,1]    # second column: the label of each review
print(X.shape)
print(Y.shape)

In [None]:
# Show the raw text of the first review (index 0)
print(X[0])

In [None]:
# Show the label that belongs to the first review
print(Y[0])

To classify the reviews as positive or negative, we need to capture the meaning of each review and train our classifier on it. We therefore use NLTK (the Natural Language Toolkit) to preprocess the reviews.

###  NLTK pipeline

#### 1. Tokenization
#### 2. Stopwords removal
#### 3. Stemming/Lemmatization
#### 4. Vocab creation/ Vectorization of each review
#### 5. Classification

In [None]:
# Importing NLTK libraries

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

In [None]:
# Take the second review (index 1) as an example and show its raw text
sample = X[1]
print(sample)

In [None]:
# Make sure the stopword corpus is available so the notebook also runs on a
# fresh environment; download only when it is missing.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

tokenizer = RegexpTokenizer('[a-zA-Z]+') # keep runs of ASCII letters only; digits and punctuation are dropped
en_stopwords = set(stopwords.words('english')) # set of English stopwords (set for fast membership tests)
lc = LancasterStemmer() # stemmer applied to every remaining token

In [None]:
# Function to clean a single review

def getCleanReview(review):
    """Turn one raw review into a space-separated string of stemmed tokens.

    The review is lowercased, HTML line-break tags are replaced by spaces,
    the text is split with the module-level ``tokenizer``, tokens found in
    ``en_stopwords`` are dropped, and the remaining tokens are stemmed with
    the module-level stemmer ``lc``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces
        (an empty string if no token survives).
    """
    review = review.lower()  # lowercase first so stopword matching is case-insensitive
    # Replace every "<br />" tag, not only doubled ones: a lone tag would
    # otherwise survive tokenization as the junk token "br".
    review = review.replace("<br />", " ")

    # Tokenizing the review
    tokens = tokenizer.tokenize(review)
    # Removing the stopwords
    useful_tokens = [tok for tok in tokens if tok not in en_stopwords]
    # Stemming the tokens
    stemmed_tokens = [lc.stem(tok) for tok in useful_tokens]

    return ' '.join(stemmed_tokens)

In [None]:
# Sanity check: clean the second review (the one stored in `sample`) and display the result
getCleanReview(X[1])

#### Inputting entire reviews and getting cleaned reviews 

In [None]:
# function to get completely clean documents
def getCleanedDocument(doc):
    """Clean every review in ``doc``.

    Parameters
    ----------
    doc : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        The result of ``getCleanReview`` for each review, in input order.
    """
    return list(map(getCleanReview, doc))

In [None]:
# Clean every training review (lowercase, tokenize, remove stopwords, stem)
cleaned_doc= getCleanedDocument(X)

### Training the model

In [None]:
# Importing libraries 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Bag-of-words vectorizer counting unigrams, bigrams and trigrams
cv = CountVectorizer(ngram_range=(1,3))

In [None]:
# Learn the n-gram vocabulary from the cleaned reviews and convert them into a count matrix.
# NOTE(review): the vocabulary is fit on all reviews before the train/validation split,
# so validation documents contribute n-grams to it — consider fitting on the training split only.
vec_doc = cv.fit_transform(cleaned_doc)

In [None]:
# Multinomial Naive Bayes classifier with default hyperparameters
mnb = MultinomialNB()

In [None]:
# Split the vectorized reviews and labels: 80% for training, the rest for validation.
# random_state is fixed so the split is the same on every run.
X_train,X_val,Y_train,Y_val = train_test_split(vec_doc,Y,train_size=0.8,random_state=5)

In [None]:
# Fit the multinomial model on the training split
mnb.fit(X_train,Y_train)

In [None]:
# Checking the accuracy of the model on the held-out validation split
accuracy = mnb.score(X_val,Y_val)
# The score is computed on X_val/Y_val, so report it as validation accuracy
print("Accuracy on validation set is: ",accuracy)

#### Predictions on test dataset using Multinomial Naive Bayes

In [None]:
# Read the test data from Test.csv (path is relative to the working directory)
df1 = pd.read_csv('Test.csv')

In [None]:
# Take the first column of the test frame as an array of reviews.
# `df1` is left as a DataFrame: the original `df1 = df1.values` made this
# cell fail on a second run because an ndarray has no `.values`.
X_test = df1.to_numpy()[:,0]

In [None]:
# Apply the same cleaning function that was used for the training reviews
Cleanned_X_test = getCleanedDocument(X_test)

In [None]:
# Vectorize with the already-fitted vocabulary (transform only, no refitting on the test reviews)
X_test_vec = cv.transform(Cleanned_X_test)

In [None]:
# Posterior probability of each class for every test review (one row per review)
mnb_post_prob = mnb.predict_proba(X_test_vec)

In [None]:
print(mnb_post_prob[:5,:]) # Displaying the posterior probabilities of the first five test reviews

In [None]:
# Predicted label for every test review from the multinomial model
pred = mnb.predict(X_test_vec)

In [None]:
# Wrap the predictions in a single-column frame named 'label'
res = pd.DataFrame(data = pred,columns=['label'])

In [None]:
# Save the predictions; the row index is written as the 'Id' column
res.to_csv("mnb_predictions.csv",index=True,index_label="Id")

#### Making predictions using Multivariate Bernoulli Naive Bayes 

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Bernoulli Naive Bayes classifier with default hyperparameters
bnb = BernoulliNB()

In [None]:
# Fit on the same training split that the multinomial model used
bnb.fit(X_train,Y_train)

In [None]:
# Accuracy of the Bernoulli model on the held-out validation split
bnb_acc = bnb.score(X_val,Y_val)
# The score is computed on X_val/Y_val, so report it as validation accuracy
print("Accuracy on validation set using Bernoulli Naive Bayes is: ",bnb_acc)

In [None]:
# Posterior probability of each class for every test review (one row per review)
bnb_post_prob = bnb.predict_proba(X_test_vec)

In [None]:
print(bnb_post_prob[:5,:]) # Displaying the posterior probabilities of the first five test reviews

In [None]:
# Predicted label for every test review from the Bernoulli model
bnb_predict = bnb.predict(X_test_vec)

In [None]:
# Wrap the Bernoulli predictions in a single-column frame named 'label'
bnb_predictions = pd.DataFrame(data=bnb_predict,columns=['label'])
# Save them; the row index is written as the 'Id' column
bnb_predictions.to_csv("bnb_predictions.csv",index=True,index_label="Id")