In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

In [2]:
#nltk.download("stopwords")

In [3]:
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

In [4]:
dataset

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [5]:
# Apply first level cleaning
import re
import string

#This function converts to lower-case, removes square bracket, removes numbers and punctuation
def text_clean_1(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

cleaned1 = lambda x: text_clean_1(x)

In [6]:
# Let's take a look at the updated text
dataset['cleaned_comments'] = pd.DataFrame(dataset.Comments.apply(cleaned1))
dataset.head(10)

Unnamed: 0,Reviews,Comments,cleaned_comments
0,1,The Da Vinci Code book is just awesome.,the da vinci code book is just awesome
1,1,this was the first clive cussler i've ever rea...,this was the first clive cussler ive ever read...
2,1,i liked the Da Vinci Code a lot.,i liked the da vinci code a lot
3,1,i liked the Da Vinci Code a lot.,i liked the da vinci code a lot
4,1,I liked the Da Vinci Code but it ultimatly did...,i liked the da vinci code but it ultimatly did...
5,1,that's not even an exaggeration ) and at midni...,thats not even an exaggeration and at midnigh...
6,1,"I loved the Da Vinci Code, but now I want some...",i loved the da vinci code but now i want somet...
7,1,"i thought da vinci code was great, same with k...",i thought da vinci code was great same with ki...
8,1,The Da Vinci Code is actually a good movie...,the da vinci code is actually a good movie
9,1,I thought the Da Vinci Code was a pretty good ...,i thought the da vinci code was a pretty good ...


In [7]:
# Apply a second round of cleaning
def text_clean_2(text):
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

cleaned2 = lambda x: text_clean_2(x)

In [8]:
# Let's take a look at the updated text
dataset['cleaned_comments_new'] = pd.DataFrame(dataset['cleaned_comments'].apply(cleaned2))
dataset.head(10)

Unnamed: 0,Reviews,Comments,cleaned_comments,cleaned_comments_new
0,1,The Da Vinci Code book is just awesome.,the da vinci code book is just awesome,the da vinci code book is just awesome
1,1,this was the first clive cussler i've ever rea...,this was the first clive cussler ive ever read...,this was the first clive cussler ive ever read...
2,1,i liked the Da Vinci Code a lot.,i liked the da vinci code a lot,i liked the da vinci code a lot
3,1,i liked the Da Vinci Code a lot.,i liked the da vinci code a lot,i liked the da vinci code a lot
4,1,I liked the Da Vinci Code but it ultimatly did...,i liked the da vinci code but it ultimatly did...,i liked the da vinci code but it ultimatly did...
5,1,that's not even an exaggeration ) and at midni...,thats not even an exaggeration and at midnigh...,thats not even an exaggeration and at midnigh...
6,1,"I loved the Da Vinci Code, but now I want some...",i loved the da vinci code but now i want somet...,i loved the da vinci code but now i want somet...
7,1,"i thought da vinci code was great, same with k...",i thought da vinci code was great same with ki...,i thought da vinci code was great same with ki...
8,1,The Da Vinci Code is actually a good movie...,the da vinci code is actually a good movie,the da vinci code is actually a good movie
9,1,I thought the Da Vinci Code was a pretty good ...,i thought the da vinci code was a pretty good ...,i thought the da vinci code was a pretty good ...


In [9]:
stopset = set(stopwords.words('english'))

In [10]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)

In [11]:
X = vectorizer.fit_transform(dataset.cleaned_comments_new)
y = dataset.Reviews
pickle.dump(vectorizer, open('tranforms.pkl', 'wb'))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Using Naive Bayes For Classification

In [13]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB()

In [14]:
accuracy_score(y_test,clf.predict(X_test))*100

97.47109826589595

In [15]:
clf = naive_bayes.MultinomialNB()
clf.fit(X,y)

MultinomialNB()

In [16]:
accuracy_score(y_test,clf.predict(X_test))*100

98.77167630057804

### Using Logistic Regression For Classification

In [17]:
from sklearn.model_selection import train_test_split

Independent_var = dataset.cleaned_comments_new
Dependent_var = dataset.Reviews

IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.1, random_state = 225)

print('IV_train :', len(IV_train))
print('IV_test  :', len(IV_test))
print('DV_train :', len(DV_train))
print('DV_test  :', len(DV_test))

IV_train : 6226
IV_test  : 692
DV_train : 6226
DV_test  : 692


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs")


from sklearn.pipeline import Pipeline

In [19]:
lgclf = Pipeline([('vectorizer',tvec),('classifier',clf2)])

lgclf.fit(IV_train, DV_train)


from sklearn.metrics import confusion_matrix

predictions = lgclf.predict(IV_test)

confusion_matrix(predictions, DV_test)

array([[304,   2],
       [ 11, 375]], dtype=int64)

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(predictions, DV_test))
print("Precision : ", precision_score(predictions, DV_test, average = 'weighted'))
print("Recall : ", recall_score(predictions, DV_test, average = 'weighted'))

Accuracy :  0.9812138728323699
Precision :  0.981599046757508
Recall :  0.9812138728323699


In [21]:
#example = ["Ok, so the Da Vinci Code movie sucked incredibly,. ( except for the part were we see Paul Bentlys butt, that was kinda cool.."]
example=["Harry Potter is AWESOME I don't care if anyone says differently!.."]
result = lgclf.predict(example)

print(result)

[0]


In [27]:
#saving the trained model in one file on disk
filename = 'nlp_model2.pkl'
pickle.dump(lgclf, open(filename, 'wb'))