# Imports

In [27]:
#libraries for cleaning text 
import nltk
import string
import math
import numpy as np
import pandas as pd
import contractions
from bs4 import BeautifulSoup

#libraries to create decision tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import pickle

#miscelanious
import os

# Load dataset

In [65]:
reviews_df = pd.read_csv('./data/imdb_dataset.csv',encoding='ISO-8859-1').sample(20000,replace=False)
reviews_df.head()

Unnamed: 0,Review,Label
14061,"All you need is great house, a babysitter and ...",pos
25296,This piece ain't really worth a comment.. It's...,neg
48425,Sergeant Ryker is accused of being a traitor d...,pos
5210,It is OK movie if it would be done by high sch...,neg
29613,Spoiler: Bunch of passive-aggressive people ha...,neg


# Cleaning and transforming data

In [66]:
reviews_df['Label'] = reviews_df['Label'].map(lambda label: 1 if label == "pos" else 0 )
reviews_df.head()

Unnamed: 0,Review,Label
14061,"All you need is great house, a babysitter and ...",1
25296,This piece ain't really worth a comment.. It's...,0
48425,Sergeant Ryker is accused of being a traitor d...,1
5210,It is OK movie if it would be done by high sch...,0
29613,Spoiler: Bunch of passive-aggressive people ha...,0


In [67]:

def normalize_document(text):
    stop_words = set( nltk.corpus.stopwords.words('english')+ list(string.punctuation)+["...","*","''","``"])
    text_without_html = BeautifulSoup(text).get_text()
    words = text_without_html.split() 
    words_without_contractions = [contractions.fix(word) for word in words]
    #Join wordlist again to use word tokenize so words can be separated properly without losing meaning
    text_complete = ' '.join(words_without_contractions)
    words_nltk = nltk.word_tokenize(text_complete)
    clean_words = [word.lower() for word in words_nltk if word.lower() not in stop_words]
    clean_text = " ".join(clean_words)
    return clean_text

In [68]:
def normalize_corpus(reviews):
    return np.array([normalize_document(review)for review in reviews])  

In [69]:
reviews_list = reviews_df['Review'].to_list()
normalized_corpus = normalize_corpus(reviews_list)

#  Count Vectorizer

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

In [71]:
cv = CountVectorizer() #Parametros con valores por defecto
cv_matrix = cv.fit_transform(normalized_corpus) #combina fit y luego trasform
cv_matrix

<20000x71716 sparse matrix of type '<class 'numpy.int64'>'
	with 1987308 stored elements in Compressed Sparse Row format>

In [72]:
df = pd.DataFrame(cv_matrix.toarray(), columns=cv.get_feature_names())
df.head()

MemoryError: 

In [37]:
#agregando label al dataframe
df['Label'] = reviews_df['Label'].to_list()
df.head()

Unnamed: 0,00,000,007,0079,007s,0080,0083,009,00o,00pm,...,â½,â¾,ãªtre,ã³r,ã¼ber,ã¼bermensch,ã½,ã½s,ři,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Training model: Decision Tree

In [38]:
feature_cols = cv.get_feature_names() 
X = df[feature_cols] 
y = df["Label"] 

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 
# 70% training, 30% test

In [40]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7000, 52799) (3000, 52799) (7000,) (3000,)


In [41]:
review_tree_classifier = RandomForestClassifier(n_estimators=50)

In [42]:
review_tree_classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [43]:
print('accuracy del clasificador - version 3 : {0:.2f}'.format(accuracy_score(y_test, review_tree_classifier.predict(X_test))))
# confusion matrix
print('matriz de confusión del clasificador - version 3: \n {0}'.format(confusion_matrix(y_test, review_tree_classifier.predict(X_test))))
# precision 
print('precision del clasificador - version 3 : {0:.2f}'.format(precision_score(y_test, review_tree_classifier.predict(X_test))))
# precision 
print('recall del clasificador - version 3 : {0:.2f}'.format(recall_score(y_test, review_tree_classifier.predict(X_test))))
# f1
print('f1 del clasificador - version 3 : {0:.2f}'.format(f1_score(y_test, review_tree_classifier.predict(X_test))))

accuracy del clasificador - version 3 : 0.82
matriz de confusión del clasificador - version 3: 
 [[1183  274]
 [ 259 1284]]
precision del clasificador - version 3 : 0.82
recall del clasificador - version 3 : 0.83
f1 del clasificador - version 3 : 0.83


# Save classifier

In [44]:
classifier_path = os.path.join("classifier", "reviewClasiffierTree.pkl")

In [45]:
classifier_file = open(classifier_path, "wb")

In [46]:
pickle.dump(review_tree_classifier, classifier_file)

In [47]:
classifier_file.close()