# Imports

In [1]:
#libraries for cleaning text 
import nltk
import string
import math
import numpy as np
import pandas as pd
import contractions
from bs4 import BeautifulSoup

#libraries to create decision tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import pickle

#miscelanious
import os

# Load dataset

In [2]:
reviews_df = pd.read_csv('./data/imdb_dataset.csv',encoding='ISO-8859-1').sample(10000,replace=False)
reviews_df.head()

Unnamed: 0,Review,Label
2836,In addition to all the negative reviews: I was...,neg
21984,I absolutely loved this movie! It's my number ...,pos
11828,The movie had no excitement and does not have ...,neg
10801,I lasted almost ninety minutes through this dr...,neg
20239,After watching John preform this one of a kind...,pos


# Cleaning and transforming data

In [3]:
reviews_df['Label'] = reviews_df['Label'].map(lambda label: 1 if label == "pos" else 0 )
reviews_df.head()

Unnamed: 0,Review,Label
2836,In addition to all the negative reviews: I was...,0
21984,I absolutely loved this movie! It's my number ...,1
11828,The movie had no excitement and does not have ...,0
10801,I lasted almost ninety minutes through this dr...,0
20239,After watching John preform this one of a kind...,1


In [4]:

def normalize_document(text):
    stop_words = set( nltk.corpus.stopwords.words('english')+ list(string.punctuation)+["...","*","''","``"])
    text_without_html = BeautifulSoup(text).get_text()
    words = text_without_html.split() 
    words_without_contractions = [contractions.fix(word) for word in words]
    #Join wordlist again to use word tokenize so words can be separated properly without losing meaning
    text_complete = ' '.join(words_without_contractions)
    words_nltk = nltk.word_tokenize(text_complete)
    clean_words = [word.lower() for word in words_nltk if word.lower() not in stop_words]
    clean_text = " ".join(clean_words)
    return clean_text

In [5]:
def normalize_corpus(reviews):
    return np.array([normalize_document(review)for review in reviews])  

In [6]:
reviews_list = reviews_df['Review'].to_list()
normalized_corpus = normalize_corpus(reviews_list)

#  Count Vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
cv = CountVectorizer() #Parametros con valores por defecto
cv_matrix = cv.fit_transform(normalized_corpus) #combina fit y luego trasform
cv_matrix

<10000x53120 sparse matrix of type '<class 'numpy.int64'>'
	with 983955 stored elements in Compressed Sparse Row format>

In [9]:
df = pd.DataFrame(cv_matrix.toarray(), columns=cv.get_feature_names())
df.head()

Unnamed: 0,00,000,00000000000,00000001,000dm,001,007,0079,007s,0080,...,¼anton,¼the,âªsen,âº,â½,â¾,ãµes,ãºber,ã¼ber,četvorka
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
#agregando label al dataframe
df['Label'] = reviews_df['Label'].to_list()
df.head()

Unnamed: 0,00,000,00000000000,00000001,000dm,001,007,0079,007s,0080,...,¼the,âªsen,âº,â½,â¾,ãµes,ãºber,ã¼ber,četvorka,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Training model: Decision Tree

In [11]:
feature_cols = cv.get_feature_names() 
X = df[feature_cols] 
y = df["Label"] 

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 
# 70% training, 30% test

In [13]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7000, 53120) (3000, 53120) (7000,) (3000,)


In [14]:
review_tree_classifier = RandomForestClassifier(n_estimators=50)

In [15]:
review_tree_classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
print('accuracy del clasificador - version 3 : {0:.2f}'.format(accuracy_score(y_test, review_tree_classifier.predict(X_test))))
# confusion matrix
print('matriz de confusión del clasificador - version 3: \n {0}'.format(confusion_matrix(y_test, review_tree_classifier.predict(X_test))))
# precision 
print('precision del clasificador - version 3 : {0:.2f}'.format(precision_score(y_test, review_tree_classifier.predict(X_test))))
# precision 
print('recall del clasificador - version 3 : {0:.2f}'.format(recall_score(y_test, review_tree_classifier.predict(X_test))))
# f1
print('f1 del clasificador - version 3 : {0:.2f}'.format(f1_score(y_test, review_tree_classifier.predict(X_test))))

accuracy del clasificador - version 3 : 0.83
matriz de confusión del clasificador - version 3: 
 [[1291  222]
 [ 300 1187]]
precision del clasificador - version 3 : 0.84
recall del clasificador - version 3 : 0.80
f1 del clasificador - version 3 : 0.82


# Save classifier

In [17]:
classifier_path = os.path.join("classifier", "reviewClasiffierTree.pkl")

In [18]:
classifier_file = open(classifier_path, "wb")

In [19]:
pickle.dump(review_tree_classifier, classifier_file)

In [20]:
classifier_file.close()