# Imports

In [30]:
#libraries for cleaning text 
import nltk
import string
import math
import numpy as np
import pandas as pd
import contractions
from bs4 import BeautifulSoup

#libraries to create decision tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import pickle

#miscelanious
import os

# Load dataset

In [31]:
reviews_df = pd.read_csv('./data/imdb_dataset.csv',encoding='ISO-8859-1').sample(10000,replace=False)
reviews_df.head()

Unnamed: 0,Review,Label
11899,Before I saw this movie I believed there were ...,neg
33646,The film begins with people on Earth discoveri...,neg
36747,What did I just watch? I spent 90 minutes of m...,neg
16870,This is the best comedy period. It is so under...,pos
48389,"Ti%s and As*, lots of boobies. Some great char...",pos


# Cleaning and transforming data

In [32]:
reviews_df['Label'] = reviews_df['Label'].map(lambda label: 1 if label == "pos" else 0 )
reviews_df.head()

Unnamed: 0,Review,Label
11899,Before I saw this movie I believed there were ...,0
33646,The film begins with people on Earth discoveri...,0
36747,What did I just watch? I spent 90 minutes of m...,0
16870,This is the best comedy period. It is so under...,1
48389,"Ti%s and As*, lots of boobies. Some great char...",1


In [33]:
def normalize_document(text):
    stop_words = set( nltk.corpus.stopwords.words('english')+ list(string.punctuation)+["...","*","''","``"])
    text_without_html = BeautifulSoup(text).get_text()
    words = text_without_html.split() 
    words_without_contractions = [contractions.fix(word) for word in words]
    #Join wordlist again to use word tokenize so words can be separated properly without losing meaning
    text_complete = ' '.join(words_without_contractions)
    words_nltk = nltk.word_tokenize(text_complete)
    clean_words = [word.lower() for word in words_nltk if word.lower() not in stop_words]
    clean_text = " ".join(clean_words)
    return clean_text

In [34]:
def normalize_corpus(reviews):
    return np.array([normalize_document(review)for review in reviews])  

In [35]:
reviews_list = reviews_df['Review'].to_list()
normalized_corpus = normalize_corpus(reviews_list)

# Getting problem vocabulary

In [36]:
def get_problem_vocabulary(normalized_corpus):
    all_tokens = [] 
    for document in normalized_corpus:
        all_tokens.extend(document.split())  
    #[all_tokens.extend(document.split()) for document in normalized_corpus]
    all_tokens_sorted = sorted(set(all_tokens))
    
    token_and_position = {}
    for i, token in enumerate(all_tokens_sorted):
        token_and_position[token] = i
    
    return token_and_position

problem_vocabulary = get_problem_vocabulary(normalized_corpus)


#  One hot encoding

In [37]:
def one_hot_vector(document, problem_vocabulary):
    vector = np.zeros(len(problem_vocabulary),dtype=int)
    for token in document.split():
        vector[problem_vocabulary[token]] = 1
    return vector

In [38]:
vocabulary_keys = list(problem_vocabulary.keys())


In [39]:
def createOneHotDF():
    df = pd.DataFrame(columns = vocabulary_keys)
    for idx,doc in enumerate(normalized_corpus):
        doc_hot_vec = one_hot_vector(doc, problem_vocabulary)
        df.loc[idx] = doc_hot_vec
    return df

In [None]:
df_reviews_oneHot = createOneHotDF()
df_reviews_oneHot

In [14]:
#df_reviews_oneHot.drop(columns=['Label'],inplace=True)
df_reviews_oneHot['Label'] = reviews_df['Label'].to_list()
df_reviews_oneHot.head()

Unnamed: 0,'-die,'-movies,'1947,'20th,'30s/'40s,'40,'50s,'50s-'60s,'51,'60,...,zurer,zzzzz,~steven,â,â¡can,â£10.00,â¨scandal,â´when,ã,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Training model: Decision Tree

In [16]:
feature_cols = vocabulary_keys 
X = df_reviews_oneHot[feature_cols] 
y = df_reviews_oneHot["Label"] 

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 
# 70% training, 30% test

In [18]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700, 21280) (300, 21280) (700,) (300,)


In [27]:
review_tree_classifier = RandomForestClassifier(n_estimators=200)

In [28]:
review_tree_classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
print('accuracy del clasificador - version 3 : {0:.2f}'.format(accuracy_score(y_test, tree_v3.predict(X_test))))
# confusion matrix
print('matriz de confusión del clasificador - version 3: \n {0}'.format(confusion_matrix(y_test, tree_v3.predict(X_test))))
# precision 
print('precision del clasificador - version 3 : {0:.2f}'.format(precision_score(y_test, tree_v3.predict(X_test))))
# precision 
print('recall del clasificador - version 3 : {0:.2f}'.format(recall_score(y_test, tree_v3.predict(X_test))))
# f1
print('f1 del clasificador - version 3 : {0:.2f}'.format(f1_score(y_test, tree_v3.predict(X_test))))

accuracy del clasificador - version 3 : 0.82
matriz de confusión del clasificador - version 3: 
 [[129  34]
 [ 21 116]]
precision del clasificador - version 3 : 0.77
recall del clasificador - version 3 : 0.85
f1 del clasificador - version 3 : 0.81


# Save classifier

In [None]:
classifier_path = os.path.join("classifier", "reviewClasiffierTree.pkl")

In [None]:
classifier_file = open(classifier_path, "wb")

In [None]:
pickle.dump(review_tree_classifier, classifier_file)

In [None]:
classifier_file.close()