# Imports

In [16]:
#libraries for cleaning text 
import nltk
import string
import math
import numpy as np
import pandas as pd
import contractions
from bs4 import BeautifulSoup

#libraries to create decision tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import pickle

#miscelanious
import os

# Load dataset

In [17]:
reviews_df = pd.read_csv('./data/imdb_dataset.csv',encoding='ISO-8859-1').sample(1000,replace=False)
reviews_df.head()

Unnamed: 0,Review,Label
3166,Few movies have dashed expectations and upset ...,neg
47925,"Well I don't personally like rap, but I still ...",pos
38675,i would have to say that this is the first qua...,pos
895,When a movie's claim to fame is that Martin Sh...,neg
30709,I can't stand most reality shows and this one ...,neg


# Cleaning and transforming data

In [18]:
reviews_df['Label'] = reviews_df['Label'].map(lambda label: 1 if label == "pos" else 0 )
reviews_df.head()

Unnamed: 0,Review,Label
3166,Few movies have dashed expectations and upset ...,0
47925,"Well I don't personally like rap, but I still ...",1
38675,i would have to say that this is the first qua...,1
895,When a movie's claim to fame is that Martin Sh...,0
30709,I can't stand most reality shows and this one ...,0


In [19]:

def normalize_document(text):
    stop_words = set( nltk.corpus.stopwords.words('english')+ list(string.punctuation)+["...","*","''","``"])
    text_without_html = BeautifulSoup(text).get_text()
    words = text_without_html.split() 
    words_without_contractions = [contractions.fix(word) for word in words]
    #Join wordlist again to use word tokenize so words can be separated properly without losing meaning
    text_complete = ' '.join(words_without_contractions)
    words_nltk = nltk.word_tokenize(text_complete)
    clean_words = [word.lower() for word in words_nltk if word.lower() not in stop_words]
    clean_text = " ".join(clean_words)
    return clean_text

In [20]:
def normalize_corpus(reviews):
    return np.array([normalize_document(review)for review in reviews])  

In [21]:
reviews_list = reviews_df['Review'].to_list()
normalized_corpus = normalize_corpus(reviews_list)

# Getting problem vocabulary

In [22]:
def get_problem_vocabulary(normalized_corpus):
    all_tokens = [] 
    for document in normalized_corpus:
        all_tokens.extend(document.split())  
    #[all_tokens.extend(document.split()) for document in normalized_corpus]
    all_tokens_sorted = sorted(set(all_tokens))
    
    token_and_position = {}
    for i, token in enumerate(all_tokens_sorted):
        token_and_position[token] = i
    
    return token_and_position

problem_vocabulary = get_problem_vocabulary(normalized_corpus)


#  One hot encoding

In [23]:
def one_hot_vector(document, problem_vocabulary):
    vector = np.zeros(len(problem_vocabulary),dtype=int)
    for token in document.split():
        vector[problem_vocabulary[token]] = 1
    return vector

In [24]:
vocabulary_keys = list(problem_vocabulary.keys())
print(len(vocabulary_keys))

20592


In [25]:
df = pd.DataFrame(columns = vocabulary_keys)
def createOneHotDF():
    for idx,doc in enumerate(normalized_corpus):
        doc_hot_vec = one_hot_vector(doc, problem_vocabulary)
        df.loc[idx] = doc_hot_vec
    return df

In [26]:
df_reviews_oneHot = createOneHotDF()
df_reviews_oneHot

Unnamed: 0,'10,'40s,'45,'60s/'70s,'68,'70s,'78,'79-'80,'80,'80s,...,â,â£1,â£3,â£5,â£50k,â£6,â£7now,â¾,ã,ã©tc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#df_reviews_oneHot.drop(columns=['Label'],inplace=True)
df_reviews_oneHot['Label'] = reviews_df['Label'].to_list()
df_reviews_oneHot.head()
df_reviews_oneHot.to_csv('one_hot_encoding.csv')

# Training model: Decision Tree

In [28]:
feature_cols = vocabulary_keys 
X = df_reviews_oneHot[feature_cols] 
y = df_reviews_oneHot["Label"] 

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 
# 70% training, 30% test

In [30]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700, 20592) (300, 20592) (700,) (300,)


In [31]:
review_tree_classifier = RandomForestClassifier(n_estimators=50)

In [32]:
review_tree_classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
print('accuracy del clasificador - version 3 : {0:.2f}'.format(accuracy_score(y_test, review_tree_classifier.predict(X_test))))
# confusion matrix
print('matriz de confusión del clasificador - version 3: \n {0}'.format(confusion_matrix(y_test, review_tree_classifier.predict(X_test))))
# precision 
print('precision del clasificador - version 3 : {0:.2f}'.format(precision_score(y_test, review_tree_classifier.predict(X_test))))
# precision 
print('recall del clasificador - version 3 : {0:.2f}'.format(recall_score(y_test, review_tree_classifier.predict(X_test))))
# f1
print('f1 del clasificador - version 3 : {0:.2f}'.format(f1_score(y_test, review_tree_classifier.predict(X_test))))

accuracy del clasificador - version 3 : 0.76
matriz de confusión del clasificador - version 3: 
 [[105  39]
 [ 34 122]]
precision del clasificador - version 3 : 0.76
recall del clasificador - version 3 : 0.78
f1 del clasificador - version 3 : 0.77


# Save classifier

In [35]:
classifier_path = os.path.join("classifier", "reviewClasiffierTree.pkl")

In [37]:
classifier_file = open(classifier_path, "wb")

In [38]:
pickle.dump(review_tree_classifier, classifier_file)

In [39]:
classifier_file.close()