In [1]:
import nltk
import string
import numpy as np
import pandas as pd
import collections
import math
import contractions
from bs4 import BeautifulSoup
# pip install contractions

In [16]:
# Draw a fixed-size random subset of the IMDB reviews. The sampler is seeded so
# that "Restart Kernel -> Run All" reproduces the same rows (and therefore the
# same vocabulary and one-hot matrix) on every run.
# NOTE(review): the vocabulary shown further down contains mojibake tokens such
# as "ã©tait", which suggests the CSV may actually be UTF-8 rather than
# ISO-8859-1 — confirm before changing the encoding.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42
reviews_df = pd.read_csv('./data/imdb_dataset.csv', encoding='ISO-8859-1').sample(
    SAMPLE_SIZE, replace=False, random_state=RANDOM_SEED
)
reviews_df.head()


Unnamed: 0,Review,Label
5912,Aghhhhhh! What a disappointment. A perfectly g...,neg
8585,Ouch. This is one ugly movie. Not only is it b...,neg
28154,Warning Spoilers following. Superb recreation ...,neg
32216,I am a current A.S.L. Student & was forced to ...,neg
46493,"""Dressed to Kill"" is surely one of the best ho...",pos


In [17]:
# Encode the sentiment label as an integer: "pos" -> 1, anything else -> 0.
# An already-encoded 1 is accepted as positive too, so re-running this cell is
# idempotent; comparing against "pos" alone turns every label into 0 on a
# second run because the column then holds integers.
reviews_df['Label'] = reviews_df['Label'].map(lambda label: 1 if label in ("pos", 1) else 0)
reviews_df.head()

Unnamed: 0,Review,Label
5912,Aghhhhhh! What a disappointment. A perfectly g...,0
8585,Ouch. This is one ugly movie. Not only is it b...,0
28154,Warning Spoilers following. Superb recreation ...,0
32216,I am a current A.S.L. Student & was forced to ...,0
46493,"""Dressed to Kill"" is surely one of the best ho...",1


In [18]:
_STOP_WORDS = None  # built lazily on first use, then reused for every document


def _get_stop_words():
    """Return the cached set of tokens to discard (English stopwords + punctuation)."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(
            nltk.corpus.stopwords.words('english')
            + list(string.punctuation)
            + ["...", "*", "''", "``"]
        )
    return _STOP_WORDS


def normalize_document(text):
    """Clean a single raw review and return it as a space-joined string of tokens.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup,
      2. expand contractions word by word,
      3. tokenize with ``nltk.word_tokenize``,
      4. lowercase every token and drop stopwords / punctuation tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces.
    """
    stop_words = _get_stop_words()
    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # picks whichever parser happens to be installed.
    text_without_html = BeautifulSoup(text, "html.parser").get_text()
    words = text_without_html.split()
    words_without_contractions = [contractions.fix(word) for word in words]
    # Re-join the words so word_tokenize can split the expanded text properly
    # without losing meaning.
    text_complete = ' '.join(words_without_contractions)
    words_nltk = nltk.word_tokenize(text_complete)
    lowered_words = (word.lower() for word in words_nltk)
    clean_words = [word for word in lowered_words if word not in stop_words]
    return " ".join(clean_words)


In [19]:
def normalize_corpus(reviews):
    """Normalize each review with ``normalize_document`` and return a NumPy array of the results."""
    cleaned_reviews = []
    for raw_review in reviews:
        cleaned_reviews.append(normalize_document(raw_review))
    return np.array(cleaned_reviews)

  

In [20]:

# Extract the raw review texts as a plain Python list, then normalize them all.
reviews_list = reviews_df['Review'].tolist()
normalized_corpus = normalize_corpus(reviews_list)



In [21]:
def get_problem_vocabulary(normalized_corpus):
    """Build the vocabulary of a corpus.

    Each document is split on whitespace; every distinct token is mapped to
    its position in the sorted list of distinct tokens.

    Returns
    -------
    dict
        ``{token: index}`` with indices running from 0 in sorted token order.
    """
    distinct_tokens = set()
    for document in normalized_corpus:
        distinct_tokens.update(document.split())

    return {token: position for position, token in enumerate(sorted(distinct_tokens))}

# Build the token -> index lookup from the normalized reviews.
problem_vocabulary = get_problem_vocabulary(normalized_corpus)


In [22]:
def one_hot_vector(document, problem_vocabulary):
    """Encode a document as a binary presence vector over the vocabulary.

    Parameters
    ----------
    document : str
        Whitespace-separated tokens.
    problem_vocabulary : dict
        Mapping ``token -> position`` in the output vector.

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``len(problem_vocabulary)`` holding 1 at the
        position of every vocabulary token present in ``document``, else 0.
        Tokens that are not in the vocabulary are ignored, so documents that
        were not part of the corpus used to build the vocabulary can still be
        encoded instead of raising ``KeyError``.
    """
    vector = np.zeros(len(problem_vocabulary), dtype=int)
    for token in document.split():
        position = problem_vocabulary.get(token)
        if position is not None:
            vector[position] = 1
    return vector

In [23]:
# Vocabulary tokens in dictionary order; used as the one-hot column names.
vocabulary_keys = list(problem_vocabulary)


In [24]:
def createOneHotDF():
    """Build the one-hot DataFrame for the whole normalized corpus.

    Reads the notebook-level ``normalized_corpus``, ``problem_vocabulary`` and
    ``vocabulary_keys``. Each document becomes one row (indexed 0..n-1 in
    corpus order) and each vocabulary token one column.

    The rows are collected first and the frame is constructed in a single
    call: assigning rows one at a time with ``df.loc[idx] = ...`` enlarges the
    frame on every iteration, which is very slow for a wide vocabulary.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 matrix with ``vocabulary_keys`` as columns.
    """
    rows = [one_hot_vector(doc, problem_vocabulary) for doc in normalized_corpus]
    # The explicit reshape keeps the (n_docs, n_tokens) shape for an empty corpus.
    matrix = np.array(rows, dtype=int).reshape(len(rows), len(vocabulary_keys))
    return pd.DataFrame(matrix, columns=vocabulary_keys)
        

In [25]:
# Materialize the one-hot matrix: one row per review, one column per vocabulary token.
df_reviews_oneHot = createOneHotDF()
# Bare expression so the notebook renders the frame.
df_reviews_oneHot

Unnamed: 0,''wallace,'01,'03,'30s,'30s/'40s,'40s,'50s,'60s,'73,'80s,...,~t.paul,¦although,«,»,â,"â£8,000",â¾,ã,ã¨,ã©tait
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [50]:
# Attach the encoded labels as the last column. Converting to a plain list drops
# the sampled frame's index, so values are assigned by position rather than
# aligned on index labels.
df_reviews_oneHot['Label'] = reviews_df['Label'].tolist()
df_reviews_oneHot.head()


Unnamed: 0,''wallace,'01,'03,'30s,'30s/'40s,'40s,'50s,'60s,'73,'80s,...,¦although,«,»,â,"â£8,000",â¾,ã,ã¨,ã©tait,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
