In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords 
from collections import Counter
from joblib import dump, load
import nltk 
import pandas as pd
import numpy as np
import collections
import regex 



In [2]:
ratings = pd.read_csv('../data/ratings.csv')
ratings.head()

Unnamed: 0,TEXT,RATING
0,The shirt was more of a smock. I expected a so...,1
1,The shirt was more of a smock. I expected a so...,1
2,The shirt was more of a smock. I expected a so...,1
3,Just received my order today. When I opened th...,1
4,"First of all, the button hole on this belt nev...",1


In [3]:
ratings.describe()

Unnamed: 0,RATING
count,10000.0
mean,3.0
std,1.414284
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


## Labels Analysis
From counting the labels we can see that the dataset is perfectly balanced

In [3]:
labels = np.array(ratings['RATING']) 
ocurrences = collections.Counter(labels)
ocurrences

Counter({1: 2000, 2: 2000, 3: 2000, 5: 2000, 4: 2000})

## Variables analysis
To analyse the words we need a tokenizer that can separate each sentence into a useful list of words

**Must do: use an already developed tokenizer**
**Must do: remove any special phrases in the reviews that start with {}**

In [4]:
# The first tokenizer is a simple python split, let's try it with an arbitrary sentence 
# 
# An obvious problem with this methos is the inclusion of the '.' character in some words, 
# this can increase a lot the dictionary size of the dataset

reviews = np.array(ratings['TEXT']) 
sample_review = reviews[42]
sample_review

"Bloody awful. I got to wash these twice on gentle cycle and already they're full of holes. So much for style and comfort...I got to wear these twice and they look like I was standing too close to a fire already. Swiss cheese pants!"

In [6]:
sample_review = reviews[42]
sample_review = np.array(sample_review.split()) 
sample_review

array(['Bloody', 'awful.', 'I', 'got', 'to', 'wash', 'these', 'twice',
       'on', 'gentle', 'cycle', 'and', 'already', "they're", 'full', 'of',
       'holes.', 'So', 'much', 'for', 'style', 'and', 'comfort...I',
       'got', 'to', 'wear', 'these', 'twice', 'and', 'they', 'look',
       'like', 'I', 'was', 'standing', 'too', 'close', 'to', 'a', 'fire',
       'already.', 'Swiss', 'cheese', 'pants!'], dtype='<U11')

In [7]:
# Lets try a more complex method , now the useess puntuation marks are gone
# although we now have abbreviation issues such as wasn't  
sample_review = reviews[42]
sample_review = np.array(regex.split(r'[-\s.,;!?]+', reviews[42]))
sample_review

array(['Bloody', 'awful', 'I', 'got', 'to', 'wash', 'these', 'twice',
       'on', 'gentle', 'cycle', 'and', 'already', "they're", 'full', 'of',
       'holes', 'So', 'much', 'for', 'style', 'and', 'comfort', 'I',
       'got', 'to', 'wear', 'these', 'twice', 'and', 'they', 'look',
       'like', 'I', 'was', 'standing', 'too', 'close', 'to', 'a', 'fire',
       'already', 'Swiss', 'cheese', 'pants', ''], dtype='<U8')

In [8]:
# An ever more complete tokenizer is the one included in the NLTK library 
sample_review = reviews[42]
tokenizer = TreebankWordTokenizer()
sample_review = np.array(tokenizer.tokenize(sample_review))
sample_review

array(['Bloody', 'awful.', 'I', 'got', 'to', 'wash', 'these', 'twice',
       'on', 'gentle', 'cycle', 'and', 'already', 'they', "'re", 'full',
       'of', 'holes.', 'So', 'much', 'for', 'style', 'and', 'comfort',
       '...', 'I', 'got', 'to', 'wear', 'these', 'twice', 'and', 'they',
       'look', 'like', 'I', 'was', 'standing', 'too', 'close', 'to', 'a',
       'fire', 'already.', 'Swiss', 'cheese', 'pants', '!'], dtype='<U8')

In [8]:
# Now we will apply case folding to reduce the vocabulary 
sample_review = np.array([token.lower() for token in sample_review])
sample_review

array(['bloody', 'awful.', 'i', 'got', 'to', 'wash', 'these', 'twice',
       'on', 'gentle', 'cycle', 'and', 'already', 'they', "'re", 'full',
       'of', 'holes.', 'so', 'much', 'for', 'style', 'and', 'comfort',
       '...', 'i', 'got', 'to', 'wear', 'these', 'twice', 'and', 'they',
       'look', 'like', 'i', 'was', 'standing', 'too', 'close', 'to', 'a',
       'fire', 'already.', 'swiss', 'cheese', 'pants', '!'], dtype='<U8')

**Important: I might want to remove any words with dots because they creating and unnecessary bigger vocabulary**

*Note: We might not remove stop words, more research is required*

In [9]:
# Now for an extra preprocessign we want to remove stop words 
nltk.download('stopwords')
stop_words = stopwords.words('english')
sample_review = np.array([token for token in sample_review if token not in stop_words]) 
sample_review

[nltk_data] Downloading package stopwords to /home/pol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array(['Bloody', 'awful.', 'I', 'got', 'wash', 'twice', 'gentle', 'cycle',
       'already', "'re", 'full', 'holes.', 'So', 'much', 'style',
       'comfort', '...', 'I', 'got', 'wear', 'twice', 'look', 'like', 'I',
       'standing', 'close', 'fire', 'already.', 'Swiss', 'cheese',
       'pants', '!'], dtype='<U8')

In [10]:
encoder = OneHotEncoder()
encodings = encoder.fit_transform(sample_review.reshape(1, -1)) 
encodings.toarray()

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

**Before going any further we will perform a split train/test split to remove  my personal bias on the dataset**

In [5]:
# I want to be sure that the datasets stay balanced to help the model into generalizing 
# now we will only work with the training set and we will never look the eval set 
# again 
X_train, X_eval, y_train, y_eval = train_test_split(reviews, labels, test_size=0.2, random_state=42)


**Must do: Add the relationship between that average length of a sentence and the number of words that are included in the dictionary, this will provide some information on the probability of a word in each sentence, also, analyse the bago of words and create a frequency table for each word to describe the frequency of a word in all the corpus**

In [34]:
# now lets analyse the data further by calculating some global characteristics 
# of each sentence, such as the avg length. This is just an estimate becasuse the 
# sentence lenght dependes directly on the tokenizer we are using
list_of_words = np.array(list(map(sentence_preprocessing, X_train)))
reviews_lenghts = np.array([len(review) for review in list_of_words]) 
print('mean:', reviews_lenghts.mean())
print('std:', reviews_lenghts.std())
print('min',reviews_lenghts.min())
print('max', reviews_lenghts.max())

mean: 26.283875
std: 19.62603602321098
min 1
max 219


There are some sentences with a extremely low nubmer of words in it (i.e. 1 token). We should explore this extreme cases and evaluate if their are outliers that must be removed from the training dataset

In [35]:
print('Review:', X_train[reviews_lenghts.argmin()]) 
print('Label:', y_train[reviews_lenghts.argmin()])

Review: I am 5'8\
Label: 1


In [36]:
# REVIEW OUTLIERS WITH LENGTH LESS THAN A CERTAIN x (E.G. 2)

In [6]:
def sentence_preprocessing(sentence):
    """
    Given a sentece, it returns a list 
    of tokens that have been preprocessed and filtered 
    using multiple strategies. 
    """
    # First we create the list of tokens 
    tokenizer = TreebankWordTokenizer()
    sentence = tokenizer.tokenize(sentence)
    # We then apply case folding 
    sentence = [token.lower() for token in sentence]
    return [token for token in sentence if token not in stop_words] 


def create_bow(data):
    """
    Given a list of sentences it creates and returns a 
    dataframe with prepared for training 
    """

    # Now lets create a bag of words using a simple split 
    bow = []
    for sentence in data:
        bow.append(Counter(sentence_preprocessing(sentence)))
    
    # Create a dataframe with all the records from the bag of words 
    df = pd.DataFrame.from_records(bow)
    df = df.fillna(0).astype(int)
    return df 

The Following dataframe represents the training dataset with the vocabulary size, the training set has a lenght of 8000 (80 percent of the original dataset). And a vocabulary of 7964 words 

In [7]:
df = create_bow(X_train)
print('Shape:', df.shape)

Shape: (8000, 8079)


In [39]:
df

Unnamed: 0,need,bra,minimizes,!,more.,although,back,cup,size,",",...,one\nsmall,join,51,wrinkly.,lighting,i.,haute,couture,teens,woman.\nretuned
0,2,2,1,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,2,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7996,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7997,0,2,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7998,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Missing study on the dataframe...


In [41]:
# Study term frequencies 
bow = []
for sentence in X_train:
    bow.append(Counter(sentence_preprocessing(sentence)))

bow

[Counter({'need': 2,
          'bra': 2,
          'minimizes': 1,
          '!': 1,
          'more.': 1,
          'although': 1,
          'back': 1,
          'cup': 1,
          'size': 1,
          ',': 2,
          'find': 1,
          'straps': 1,
          'could': 1,
          'go': 1,
          'little': 1,
          'tighter': 1,
          'even': 1,
          'though': 1,
          'tightest': 1,
          "'s": 1,
          'tight': 1,
          'would': 1,
          'prefer': 1,
          'still': 1,
          'works': 1,
          '.': 1}),
 Counter({'shoulder': 1,
          'straps': 1,
          'roll': 1,
          'time': 1,
          'making': 1,
          'uncomfortable': 1,
          '.': 1}),
 Counter({'bali': 2,
          'bra': 2,
          '38dd.': 1,
          'favorite': 1,
          'ever': 1,
          '!': 2,
          'ordered': 1,
          '3': 1,
          'site': 1,
          ',': 1,
          'received': 1,
          '(': 1,
          'marked': 1,


# Model Training 

In [8]:
classifier = MultinomialNB()
classifier = classifier.fit(df, y_train)

In [9]:
y_pred = classifier.predict(df)
accuracy_score(y_train,y_pred)

0.861875

**Important, the new dataset has aproximately 300 new tokens that were not in the training set, that might afect the perfomance on the test set**

In [10]:
# We need to make sure that columns from the training dataframe correspond the the exact same 
# cols in the test dataset 
df_test = create_bow(X_eval)
print('Shape:', df_test.shape)
all_bows = df.append(df_test)
print('New bigger shape:', all_bows.shape)
df_test = all_bows.iloc[len(df):][df.columns] # select only the test samples and the training cols 
print('New df with filtered cols shape:', df_test.shape)

Shape: (2000, 5498)
New bigger shape: (10000, 8420)
New df with filtered cols shape: (2000, 8079)


In [11]:
df_test = df_test.fillna(0).astype(int)
y_pred = classifier.predict(df_test)
accuracy_score(y_eval,y_pred)

0.7645

In [12]:
print('Recall score:', recall_score(y_eval, y_pred, average='micro')) 
print('Precision score:', precision_score(y_eval, y_pred, average='micro'))

Recall score: 0.7645
Precision score: 0.7645


In [47]:
dump(classifier, '../data/classifier.joblib')

['../data/classifier.joblib']

In [25]:
classifier = load('../data/classifier.joblib')

In [26]:
y_pred = classifier.predict(df_test)
accuracy_score(y_eval,y_pred)

0.779

In [49]:
df_test.iloc[:2]

Unnamed: 0,need,bra,minimizes,!,more.,although,back,cup,size,",",...,one\nsmall,join,51,wrinkly.,lighting,i.,haute,couture,teens,woman.\nretuned
0,0,0,0,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
df_test.iloc[:2].to_csv('../data/dataframe_sample.csv', index=False)