In [1]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv("C:/Users/19495/DS3ProjectFiles/amazon_reviews_us_Toys_v1_00.tsv.gz", names =  columns, sep = '\t').iloc[1:,:]

In [5]:
elec_df = elec_df.sample(n = 1_000_000)

In [6]:
one_file = elec_df.copy()

## Data Sampling

In [7]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [8]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 827245
Number of unverified purchases: 172755
Number of verified purchases (balanced dataset): 172755
Number of unverified purchases (balanced dataset): 172755


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
402388,US,16462487,R1H4BGLVN50LFW,B00V8I8JU6,814631691,"Funko Marvel Avengers Age of Ultron POP! Marvel Vision Exclusive 3 3/4"" Vinyl Bobble Head #71 [Phasing]",Toys,5,0,1,N,N,nice pop!,I got this pop at my local target for 9 bucks they had 8 of them laying around so i got the least damage and i have to said this is a nice vision better then the regular version it looks really badass next to my hulkbuster and anvengers team.,2015-06-16
4698221,US,16843662,R1S9MXMITY293E,B000JCSS4O,506509347,2-Channel RC Super Sonic Radio Control Airplane,Toys,4,82,91,N,N,Not for beginner pilots,"I was looking for cheap twin engine RC airplane for some time and found this the best value for the money. It really flies and will survive hard crashes but it is not an airplane for beginners. The radio allows controlling the RPM of the engines in only 3 steps: off, low and high. Left-right direction is controlled only like full left - straight - full right. With the direction this is not such a problem but doing the same with the speed so that the plane is not going up or down but straight can be a challenge. If you can fly RC aircraft and are looking for something cheap which flyes on little space and does not need any maintenance, this can be interesting. I am not expert but I can fly an RC airplane with proportional control reasonably well. I had hard time to keep this in the air at first. It can be learned but if you have never flown anything, this one will be too fast and requires the pilot to do too many things just to keep flying straight. If you are serious about RC planes get something with proportional radio - which will probably start at about $100. If you want toy for kids, this is doubtfull because it isn't much fun to crash all the time and eventually something will break - though the construction is tough and survived already several hard crashes. In this category of 2-engine toy-planes this is good choice. The price is nice. There are airplanes with the same parameters for $40 and even more.",2007-08-21


In [9]:
print("The Beauty sample will have size:", balanced_elec.shape)

The Beauty sample will have size: (345510, 15)


## Data Cleaning & Type Conversion

In [10]:
#one_file = balanced_elec.copy()

In [11]:
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 827245
Number of unverified purchases (balanced dataset): 172755


In [12]:
# Star rating of the format '2015-04-02' exists! In addition, np.nan exists in the helpful and total votes
one_file['star_rating'].value_counts()

5      624178
4      156188
1       81082
3.0     78588
2       46589
5        8942
4        1774
1        1073
3         965
2         621
Name: star_rating, dtype: int64

In [13]:
# Convert the type of relevant columns to int
one_file['star_rating'] = (one_file['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
one_file['star_rating'] = one_file['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
one_file['helpful_votes'] = one_file['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
one_file['total_votes'] = one_file['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))

In [14]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

In [16]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
3624415,My four year old son is a super hero fanatic and he loves his Flash. It is durable (the parts move well) and the best part- NO Batteries! If you have a Flash fan in the &#34;fam&#34; then your won't be disappointed.,four year old son super hero fanatic love flash durable part move well best part battery flash fan fam disappointed
2796050,"So, DQ4 Hero is a very nice looking figure. It is a decent size, and the accessories are cool, and accurate. As a big fan of Dragon Quest, and especially DQ4-6 on the Nintendo DS, I very much like what I got, but...<br /><br />He doesn't stand so great. Many of his joints are iffy, and posing can be tricky. His shoulder pieces are literally held on by hope, and the slightest bump can knock them loose. They don't snap back on so much as hold on by a hair, and beg the air not to pop them off, again. The sword also fits loose in the hand.<br /><br />The rest is nice. The Zenithian Sword and Shield are good looking, and the armor seems right. Overall, i very much like him. It's a piece I've been looking for since I saw some years ago, and the sets on racks. very cool.",so dq hero nice looking figure decent size accessory cool accurate big fan dragon quest especially dq nintendo d much like got but stand great many joint iffy posing tricky shoulder piece literally held hope slightest bump knock loose snap back much hold hair beg air pop off again sword also fit loose hand rest nice zenithian sword shield good looking armor seems right overall much like him piece ive looking since saw year ago set rack cool
1873472,Did fit American Girl Sage,fit american girl sage
3010475,"Worked well for what i needed. I painted them for the team colors i needed but the football post melted... It wasn't even that close to the candle, so just a warning on that.",worked well needed painted team color needed football post melted even close candle warning that
1410726,"This toy was for a little boy I got as a Secret Santa and I hope he loved it. Lego always outdoes themselves with amazing playsets, specifically how they treat Star Wars. I wish I was able to see the look on the boy's face when he got it.",toy little boy got secret santa hope loved it lego always outdoes amazing playsets specifically treat star war wish able see look boy face got it


In [17]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
3624415,Great Action Figure,great action figure
2796050,Nice but Unstable,nice unstable
1873472,Five Stars,five star
3010475,football post melts if it has any heat of a candle but otherwise good,football post melt heat candle otherwise good
1410726,... got as a Secret Santa and I hope he loved it. Lego always outdoes themselves with amazing playsets,got secret santa hope loved it lego always outdoes amazing playsets


In [18]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
3624415,DC Collectibles Justice League: The Flash Action Figure,dc collectible justice league flash action figure
2796050,Dragon Quest: Legend Armor Returns -Equipment of Sky- (PVC Figure) by Square Enix,dragon quest legend armor return equipment sky pvc figure square enix
1873472,"18 Inch Doll Riding Boots for Horse Riding Doll Shoes, Fits 18 Inch American Girl Dolls, Detailed with Laces, Classic Black Riding Boots",inch doll riding boot horse riding doll shoe fit inch american girl doll detailed lace classic black riding boot
3010475,Football Cake Toppers 10ct,football cake topper ct
1410726,LEGO Star Wars 75052 Mos Eisley Cantina Building Toy (Discontinued by manufacturer),lego star war mo eisley cantina building toy discontinued manufacturer


In [19]:
print("Number of verified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 827160
Number of unverified purchases (balanced dataset): 172743


## Vader Sentiment Analysis

In [20]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [21]:
one_file = cleaned2.copy()

In [22]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [23]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [24]:
one_file.head()
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 827160
Number of unverified purchases (balanced dataset): 172743


## Confusion Matrix

In [25]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [27]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [28]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

In [29]:
def helpful_prop(df):  
    vote_series = pd.Series(df['helpful_votes'] / df['total_votes'])
    # !! All nan values in votes_prop should be changed to zero: this means that 0/0 occurred !!
    vote_series = vote_series.fillna(0)
    return vote_series

In [30]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [31]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [32]:
one_file['help_prop'] = helpful_prop(one_file)
one_file['rev_bod_id'] = one_file['rev_dict'].apply(id_for_dictionary)
one_file['help_prop_id'] = one_file["help_prop"].apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,rev_dict,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
3624415,US,28950275,R3B1DH2XRT5QTW,B007WYU7R8,511284143,DC Collectibles Justice League: The Flash Action Figure,Toys,5,0,0,N,Y,Great Action Figure,My four year old son is a super hero fanatic and he loves his Flash. It is durable (the parts move well) and the best part- NO Batteries! If you have a Flash fan in the &#34;fam&#34; then your won't be disappointed.,2013-02-24,four year old son super hero fanatic love flash durable part move well best part battery flash fan fam disappointed,great action figure,dc collectible justice league flash action figure,"{'neg': 0.085, 'neu': 0.357, 'pos': 0.558, 'compound': 0.9531}",0.085,0.357,0.558,0.9531,0.5267,0.6249,0.0,1,-1
2796050,US,43074882,RN8NV7EA0VERA,B009ID1VYO,240692148,Dragon Quest: Legend Armor Returns -Equipment of Sky- (PVC Figure) by Square Enix,Toys,4,0,0,N,Y,Nice but Unstable,"So, DQ4 Hero is a very nice looking figure. It is a decent size, and the accessories are cool, and accurate. As a big fan of Dragon Quest, and especially DQ4-6 on the Nintendo DS, I very much like what I got, but...<br /><br />He doesn't stand so great. Many of his joints are iffy, and posing can be tricky. His shoulder pieces are literally held on by hope, and the slightest bump can knock them loose. They don't snap back on so much as hold on by a hair, and beg the air not to pop them off, again. The sword also fits loose in the hand.<br /><br />The rest is nice. The Zenithian Sword and Shield are good looking, and the armor seems right. Overall, i very much like him. It's a piece I've been looking for since I saw some years ago, and the sets on racks. very cool.",2014-01-22,so dq hero nice looking figure decent size accessory cool accurate big fan dragon quest especially dq nintendo d much like got but stand great many joint iffy posing tricky shoulder piece literally held hope slightest bump knock loose snap back much hold hair beg air pop off again sword also fit loose hand rest nice zenithian sword shield good looking armor seems right overall much like him piece ive looking since saw year ago set rack cool,nice unstable,dragon quest legend armor return equipment sky pvc figure square enix,"{'neg': 0.071, 'neu': 0.563, 'pos': 0.366, 'compound': 0.9855}",0.071,0.563,0.366,0.9855,0.0,0.0772,0.0,0,-1
1873472,US,13251972,RFXWA5876QUDK,B00CBJWYKC,788827203,"18 Inch Doll Riding Boots for Horse Riding Doll Shoes, Fits 18 Inch American Girl Dolls, Detailed with Laces, Classic Black Riding Boots",Toys,5,1,1,N,Y,Five Stars,Did fit American Girl Sage,2014-11-01,fit american girl sage,five star,inch doll riding boot horse riding doll shoe fit inch american girl doll detailed lace classic black riding boot,"{'neg': 0.0, 'neu': 0.545, 'pos': 0.455, 'compound': 0.3612}",0.0,0.545,0.455,0.3612,0.3612,0.0,1.0,0,1
3010475,US,17288428,R2GEH8BX00N7N6,B001QF359Q,26289888,Football Cake Toppers 10ct,Toys,4,0,0,N,Y,football post melts if it has any heat of a candle but otherwise good,"Worked well for what i needed. I painted them for the team colors i needed but the football post melted... It wasn't even that close to the candle, so just a warning on that.",2013-12-20,worked well needed painted team color needed football post melted even close candle warning that,football post melt heat candle otherwise good,football cake topper ct,"{'neg': 0.137, 'neu': 0.743, 'pos': 0.12, 'compound': -0.0772}",0.137,0.743,0.12,-0.0772,0.0,0.4404,0.0,0,-1
1410726,US,30141913,R1EIEKPTICBORF,B00J4S9CU8,157713200,LEGO Star Wars 75052 Mos Eisley Cantina Building Toy (Discontinued by manufacturer),Toys,5,0,0,N,Y,... got as a Secret Santa and I hope he loved it. Lego always outdoes themselves with amazing playsets,"This toy was for a little boy I got as a Secret Santa and I hope he loved it. Lego always outdoes themselves with amazing playsets, specifically how they treat Star Wars. I wish I was able to see the look on the boy's face when he got it.",2015-01-04,toy little boy got secret santa hope loved it lego always outdoes amazing playsets specifically treat star war wish able see look boy face got it,got secret santa hope loved it lego always outdoes amazing playsets,lego star war mo eisley cantina building toy discontinued manufacturer,"{'neg': 0.098, 'neu': 0.501, 'pos': 0.401, 'compound': 0.9022}",0.098,0.501,0.401,0.9022,-0.5994,0.891,0.0,0,-1


In [33]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating            int64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [34]:
imp_col = imp_col.dropna()

In [35]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [36]:
X

array([[ 0.5267,  5.    ,  0.6249,  1.    , -1.    ],
       [ 0.    ,  4.    ,  0.0772,  0.    , -1.    ],
       [ 0.3612,  5.    ,  0.    ,  0.    ,  1.    ],
       ...,
       [ 0.    ,  5.    ,  0.    ,  1.    ,  1.    ],
       [ 0.    ,  5.    ,  0.5106,  0.    , -1.    ],
       [ 0.4939,  5.    ,  0.    ,  1.    , -1.    ]])

In [37]:
print("Number of verified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 827160
Number of unverified purchases (balanced dataset): 172743


In [106]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [107]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=1,
                     weights='uniform')

In [108]:
y_pred = knn_classifier.predict(X_test)

In [109]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [110]:
print(cm)
print(ac)

[[  1062  33565]
 [  1685 163669]]
0.8237332546591927


In [407]:
from joblib import dump, load

In [409]:
name = 'knn_toys_model.joblib'
dump(knn_classifier, name)

['knn_toys_model.joblib']

In [411]:
knn_classifier = load(name)

## Getting Reviews

In [415]:
# Getting the indices of the reviews that were predicted correctly
ind1 = list((y_test == y_pred)& (y_pred== 'N'))#.index(True)
# & (y_test == 'N')
[i for i, x in enumerate(ind1) if x]

[70,
 418,
 535,
 767,
 867,
 987,
 1280,
 1284,
 1705,
 2083,
 2098,
 2346,
 3153,
 3221,
 3551,
 4142,
 4593,
 4750,
 4795,
 4949,
 5033,
 5121,
 5266,
 5330,
 5918,
 5947,
 6036,
 6474,
 6648,
 6694,
 6697,
 6828,
 7142,
 7234,
 7400,
 7425,
 7590,
 7769,
 7776,
 7854,
 8000,
 8064,
 8078,
 8131,
 8293,
 8406,
 9108,
 9174,
 9335,
 9479,
 9674,
 9711,
 9892,
 9907,
 10234,
 10710,
 10734,
 10827,
 11089,
 11107,
 11481,
 11590,
 11666,
 11707,
 11722,
 11848,
 11878,
 11934,
 11988,
 12622,
 12672,
 12917,
 13016,
 13022,
 13175,
 13236,
 13720,
 13768,
 13829,
 13830,
 14333,
 14365,
 14387,
 14604,
 14996,
 15045,
 15147,
 15261,
 15394,
 15532,
 15581,
 15724,
 15926,
 16019,
 16100,
 16517,
 16552,
 16578,
 17580,
 18616,
 18869,
 19090,
 19115,
 19171,
 19556,
 19627,
 19686,
 19944,
 20009,
 20190,
 20220,
 20386,
 20404,
 20466,
 20638,
 20850,
 20995,
 21016,
 21468,
 21753,
 21858,
 21982,
 22183,
 22683,
 22689,
 22783,
 22785,
 22843,
 22979,
 23033,
 23469,
 23624,
 2393

In [416]:
ind1 = 4142
#1280
X_test[ind1]

array([ 0.1531,  1.    , -0.4404,  0.    ,  1.    ])

In [417]:
test = list(X)
test;

In [418]:
test2 = (test == X_test[ind1]) 
test2

array([[False, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False,  True,  True],
       ...,
       [False, False, False, False,  True],
       [False, False, False,  True, False],
       [False, False, False, False, False]])

In [378]:
ind2 = list(np.all(test2, axis = 1))#.index(True)
ind2
n_lst = [i for i, x in enumerate(ind2) if x]
(n_lst)

[740689]

In [379]:
one_file.iloc[740689].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]

Unnamed: 0,4820799
review_body,"I bought this for my 8 year old daughter to use at her slumber party. What a waste. I had to do most of the assembly, and even then they did not turn out. The material was very poor quality and did not even cover the braclets. The glue was useless and would not stick. This was a huge disappointment to all of us."
review_headline,worthless toy
product_title,Slap Bracelets
star_rating,1
helpful_votes,17
total_votes,17
verified_purchase,N


In [69]:
one_file.iloc[ind2].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]

Unnamed: 0,4831933
review_body,I love this puzzle! It is great for a 20-month old. It's not too easy like some others I've seen but easy enough that my child does not get frustrated. The pictures are great and it helps with learning animal sounds and names.
review_headline,Great Peg Puzzle
product_title,Wooden Farm Hide & Seek 9-piece Peg Puzzle
star_rating,5
helpful_votes,3
total_votes,3
verified_purchase,N


## Getting Predictions

In [332]:
print('''Disney Animators' Collection Elsa Doll - 16"''')

Disney Animators' Collection Elsa Doll - 16"


In [398]:
review_body = "The plastic cars come in 8 colors provided in the product description. The are well sculpted for playing pieces. But, BUT the cars received are not the cars from the photo in the description. The cars I got are much more cartoonish in design. For a more accurate idea of what cars you will be getting, look up Bolide or Detroit-Cleveland Grand Prix games."
review_title = 'Nice cars. Wrong item pictured!'
product_title = "Plastic Race Car: Set of 16 Black, White, Red, Orange, Yellow, Green, Blue, and Purple Color Board Game Playing Pieces (Racing Car Tokens & Markers, Colored School Classroom Supplies, Arts & Crafts Projects, Teaching & Education Toy Vehicle Resource Components, Extra Instructional Play Materials)"
star_rating = 3
helpful_votes = 1
total_votes = 1

In [399]:
test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

Unnamed: 0,review_body,review_title,product_title
0,"The plastic cars come in 8 colors provided in the product description. The are well sculpted for playing pieces. But, BUT the cars received are not the cars from the photo in the description. The cars I got are much more cartoonish in design. For a more accurate idea of what cars you will be getting, look up Bolide or Detroit-Cleveland Grand Prix games.",Nice cars. Wrong item pictured!,"Plastic Race Car: Set of 16 Black, White, Red, Orange, Yellow, Green, Blue, and Purple Color Board Game Playing Pieces (Racing Car Tokens & Markers, Colored School Classroom Supplies, Arts & Crafts Projects, Teaching & Education Toy Vehicle Resource Components, Extra Instructional Play Materials)"


In [400]:
out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

Unnamed: 0,review_body,review_title,product_title,new_review_body,new_review_title,new_product_title
0,"The plastic cars come in 8 colors provided in the product description. The are well sculpted for playing pieces. But, BUT the cars received are not the cars from the photo in the description. The cars I got are much more cartoonish in design. For a more accurate idea of what cars you will be getting, look up Bolide or Detroit-Cleveland Grand Prix games.",Nice cars. Wrong item pictured!,"Plastic Race Car: Set of 16 Black, White, Red, Orange, Yellow, Green, Blue, and Purple Color Board Game Playing Pieces (Racing Car Tokens & Markers, Colored School Classroom Supplies, Arts & Crafts Projects, Teaching & Education Toy Vehicle Resource Components, Extra Instructional Play Materials)",plastic car come color provided product description well sculpted playing piece but car received car photo description car got much cartoonish design accurate idea car getting look bolide detroitcleveland grand prix game,nice car wrong item pictured,plastic race car set black white red orange yellow green blue purple color board game playing piece racing car token marker colored school classroom supply art craft project teaching education toy vehicle resource component extra instructional play material


In [401]:
rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
prop = 1
help_prop_id = id_for_prop(prop)
prod_title_comp = only_compound(out['new_product_title'][0])
rev_title_comp = only_compound(out['new_review_title'][0])

In [402]:
# The one we had originally, and is producing the wrong order of proportions
rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

array([[ 0.4939,  3.    , -0.0772,  0.    ,  1.    ]])

In [253]:
# The right one !!
#rev_input_test = np.array([[rev_title_comp, star_rating, prod_title_comp, rev_bod_id, help_prop_id]])
#rev_input_test

array([[-0.1779,  5.    ,  0.6249,  0.    ,  1.    ]])

In [403]:
prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

prediction

prediction[0]
#probabilities

'N'