In [1]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
# Personal_Care_Appliances

In [5]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv("C:/Users/19495/DS3ProjectFiles/amazon_reviews_us_Beauty_v1_00.tsv.gz", names =  columns, sep = '\t').iloc[1:,:]

In [6]:
elec_df = elec_df.sample(n = 1_000_000)

In [7]:
one_file = elec_df.copy()

## Data Sampling

In [8]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [9]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 826088
Number of unverified purchases: 173902
Number of verified purchases (balanced dataset): 173902
Number of unverified purchases (balanced dataset): 173902


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2950239,US,14732211,R1SCZOWSQROU27,B00HSNWZKU,76905086,"Clear Men 2 in 1 Shampoo + Conditioner, Ocean Mineral Hydration 12.9 oz",Beauty,5,0,0,Y,N,from a guy with dandruff...,"So I have horrible winter dandruff, every year since I was a kid. Full on flake fest. Hate it but its true.<br />Needless to say I have tried many many products over the years and am always willing to try something new.<br />Truth is most products are a trade off, usually the tea-tree shampoos work by simply making my flakes smaller so they are still there but not as noticeable. The Head and shoulders type products all use the same Pyrithione zinc (1%) as an active ingredient, and you will find it here to. And the Nuke-em product of the dandruff world TAR. (yep I said it tar.) which works in extreme cases but smells like ...well tar..and leaves you with no flakes but hair that hates you.<br /><br />The regular everyday products seem to help but do note eliminate my dandruff, and worst off they seem to leave my hair scorched and dried out over time (anything more than a week.) I find myself alternating between a dandruff shampoo and a regular shampoo in an effort to have normal looking hair without the dandruff. It works maybe 6 months out of the year...then the cold season sets in and forget it...flake city.<br /><br />This time it was different. This shampoo works on all fronts. I found my hair was great even after 2 weeks of daily use...no need to switch back and forth to avoid dried out look, and the flakes were gone....like totally gone. Ok so I am surprised an perplexed but happy. It has the same active ingredient to start with but whatever comes next seems to work...<br /><br />A great product that I will be buying regularly.",2014-05-15
1000316,US,12171186,R1VIR720NP8PMO,B008EL6E4I,53992228,One 'N Only Argan Oil Hair Color 9G Very Light Golden Blonde,Beauty,1,1,1,N,N,not impressed,Going back to Age Beautiful. I used 5n and the color did not last past 4 washings. fades unevenly,2015-04-16


In [10]:
print("The Beauty sample will have size:", balanced_elec.shape)

The Beauty sample will have size: (347804, 15)


## Data Cleaning & Type Conversion

In [11]:
#one_file = balanced_elec.copy()

In [12]:
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826088
Number of unverified purchases (balanced dataset): 173902


In [13]:
# Star rating of the format '2015-04-02' exists! In addition, np.nan exists in the helpful and total votes
one_file['star_rating'].value_counts()

5             509102
5             126805
4             118984
1              73080
3.0            63949
2              42196
4              26219
1              16280
3              14230
2               9145
2015-04-09         1
2015-04-14         1
2015-03-30         1
2014-10-09         1
2015-07-22         1
Name: star_rating, dtype: int64

In [14]:
# Convert the type of relevant columns to int
one_file['star_rating'] = (one_file['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
one_file['star_rating'] = one_file['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
one_file['helpful_votes'] = one_file['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
one_file['total_votes'] = one_file['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))

In [15]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

In [17]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
1785852,"Really effective, I can't live without this product, I use it all the time. It leaves my hair soft and silky!",really effective cant live without product use time leaf hair soft silky
627621,"Absolutely Perfect! The are huge, every brush has different density and fiber. I own quite a few high(est) end Chikuhodo, Koyudo, ect, but I have not even reached for the beloved Fu-Pa since these beauties came home. Simple: synthetic bristles are designed to work with cream product.<br /> The HUGE flat kabuki F17 blends RMS's UnCover Up & W3LL People's Narcissist in just a few sweeps, keeping product waste to a bare minimum & leaves flawless streak free finish, even beats the Rae Morris Radiance. The slanted kabuki is amazing at diffusing Ilia or Vapour Multi-Use Stick blush/stains. The more dense F13 works wonders on contouring the cheekbones, temple & hairline areas, as it can be shaped into a thinner flat end bcz of the densely packed bristles. F14 good for packing some mineral powder on if needed for extra SUV protection or just quick sweep of finishing powder with the big fluffy soft F15 (natural hair) & not a worry in the world about ruining any of them when you toss them in the make up bag: $20!<br /> I don't use much product, so I wash them once a week. Not a single hair loss, shape & feel has retained just as new as well. Fantastic Make!",absolutely perfect huge every ush different density fiber quite highest end chikuhodo koyudo ect even reached beloved fupa since beauty came home simple synthetic istles designed work cream product huge flat kabuki f blend rmss uncover wll people narcissist sweep keeping product waste bare minimum leaf flawless streak free finish even beat rae morris radiance slanted kabuki amazing diffusing ilium vapour multiuse stick blushstains dense f work wonder contouring cheekbone temple hairline area shaped thinner flat end bcz densely packed istles f good packing mineral powder needed extra suv protection quick sweep finishing powder big fluffy soft f natural hair worry world ruining toss make bag use much product wash week single hair loss shape feel retained new well fantastic make
1577579,I have used this product for years and love it. Easy to apply and no stinky chemical smell. No ammonia so it's not had on the hair. If you call the toll free number on the back they will help you pick a color.,used product year love it easy apply stinky chemical smell ammonia hair call toll free number back help pick color
1985088,Awesome shaving cream cup for the price,awesome shaving cream cup price
2950239,"So I have horrible winter dandruff, every year since I was a kid. Full on flake fest. Hate it but its true.<br />Needless to say I have tried many many products over the years and am always willing to try something new.<br />Truth is most products are a trade off, usually the tea-tree shampoos work by simply making my flakes smaller so they are still there but not as noticeable. The Head and shoulders type products all use the same Pyrithione zinc (1%) as an active ingredient, and you will find it here to. And the Nuke-em product of the dandruff world TAR. (yep I said it tar.) which works in extreme cases but smells like ...well tar..and leaves you with no flakes but hair that hates you.<br /><br />The regular everyday products seem to help but do note eliminate my dandruff, and worst off they seem to leave my hair scorched and dried out over time (anything more than a week.) I find myself alternating between a dandruff shampoo and a regular shampoo in an effort to have normal looking hair without the dandruff. It works maybe 6 months out of the year...then the cold season sets in and forget it...flake city.<br /><br />This time it was different. This shampoo works on all fronts. I found my hair was great even after 2 weeks of daily use...no need to switch back and forth to avoid dried out look, and the flakes were gone....like totally gone. Ok so I am surprised an perplexed but happy. It has the same active ingredient to start with but whatever comes next seems to work...<br /><br />A great product that I will be buying regularly.",horrible winter dandruff every year since kid full flake fest hate true needle say tried many many product year always willing try something new truth product trade off usually teatree shampoo work simply making flake smaller still noticeable head shoulder type product use pyrithione zinc active ingredient find to nukeem product dandruff world tar yep said tar work extreme case smell like well tarand leaf flake hair hate you regular everyday product seem help note eliminate dandruff worst seem leave hair scorched dried time anything week find alternating dandruff shampoo regular shampoo effort normal looking hair without dandruff work maybe month yearthen cold season set forget itflake city time different shampoo work front found hair great even week daily useno need switch back forth avoid dried look flake gonelike totally gone ok surprised perplexed happy active ingredient start whatever come next seems work great product buying regularly


In [18]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
1785852,"Really effective, I can't live without this product, ...",really effective cant live without product
627621,Absolutely Perfect!,absolutely perfect
1577579,I have used this product for years and love it. Easy to apply and no stinky chemical ...,used product year love it easy apply stinky chemical
1985088,Five Stars,five star
2950239,from a guy with dandruff...,guy dandruff


In [19]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
1785852,"Matrix Total Results Repair Break Fix Leave-In Elixir, 6.5 Ounce",matrix total result repair break fix leavein elixir ounce
627621,"SHANY Pro 5 Piece Essential Kabuki Brush Set Synthetic and Natural Hair, X-Large",shany pro piece essential kabuki brush set synthetic natural hair xlarge
1577579,"L'Oreal Paris Healthy Look Creme Gloss Color, Medium Red Brown/Cherry Truffle 5R (Pack of 3)",loreal paris healthy look creme gloss color medium red browncherry truffle r pack
1985088,Mens Durable Shave Soap Cup Shinning Stainless Steel Shaving Mug Bowl,men durable shave soap cup shinning stainless steel shaving mug bowl
2950239,"Clear Men 2 in 1 Shampoo + Conditioner, Ocean Mineral Hydration 12.9 oz",clear men shampoo conditioner ocean mineral hydration oz


In [20]:
print("Number of verified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826010
Number of unverified purchases (balanced dataset): 173879


## Vader Sentiment Analysis

In [21]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [22]:
one_file = cleaned2.copy()

In [23]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [24]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [25]:
#one_file.to_csv('beauty_data_cleaned.csv')

In [26]:
one_file.head()
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826010
Number of unverified purchases (balanced dataset): 173879


## Confusion Matrix

In [27]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [29]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [30]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

In [31]:
def helpful_prop(df):  
    vote_series = pd.Series(df['helpful_votes'] / df['total_votes'])
    # !! All nan values in votes_prop should be changed to zero: this means that 0/0 occurred !!
    vote_series = vote_series.fillna(0)
    return vote_series

In [32]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [33]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [34]:
one_file['help_prop'] = helpful_prop(one_file)
one_file['rev_bod_id'] = one_file['rev_dict'].apply(id_for_dictionary)
one_file['help_prop_id'] = one_file["help_prop"].apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,rev_dict,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
1785852,US,2018862,R16QNZ503CLS2C,B007Z1YEJK,131561176,"Matrix Total Results Repair Break Fix Leave-In Elixir, 6.5 Ounce",Beauty,5.0,1.0,1.0,N,Y,"Really effective, I can't live without this product, ...","Really effective, I can't live without this product, I use it all the time. It leaves my hair soft and silky!",2015-01-03,really effective cant live without product use time leaf hair soft silky,really effective cant live without product,matrix total result repair break fix leavein elixir ounce,"{'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'compound': 0.5256}",0.0,0.764,0.236,0.5256,0.0,0.5256,1.0,0,1
627621,US,47494058,R1V384BAFUBQ98,B00JGN1NHQ,611387097,"SHANY Pro 5 Piece Essential Kabuki Brush Set Synthetic and Natural Hair, X-Large",Beauty,5.0,8.0,8.0,N,Y,Absolutely Perfect!,"Absolutely Perfect! The are huge, every brush has different density and fiber. I own quite a few high(est) end Chikuhodo, Koyudo, ect, but I have not even reached for the beloved Fu-Pa since these beauties came home. Simple: synthetic bristles are designed to work with cream product.<br /> The HUGE flat kabuki F17 blends RMS's UnCover Up & W3LL People's Narcissist in just a few sweeps, keeping product waste to a bare minimum & leaves flawless streak free finish, even beats the Rae Morris Radiance. The slanted kabuki is amazing at diffusing Ilia or Vapour Multi-Use Stick blush/stains. The more dense F13 works wonders on contouring the cheekbones, temple & hairline areas, as it can be shaped into a thinner flat end bcz of the densely packed bristles. F14 good for packing some mineral powder on if needed for extra SUV protection or just quick sweep of finishing powder with the big fluffy soft F15 (natural hair) & not a worry in the world about ruining any of them when you toss them in the make up bag: $20!<br /> I don't use much product, so I wash them once a week. Not a single hair loss, shape & feel has retained just as new as well. Fantastic Make!",2015-06-07,absolutely perfect huge every ush different density fiber quite highest end chikuhodo koyudo ect even reached beloved fupa since beauty came home simple synthetic istles designed work cream product huge flat kabuki f blend rmss uncover wll people narcissist sweep keeping product waste bare minimum leaf flawless streak free finish even beat rae morris radiance slanted kabuki amazing diffusing ilium vapour multiuse stick blushstains dense f work wonder contouring cheekbone temple hairline area shaped thinner flat end bcz densely packed istles f good packing mineral powder needed extra suv protection quick sweep finishing powder big fluffy soft f natural hair worry world ruining toss make bag use much product wash week single hair loss shape feel retained new well fantastic make,absolutely perfect,shany pro piece essential kabuki brush set synthetic natural hair xlarge,"{'neg': 0.065, 'neu': 0.661, 'pos': 0.274, 'compound': 0.984}",0.065,0.661,0.274,0.984,0.3612,0.6115,1.0,0,1
1577579,US,18270838,R1DWRS3O15KFYX,B00GN0JYI0,831574578,"L'Oreal Paris Healthy Look Creme Gloss Color, Medium Red Brown/Cherry Truffle 5R (Pack of 3)",Beauty,5.0,0.0,0.0,N,Y,I have used this product for years and love it. Easy to apply and no stinky chemical ...,I have used this product for years and love it. Easy to apply and no stinky chemical smell. No ammonia so it's not had on the hair. If you call the toll free number on the back they will help you pick a color.,2015-02-01,used product year love it easy apply stinky chemical smell ammonia hair call toll free number back help pick color,used product year love it easy apply stinky chemical,loreal paris healthy look creme gloss color medium red browncherry truffle r pack,"{'neg': 0.081, 'neu': 0.453, 'pos': 0.466, 'compound': 0.8979}",0.081,0.453,0.466,0.8979,0.4019,0.6808,0.0,1,-1
1985088,US,50074274,R3862328N81KI6,B00F0BK2BQ,319821354,Mens Durable Shave Soap Cup Shinning Stainless Steel Shaving Mug Bowl,Beauty,5.0,0.0,0.0,N,Y,Five Stars,Awesome shaving cream cup for the price,2014-12-03,awesome shaving cream cup price,five star,men durable shave soap cup shinning stainless steel shaving mug bowl,"{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'compound': 0.6249}",0.0,0.494,0.506,0.6249,0.0,0.0,0.0,1,-1
2950239,US,14732211,R1SCZOWSQROU27,B00HSNWZKU,76905086,"Clear Men 2 in 1 Shampoo + Conditioner, Ocean Mineral Hydration 12.9 oz",Beauty,5.0,0.0,0.0,Y,N,from a guy with dandruff...,"So I have horrible winter dandruff, every year since I was a kid. Full on flake fest. Hate it but its true.<br />Needless to say I have tried many many products over the years and am always willing to try something new.<br />Truth is most products are a trade off, usually the tea-tree shampoos work by simply making my flakes smaller so they are still there but not as noticeable. The Head and shoulders type products all use the same Pyrithione zinc (1%) as an active ingredient, and you will find it here to. And the Nuke-em product of the dandruff world TAR. (yep I said it tar.) which works in extreme cases but smells like ...well tar..and leaves you with no flakes but hair that hates you.<br /><br />The regular everyday products seem to help but do note eliminate my dandruff, and worst off they seem to leave my hair scorched and dried out over time (anything more than a week.) I find myself alternating between a dandruff shampoo and a regular shampoo in an effort to have normal looking hair without the dandruff. It works maybe 6 months out of the year...then the cold season sets in and forget it...flake city.<br /><br />This time it was different. This shampoo works on all fronts. I found my hair was great even after 2 weeks of daily use...no need to switch back and forth to avoid dried out look, and the flakes were gone....like totally gone. Ok so I am surprised an perplexed but happy. It has the same active ingredient to start with but whatever comes next seems to work...<br /><br />A great product that I will be buying regularly.",2014-05-15,horrible winter dandruff every year since kid full flake fest hate true needle say tried many many product year always willing try something new truth product trade off usually teatree shampoo work simply making flake smaller still noticeable head shoulder type product use pyrithione zinc active ingredient find to nukeem product dandruff world tar yep said tar work extreme case smell like well tarand leaf flake hair hate you regular everyday product seem help note eliminate dandruff worst seem leave hair scorched dried time anything week find alternating dandruff shampoo regular shampoo effort normal looking hair without dandruff work maybe month yearthen cold season set forget itflake city time different shampoo work front found hair great even week daily useno need switch back forth avoid dried look flake gonelike totally gone ok surprised perplexed happy active ingredient start whatever come next seems work great product buying regularly,guy dandruff,clear men shampoo conditioner ocean mineral hydration oz,"{'neg': 0.122, 'neu': 0.68, 'pos': 0.199, 'compound': 0.9217}",0.122,0.68,0.199,0.9217,0.3818,0.0,0.0,0,-1


In [35]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating          float64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [36]:
imp_col = imp_col.dropna()

In [37]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [38]:
#try_list = [-0.56514068,  0.62580226,  0.90751079, -0.57225657, -0.7032493]
#print(X[0][0])
#print(try_list[0])
#X.shape

#try_list = [-0.56514068,  0.62580226,  0.90751079, -0.57225657, -0.7032493]
#for array in X:
#    print(X[0])
#    print(try_list[0])
#    if X[0] == try_list[0]:
#        print(x)

In [39]:
print("Number of verified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 826010
Number of unverified purchases (balanced dataset): 173879


In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [41]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=1,
                     weights='uniform')

In [42]:
y_pred = knn_classifier.predict(X_test)

In [43]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [44]:
print(cm)
print(ac)

[[   687  34071]
 [  1245 163975]]
0.8234005740631469


## Prediction

In [164]:
review_body = "I have had trouble with these refills. I've had my braun toothbrush for over 5 years, but find that these Oral-B Precision Clean refill brushes become loose and wobbly after about a month. This happened with my last order also; however I did not remember where I got it. Up to this time I've had no trouble with my toothbrush or the replacement heads."
review_title = 'refill not good'
product_title = "Oral B Precision Clean Braun Replacement Brush Head Refill"
star_rating = 1
helpful_votes = 1
total_votes = 2

In [165]:
test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

Unnamed: 0,review_body,review_title,product_title
0,"I have had trouble with these refills. I've had my braun toothbrush for over 5 years, but find that these Oral-B Precision Clean refill brushes become loose and wobbly after about a month. This happened with my last order also; however I did not remember where I got it. Up to this time I've had no trouble with my toothbrush or the replacement heads.",refill not good,Oral B Precision Clean Braun Replacement Brush Head Refill


In [166]:
out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

Unnamed: 0,review_body,review_title,product_title,new_review_body,new_review_title,new_product_title
0,"I have had trouble with these refills. I've had my braun toothbrush for over 5 years, but find that these Oral-B Precision Clean refill brushes become loose and wobbly after about a month. This happened with my last order also; however I did not remember where I got it. Up to this time I've had no trouble with my toothbrush or the replacement heads.",refill not good,Oral B Precision Clean Braun Replacement Brush Head Refill,trouble refill ive aun tooth ush year find oralb precision clean refill ushes become loose wobbly month happened last order also however remember got it time ive trouble tooth ush replacement head,refill good,oral b precision clean braun replacement brush head refill


In [167]:
rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
prop = 1/2
help_prop_id = id_for_prop(prop)
prod_title_comp = only_compound(out['new_product_title'][0])
rev_title_comp = only_compound(out['new_review_title'][0])

In [168]:
rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

array([[0.4019, 1.    , 0.4404, 0.    , 0.    ]])

In [169]:
prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

In [170]:
prediction

array(['N'], dtype=object)

In [171]:
prediction[0]

'N'

In [172]:
probabilities

array([0.5, 0.5])

In [54]:
def interpret_prediction(review, pred, proba):
    proba = [round(proba[0], 3), round(proba[1], 3)]
    if prediction[0] == 'Y':
        print(f'"{review}" is predicted to be a VERIFIED review, with {proba[1]*100}% probability of being VERIFIED and {proba[0]*100}% probability of being UNVERIFIED')
    if prediction[0] == 'N':
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {proba[0]*100}% probability of being UNVERIFIED and {proba[1]*100}% probability of being VERIFIED')
        
interpret_prediction(review_body, prediction, probabilities)

"This is an all natural product with many helpful uses. Can be used as a hair treatment,  on the skin, body and nails.  I use it on my hands and feet and they seem softer. A great product with loads of essential oils.<br />**Product provided by the company**" is predicted to be a VERIFIED review, with 85.0% probability of being VERIFIED and 15.0% probability of being UNVERIFIED


In [55]:
from joblib import dump, load

name = 'knn_beauty_model.joblib'
dump(knn_classifier, name)
knn_classifier = load(name)

## Getting Reviews

In [111]:
ind1 = list((y_test == y_pred) & (y_test == 'N'))#.index(True)
[i for i, x in enumerate(ind1) if x]

[178,
 762,
 1613,
 1888,
 1908,
 2063,
 2154,
 2558,
 3569,
 4230,
 4552,
 4883,
 4892,
 6385,
 6403,
 6748,
 7521,
 7623,
 7844,
 7881,
 8121,
 8264,
 8935,
 9195,
 9828,
 10507,
 10717,
 10929,
 11145,
 11325,
 11413,
 11585,
 11639,
 11712,
 11717,
 11862,
 12706,
 12950,
 13004,
 13091,
 13847,
 14100,
 14405,
 14679,
 15404,
 15661,
 15874,
 16389,
 16734,
 16966,
 17078,
 17495,
 17736,
 17853,
 17993,
 18145,
 18326,
 18482,
 18795,
 18859,
 18972,
 19014,
 19017,
 19324,
 20116,
 20216,
 20246,
 20519,
 21394,
 21404,
 21513,
 21808,
 21913,
 22019,
 22146,
 23276,
 23421,
 23575,
 23890,
 24471,
 24520,
 24624,
 25013,
 25146,
 25174,
 25196,
 25860,
 26026,
 26154,
 26360,
 26435,
 26591,
 26680,
 27295,
 28163,
 28661,
 29276,
 29472,
 29481,
 29661,
 29771,
 30340,
 30377,
 30805,
 31060,
 31220,
 32134,
 32185,
 32507,
 32524,
 33121,
 33141,
 33660,
 33700,
 33815,
 33970,
 34127,
 34209,
 34438,
 34947,
 35259,
 35416,
 35654,
 36032,
 36768,
 36813,
 36891,
 36892,
 36

In [150]:
ind1 = 2063
X_test[ind1]

array([0.9001, 5.    , 0.5859, 0.    , 1.    ])

In [151]:
test = list(X)
test;

In [152]:
test2 = (test == X_test[ind1])
test2

array([[False,  True, False,  True,  True],
       [False,  True, False,  True,  True],
       [False,  True, False, False, False],
       ...,
       [False, False, False,  True, False],
       [False, False, False,  True,  True],
       [False, False, False,  True,  True]])

In [153]:
ind2 = list(np.all(test2, axis = 1))#.index(True)
ind2
n_lst = [i for i, x in enumerate(ind2) if x]
(n_lst)

[775472]

In [154]:
one_file.iloc[775472].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]

Unnamed: 0,2437485
review_body,Was recommended this product through a friend. She gave me sample from the bottle that she purchased. Started using the product and I saw results within a few days! Definetely will continue to use bFortuna Gold Allure as my skin care product. AMAZING product!!
review_headline,Amazing Product!
product_title,bFortuna Gold Allure - The GOLD STANDARD Vitamin C serum VEGAN CERTIFIED Vitamin C + Vitamin E + Hyaluronic Acid + Amino Complex serum. Anti-Aging serum to help improve the appearance of fine lines and wrinkles. Gold Allure's secret formula is The ONLY serum known to be used by DOCTORS due to the quantity and QUALITY of each ingredient that Gold Allure contains
star_rating,5
helpful_votes,2
total_votes,3
verified_purchase,N
