In [1]:
# import libraries

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
# Personal_Care_Appliances

In [5]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv("C:/Users/19495/DS3ProjectFiles/amazon_reviews_us_Electronics_v1_00.tsv.gz", names =  columns, sep = '\t').iloc[1:,:]

In [6]:
elec_df = elec_df.sample(n = 200000)

In [7]:
one_file = elec_df.copy()

## Data Sampling

In [8]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

In [9]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head(2))

Number of verified purchases: 168257
Number of unverified purchases: 31743
Number of verified purchases (balanced dataset): 31743
Number of unverified purchases (balanced dataset): 31743


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2134543,US,49366425,R142SXVYRBAGGX,B003VTZPO8,909704137,MEElectronics Sports Sound-Isolating In Ear Headphones with Microphone/Remote for iPod,Electronics,2,0,0,N,N,"Very Comfortable, horrible sound","Quick review...In a nutshell:<br /><br />Pros:<br />- Extremely comfortable earbuds; probably the most comfortable and best fitting I have ever worn. I wore them on a 2 hour hike, they did not fall out once, and I never had to adjust them for comfort.<br />- Cord Length seems to be longer than most which, for me, is a plus<br />- Sound Isolation - This is caused by the fit of the earbuds in your ears. For me, I heard virtually no outside noise<br /><br />Cons:<br />- Horrible sound quality. I am NOT an ausiophile...I usually use cheapy earbuds or earphones. But these were awful. Absolutely no bass response, and the highs were absent as well. Everything sounded timmy and the low end was muted. They are going back for this reason<br /><br />Hope this helps someone!!",2013-02-01
301172,US,11201045,R31OG0JWED2ABX,B005FA38SG,889377465,Underwater Audio Waterproof iPod Shuffle,Electronics,5,0,0,N,N,Five Stars,"Underwater Audio has been the greatest IPOD I have ever had. I have to do boring physical therapy exercises in the pool along with swimming laps. My IPOD work perfectly until recently. When I called the customer service center for help troubleshooting the IPOD I was stunned that they would send me a brand new one at no cost to me except shipping. I have ordered many things from many different companies and have never had such exceptional customer service. Lynzi was exceptional. While we were talking, we got disconnected and before I could call back she called me. The original IPOD I had was an engraved gift from my husband. The one they are replacing it with will also be engraved. Up until now I have told everyone how great my IPOD has been. I can't wait to tell people how wonderful their customer service has been to me....Thank You for making swimming and exercise at the pool a lot of fun. This company is diffidently 5 stars.",2015-05-29


In [10]:
print("The Beauty sample will have size:", balanced_elec.shape)

The Beauty sample will have size: (63486, 15)


## Data Cleaning & Type Conversion

In [11]:
#one_file = balanced_elec.copy()

In [12]:
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 168257
Number of unverified purchases (balanced dataset): 31743


In [13]:
# Star rating of the format '2015-04-02' exists! In addition, np.nan exists in the helpful and total votes
one_file['star_rating'].value_counts()

5    113073
4     33908
1     22540
3     15056
2     11214
5      2616
4       582
1       504
3       292
2       215
Name: star_rating, dtype: int64

In [14]:
# Convert the type of relevant columns to int
one_file['star_rating'] = (one_file['star_rating']).apply(lambda star: np.NaN if (len(str(star)) > 3) else star)
one_file['star_rating'] = one_file['star_rating'].apply(lambda star: star if (pd.isna(star)) else int(star))
one_file['helpful_votes'] = one_file['helpful_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(vote))
one_file['total_votes'] = one_file['total_votes'].apply(lambda vote: vote if (pd.isna(vote)) else int(int(vote)))

In [15]:
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\19495\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

In [17]:
cleaned2 = df_cleaning(one_file, 'review_body')
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
2134543,"Quick review...In a nutshell:<br /><br />Pros:<br />- Extremely comfortable earbuds; probably the most comfortable and best fitting I have ever worn. I wore them on a 2 hour hike, they did not fall out once, and I never had to adjust them for comfort.<br />- Cord Length seems to be longer than most which, for me, is a plus<br />- Sound Isolation - This is caused by the fit of the earbuds in your ears. For me, I heard virtually no outside noise<br /><br />Cons:<br />- Horrible sound quality. I am NOT an ausiophile...I usually use cheapy earbuds or earphones. But these were awful. Absolutely no bass response, and the highs were absent as well. Everything sounded timmy and the low end was muted. They are going back for this reason<br /><br />Hope this helps someone!!",quick reviewin nutshell pro extremely comfortable earbuds probably comfortable best fitting ever worn wore hour hike fall once never adjust comfort cord length seems longer which me plus sound isolation caused fit earbuds ear me heard virtually outside noise con horrible sound quality ausiophilei usually use cheapy earbuds earphone awful absolutely bass response high absent well everything sounded timmy low end muted going back reason hope help someone
2767949,"Just what I was looking for! Not available at any local retail stores (that I could find...), and works perfectly! Great price, too!",looking for available local retail store could find work perfectly great price too
908164,"Works perfect, exactly what we needed for the giant 80's style tv we inherited. We can use our Roku with absolutely no issues. We have been using it for about a year now.",work perfect exactly needed giant s style tv inherited use roku absolutely issue using year now
301172,"Underwater Audio has been the greatest IPOD I have ever had. I have to do boring physical therapy exercises in the pool along with swimming laps. My IPOD work perfectly until recently. When I called the customer service center for help troubleshooting the IPOD I was stunned that they would send me a brand new one at no cost to me except shipping. I have ordered many things from many different companies and have never had such exceptional customer service. Lynzi was exceptional. While we were talking, we got disconnected and before I could call back she called me. The original IPOD I had was an engraved gift from my husband. The one they are replacing it with will also be engraved. Up until now I have told everyone how great my IPOD has been. I can't wait to tell people how wonderful their customer service has been to me....Thank You for making swimming and exercise at the pool a lot of fun. This company is diffidently 5 stars.",underwater audio greatest ipod ever had boring physical therapy exercise pool along swimming lap ipod work perfectly recently called customer service center help troubleshooting ipod stunned would send new one cost except shipping ordered many thing many different company never exceptional customer service lynzi exceptional talking got disconnected could call back called me original ipod engraved gift husband one replacing also engraved told everyone great ipod been cant wait tell people wonderful customer service methank making swimming exercise pool lot fun company diffidently star
1061624,These do not fit over my adult size ears as advertized. They provide zero isolation of background noise. The overhead strap is poorly cushioned and begins to be uncomfortable in seconds. If you need any of those functions then keep looking.,fit adult size ear advertized provide zero isolation background noise overhead strap poorly cushioned begin uncomfortable second need function keep looking


In [18]:
cleaned2 = df_cleaning(one_file, 'review_headline')
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
2134543,"Very Comfortable, horrible sound",comfortable horrible sound
2767949,Tollink Selector Switch,tollink selector switch
908164,"Works perfect, exactly what we needed for the giant 80's ...",work perfect exactly needed giant s
301172,Five Stars,five star
1061624,Poor choice,poor choice


In [19]:
cleaned2 = df_cleaning(one_file, 'product_title')
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
2134543,MEElectronics Sports Sound-Isolating In Ear Headphones with Microphone/Remote for iPod,meelectronics sport soundisolating ear headphone microphoneremote ipod
2767949,C2G/Cables to Go 28734 Toslink Digital Audio Selector Switch,cgcables go toslink digital audio selector switch
908164,Generic Mini HD Video Converter Box HDMI to AV/CVBS L/R Video Adapter HDMI2AV Support NTSC and PAL Output,generic mini hd video converter box hdmi avcvbs lr video adapter hdmiav support ntsc pal output
301172,Underwater Audio Waterproof iPod Shuffle,underwater audio waterproof ipod shuffle
1061624,Nakamichi NK780 Over-The-Ear Headphones Black,nakamichi nk overtheear headphone black


In [20]:
print("Number of verified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(cleaned2[cleaned2['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 168247
Number of unverified purchases (balanced dataset): 31740


## Vader Sentiment Analysis

In [21]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    return snt

In [22]:
one_file = cleaned2.copy()

In [23]:
# Add 4 new columns to cleaned_elec df: neg_prop, neu_prop, pos_prop, compound_prop
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [24]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [25]:
#one_file.to_csv('beauty_data_cleaned.csv')

In [26]:
one_file.head()
print("Number of verified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(one_file[one_file['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 168247
Number of unverified purchases (balanced dataset): 31740


## Confusion Matrix

In [27]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

## KNN

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [29]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [30]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

In [31]:
def helpful_prop(df):  
    vote_series = pd.Series(df['helpful_votes'] / df['total_votes'])
    # !! All nan values in votes_prop should be changed to zero: this means that 0/0 occurred !!
    vote_series = vote_series.fillna(0)
    return vote_series

In [32]:
# Helper function for id'ing the review body sentiments into -1, 0, 1 depending on majority sentiment
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
    
    # If at idx 1, neutral sent
    if ind == 1:
        return 0
    # If at idx 0, negative sent
    elif ind == 0:
        return -1
    else:
        return 1

In [33]:
# Helper function for id'ing the helper vote proportion
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [34]:
one_file['help_prop'] = helpful_prop(one_file)
one_file['rev_bod_id'] = one_file['rev_dict'].apply(id_for_dictionary)
one_file['help_prop_id'] = one_file["help_prop"].apply(id_for_prop)
one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,new_review_body,new_review_headline,new_product_title,rev_dict,neg_prop,neu_prop,pos_prop,compound_prop,prod_title_comp,rev_title_comp,help_prop,rev_bod_id,help_prop_id
2134543,US,49366425,R142SXVYRBAGGX,B003VTZPO8,909704137,MEElectronics Sports Sound-Isolating In Ear Headphones with Microphone/Remote for iPod,Electronics,2,0,0,N,N,"Very Comfortable, horrible sound","Quick review...In a nutshell:<br /><br />Pros:<br />- Extremely comfortable earbuds; probably the most comfortable and best fitting I have ever worn. I wore them on a 2 hour hike, they did not fall out once, and I never had to adjust them for comfort.<br />- Cord Length seems to be longer than most which, for me, is a plus<br />- Sound Isolation - This is caused by the fit of the earbuds in your ears. For me, I heard virtually no outside noise<br /><br />Cons:<br />- Horrible sound quality. I am NOT an ausiophile...I usually use cheapy earbuds or earphones. But these were awful. Absolutely no bass response, and the highs were absent as well. Everything sounded timmy and the low end was muted. They are going back for this reason<br /><br />Hope this helps someone!!",2013-02-01,quick reviewin nutshell pro extremely comfortable earbuds probably comfortable best fitting ever worn wore hour hike fall once never adjust comfort cord length seems longer which me plus sound isolation caused fit earbuds ear me heard virtually outside noise con horrible sound quality ausiophilei usually use cheapy earbuds earphone awful absolutely bass response high absent well everything sounded timmy low end muted going back reason hope help someone,comfortable horrible sound,meelectronics sport soundisolating ear headphone microphoneremote ipod,"{'neg': 0.17, 'neu': 0.598, 'pos': 0.232, 'compound': 0.7706}",0.17,0.598,0.232,0.7706,0.0,-0.0516,0.0,0,-1
2767949,US,17149562,R27GTDUSJEAECY,B0002OF2FC,679938792,C2G/Cables to Go 28734 Toslink Digital Audio Selector Switch,Electronics,5,0,0,N,Y,Tollink Selector Switch,"Just what I was looking for! Not available at any local retail stores (that I could find...), and works perfectly! Great price, too!",2010-01-30,looking for available local retail store could find work perfectly great price too,tollink selector switch,cgcables go toslink digital audio selector switch,"{'neg': 0.0, 'neu': 0.57, 'pos': 0.43, 'compound': 0.8519}",0.0,0.57,0.43,0.8519,0.0,0.0,0.0,0,-1
908164,US,22178693,R3NCYQ1MF62DAK,B008FO7PQA,222420710,Generic Mini HD Video Converter Box HDMI to AV/CVBS L/R Video Adapter HDMI2AV Support NTSC and PAL Output,Electronics,5,0,0,N,Y,"Works perfect, exactly what we needed for the giant 80's ...","Works perfect, exactly what we needed for the giant 80's style tv we inherited. We can use our Roku with absolutely no issues. We have been using it for about a year now.",2014-12-03,work perfect exactly needed giant s style tv inherited use roku absolutely issue using year now,work perfect exactly needed giant s,generic mini hd video converter box hdmi avcvbs lr video adapter hdmiav support ntsc pal output,"{'neg': 0.0, 'neu': 0.802, 'pos': 0.198, 'compound': 0.5719}",0.0,0.802,0.198,0.5719,0.4019,0.5719,0.0,0,-1
301172,US,11201045,R31OG0JWED2ABX,B005FA38SG,889377465,Underwater Audio Waterproof iPod Shuffle,Electronics,5,0,0,N,N,Five Stars,"Underwater Audio has been the greatest IPOD I have ever had. I have to do boring physical therapy exercises in the pool along with swimming laps. My IPOD work perfectly until recently. When I called the customer service center for help troubleshooting the IPOD I was stunned that they would send me a brand new one at no cost to me except shipping. I have ordered many things from many different companies and have never had such exceptional customer service. Lynzi was exceptional. While we were talking, we got disconnected and before I could call back she called me. The original IPOD I had was an engraved gift from my husband. The one they are replacing it with will also be engraved. Up until now I have told everyone how great my IPOD has been. I can't wait to tell people how wonderful their customer service has been to me....Thank You for making swimming and exercise at the pool a lot of fun. This company is diffidently 5 stars.",2015-05-29,underwater audio greatest ipod ever had boring physical therapy exercise pool along swimming lap ipod work perfectly recently called customer service center help troubleshooting ipod stunned would send new one cost except shipping ordered many thing many different company never exceptional customer service lynzi exceptional talking got disconnected could call back called me original ipod engraved gift husband one replacing also engraved told everyone great ipod been cant wait tell people wonderful customer service methank making swimming exercise pool lot fun company diffidently star,five star,underwater audio waterproof ipod shuffle,"{'neg': 0.035, 'neu': 0.69, 'pos': 0.275, 'compound': 0.9786}",0.035,0.69,0.275,0.9786,0.0,0.0,0.0,0,-1
1061624,US,36379732,RG3VBNUFUWM2J,B00BAGDPFA,898695577,Nakamichi NK780 Over-The-Ear Headphones Black,Electronics,2,0,0,N,N,Poor choice,These do not fit over my adult size ears as advertized. They provide zero isolation of background noise. The overhead strap is poorly cushioned and begins to be uncomfortable in seconds. If you need any of those functions then keep looking.,2014-10-05,fit adult size ear advertized provide zero isolation background noise overhead strap poorly cushioned begin uncomfortable second need function keep looking,poor choice,nakamichi nk overtheear headphone black,"{'neg': 0.205, 'neu': 0.698, 'pos': 0.097, 'compound': -0.4215}",0.205,0.698,0.097,-0.4215,0.0,-0.4767,0.0,0,-1


In [35]:
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating            int64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [36]:
imp_col = imp_col.dropna()

In [37]:
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

In [38]:
#try_list = [-0.56514068,  0.62580226,  0.90751079, -0.57225657, -0.7032493]
#print(X[0][0])
#print(try_list[0])
#X.shape

#try_list = [-0.56514068,  0.62580226,  0.90751079, -0.57225657, -0.7032493]
#for array in X:
#    print(X[0])
#    print(try_list[0])
#    if X[0] == try_list[0]:
#        print(x)

In [39]:
print("Number of verified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'Y']))
print("Number of unverified purchases (balanced dataset):", len(imp_col[imp_col['verified_purchase'] == 'N']))

Number of verified purchases (balanced dataset): 168247
Number of unverified purchases (balanced dataset): 31740


In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [41]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=1,
                     weights='uniform')

In [42]:
y_pred = knn_classifier.predict(X_test)

In [43]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [44]:
print(cm)
print(ac)

[[  190  6185]
 [  335 33288]]
0.8369918495924796


## Prediction

In [152]:
review_body = "This mp3 player is very good and the sound quality is amazing. The one problem i had was the headphone jack was non functional so i sent it to the factory which its still under warranty but i had to pay $40 for labor and its been like 2 weeks and i still havent got it back."
review_title = 'Good but one problem'
product_title = "Creative Zen Touch 20 GB MP3 Player"
star_rating = 4
helpful_votes = 10
total_votes = 11

In [153]:
test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

Unnamed: 0,review_body,review_title,product_title
0,This mp3 player is very good and the sound quality is amazing. The one problem i had was the headphone jack was non functional so i sent it to the factory which its still under warranty but i had to pay $40 for labor and its been like 2 weeks and i still havent got it back.,Good but one problem,Creative Zen Touch 20 GB MP3 Player


In [154]:
out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

Unnamed: 0,review_body,review_title,product_title,new_review_body,new_review_title,new_product_title
0,This mp3 player is very good and the sound quality is amazing. The one problem i had was the headphone jack was non functional so i sent it to the factory which its still under warranty but i had to pay $40 for labor and its been like 2 weeks and i still havent got it back.,Good but one problem,Creative Zen Touch 20 GB MP3 Player,mp player good sound quality amazing one problem headphone jack non functional sent factory still warranty pay labor like week still havent got back,good one problem,creative zen touch gb mp player


In [155]:
rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
prop = 10/11
help_prop_id = id_for_prop(prop)
prod_title_comp = only_compound(out['new_product_title'][0])
rev_title_comp = only_compound(out['new_review_title'][0])

In [156]:
rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

array([[0.4404, 4.    , 0.0516, 0.    , 1.    ]])

In [157]:
prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

In [158]:
prediction

array(['N'], dtype=object)

In [160]:
probabilities

array([0.6, 0.4])

In [54]:
def interpret_prediction(review, pred, proba):
    proba = [round(proba[0], 3), round(proba[1], 3)]
    if prediction[0] == 'Y':
        print(f'"{review}" is predicted to be a VERIFIED review, with {proba[1]*100}% probability of being VERIFIED and {proba[0]*100}% probability of being UNVERIFIED')
    if prediction[0] == 'N':
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {proba[0]*100}% probability of being UNVERIFIED and {proba[1]*100}% probability of being VERIFIED')
        
interpret_prediction(review_body, prediction, probabilities)

"I have had trouble with these refills. I've had my braun toothbrush for over 5 years, but find that these Oral-B Precision Clean refill brushes become loose and wobbly after about a month. This happened with my last order also; however I did not remember where I got it. Up to this time I've had no trouble with my toothbrush or the replacement heads." is predicted to be a VERIFIED review, with 70.0% probability of being VERIFIED and 30.0% probability of being UNVERIFIED


In [161]:
from joblib import dump, load

name = 'knn_electronics_thousands_model.joblib'
dump(knn_classifier, name)
knn_classifier = load(name)

## Getting Reviews

In [103]:
ind1 = list((y_test == y_pred) & (y_test == 'N'))#.index(True)
#  & (y_test == 'N')
[i for i, x in enumerate(ind1) if x]

[98,
 249,
 319,
 391,
 1014,
 1268,
 1743,
 2050,
 2277,
 2291,
 2777,
 2972,
 2989,
 3133,
 3533,
 3559,
 3570,
 3588,
 3592,
 3703,
 4433,
 4598,
 4677,
 4861,
 5083,
 5089,
 5750,
 6424,
 6708,
 6786,
 6990,
 7057,
 7095,
 7237,
 7362,
 7430,
 7516,
 8129,
 8149,
 8213,
 8401,
 8402,
 8574,
 8595,
 8601,
 8804,
 8840,
 8853,
 8879,
 8884,
 8950,
 9098,
 9101,
 9672,
 9746,
 9931,
 10051,
 10098,
 10248,
 10498,
 11259,
 11487,
 11621,
 11906,
 12451,
 12833,
 12850,
 13340,
 13394,
 13496,
 13503,
 13545,
 13589,
 14159,
 14286,
 14387,
 14608,
 15155,
 15187,
 15367,
 15616,
 15910,
 15930,
 16997,
 17082,
 17148,
 17178,
 17193,
 17333,
 17703,
 17920,
 18067,
 18540,
 18623,
 18812,
 18975,
 19217,
 19253,
 19433,
 19711,
 20232,
 20484,
 21049,
 21234,
 21656,
 21743,
 21751,
 21853,
 21972,
 21998,
 22428,
 22659,
 23123,
 23312,
 23384,
 23916,
 24422,
 24561,
 24950,
 25145,
 25459,
 25634,
 25654,
 25890,
 25904,
 26162,
 26239,
 26247,
 26322,
 26371,
 26441,
 27152,
 2724

In [146]:
ind1 = 2050
X_test[ind1]

array([0.4404, 4.    , 0.0516, 0.    , 1.    ])

In [147]:
test = list(X)
test;

In [148]:
test2 = (test == X_test[ind1])
test2

array([[False, False, False,  True, False],
       [False, False, False,  True, False],
       [False, False, False,  True, False],
       ...,
       [False, False, False, False,  True],
       [False,  True, False, False, False],
       [False, False, False,  True,  True]])

In [149]:
ind2 = list(np.all(test2, axis = 1))#.index(True)
ind2
n_lst = [i for i, x in enumerate(ind2) if x]
(n_lst)

[139317]

In [151]:
one_file.iloc[139317].to_frame().loc[['review_body', 'review_headline', 'product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase']]

Unnamed: 0,3047533
review_body,This mp3 player is very good and the sound quality is amazing. The one problem i had was the headphone jack was non functional so i sent it to the factory which its still under warranty but i had to pay $40 for labor and its been like 2 weeks and i still havent got it back.
review_headline,Good but one problem
product_title,Creative Zen Touch 20 GB MP3 Player
star_rating,4
helpful_votes,10
total_votes,11
verified_purchase,N
