In [1]:
# import libraries

'''
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
'''

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# These are all the imports needed for the assignment
%matplotlib inline

# Import nltk package (Natural Language Toolkit)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [2]:
import pyspark

In [3]:
import dask.dataframe as dd

In [4]:
# Download the NLTK English tokenizer and the stopwords of all languages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
files = ['amazon_reviews_us_Electronics_v1_00.tsv', \
         'amazon_reviews_us_Gift_Card_v1_00.tsv', \
         'amazon_reviews_us_Major_Appliances_v1_00.tsv', \
         'amazon_reviews_us_Office_Products_v1_00.tsv', \
         'amazon_reviews_us_Shoes_v1_00.tsv', \
         'amazon_reviews_us_Toys_v1_00.tsv', \
         'amazon_reviews_us_Watches_v1_00.tsv']

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [7]:
columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', \
           'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
elec_df = pd.read_csv('fars_data/' + files[0], names = columns, sep = '\t').iloc[1:,:]
print(elec_df.shape)

(1440997, 15)


In [8]:
elec_df = elec_df.sample(n = 500000)

columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
len(columns)

one_file = pd.read_csv('fars_data/' + files[0], names = columns, sep = '\t').iloc[1:,:]
one_file.head()

one_file.shape[0]+1

In [9]:
one_file = elec_df

In [10]:
def df_sampling(df):
    # Since we know that there are more unverified than verified --> we sample based on that
    
    # Since there are no data values in 'verified_purchase' columns that deviate from 'Y' or 'N' we proceed
    verified_count_df = df[df['verified_purchase'] == 'Y']
    unverified_count_df = df[df['verified_purchase'] == 'N']
    
    print("Number of verified purchases:", len(verified_count_df))
    print("Number of unverified purchases:", len(unverified_count_df))
    
    sample_len = len(unverified_count_df)
    
    verified_sample_df = verified_count_df.sample(n = sample_len)
    unified_df = pd.concat([unverified_count_df, verified_sample_df])
    
    print("Number of verified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'Y']))
    print("Number of unverified purchases (balanced dataset):", len(unified_df[unified_df['verified_purchase'] == 'N']))
    
    return unified_df

#test.rename(columns=test.iloc[0])
training_data = pd.DataFrame() #initialize an empty dataframe
testing_data = pd.DataFrame()

one_file = pd.read_csv(data_location + files[1], names = columns, sep = '\t').iloc[1:,:]

In [11]:
one_file.shape

(500000, 15)

In [12]:
balanced_elec = df_sampling(one_file)
display(balanced_elec.head())

Number of verified purchases: 451717
Number of unverified purchases: 48283
Number of verified purchases (balanced dataset): 48283
Number of unverified purchases (balanced dataset): 48283


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1172954,US,42880325,R38A1F4H4LXAEI,B0035B4LJM,301580759,Adapter HDMI Female to DVI Male Video Adapter,Electronics,3,0,0,N,N,Three Stars,IT S OKAY,2014-08-22
261445,US,29275858,RYGONWHK35H3Q,B001TIG36C,329010926,VideoSecu Mounts Tilt TV Wall Mount,Electronics,5,0,0,N,N,Good product,Super like this product. It was easy to install n looks great after mounting my tv.,2015-06-11
499665,US,50675599,R20N1WIFACWYCN,B00TEIIAZQ,955614152,Sennheiser Sports Earbud Neckband Headset Apple Devices,Electronics,2,0,2,Y,N,I Dare you To Find Better (at this point you can) updated 7/20/15,"Finally. We all workout right? These are the first pair of headphones that actually stay put no matter how vigorous one may exercise. The are snug in the ear while not being uncomfortable. As far as sound goes, like another reviewer stated, I have never owned a pair of inexpensive headphones that even come close to sounding as good as these do. The bass and higher notes are even and if you are listening to an audio recording where you want to hear every note this will not disappoint.<br /><br />The microphone after making several calls, I have been told I sound crystal clear. Another reviewer stated he had problems with Apple and since I own a Galaxy I can not make comment as to how they perform in the I-World. I also appreciate the color of the cord as it stands out and is easy to find a drawer of black cords. You can not go wrong buying these. 5 Stars with no reservations.<br /><br />Right earpiece blew out, followed a few seconds later by the left earpiece. Was not playing anything at a loud level, heard a pop and in the trash they went. They were great while they lasted, but for 100 bucks I expect more than 3 months of use. 5 stars to 2 stars, the only reason they aren't one is because when they worked they were by far my favorite pair I have ever owned.",2015-03-24
1219720,US,21830430,R1F7JISVKT6UV1,B003L1ZYYM,617978254,"AmazonBasics High-Speed HDMI Cable - 6.5 Feet (2 Meters) Supports Ethernet, 3D, 4K and Audio Return",Electronics,5,0,0,N,N,Good for the price.,Basic cable and works fine.,2014-08-05
26696,US,16111170,R4TR1HZ6N6R5M,B00C3MNMSU,164127268,Ultralast ULUBC1 Universal Li-Ion/NiCd/NiMh Battery Charger,Electronics,1,0,0,N,N,Bought this locally. It worked all of 2 days ...,Bought this locally. It worked all of 2 days before it stopped charging the batteries.,2015-08-23


In [13]:
def convert_to_int(x):
    return int(float(x))

In [14]:
#convert all mixed datatypes --> string objects (unable to convert to int)
one_file['customer_id'] = one_file['customer_id'].apply(int) #1
one_file['product_parent'] = one_file['product_parent'].apply(int) #4
one_file['star_rating'] = one_file['star_rating'].apply(int) #7
one_file['helpful_votes'] = one_file['helpful_votes'].apply(int) #8
one_file['total_votes'] = one_file['total_votes'].apply(int) #9

one_file.isnull().sum().sort_values(ascending=False)

one_file.dropna(inplace=True) #drop all of the missing values
one_file.isnull().sum().sort_values(ascending=False)

In [15]:
#there may be tabs in the review that is what is leading to the excluded cases when converting to df

'''
train_data['verified_purchase'].value_counts()
test_data['verified_purchase'].value_counts()
'''

one_file.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1235505,US,47030502,R3S7DOBICFGI93,B00HCMZ19Y,574974882,SanDisk Clip Jam MP3 Player,Electronics,5,0,0,N,Y,Five Stars,Espectacualr,2014-07-30
1026107,US,34165510,R1HZ8C8NS3TH4P,B002MVH7EC,269120324,WKA12-7.5F Sealed Battery Replacement 12V 8ah F2,Electronics,4,0,0,N,Y,Works fine. It was installed several months ago,Works fine. It was installed several months ago.,2014-10-19
810438,US,27204588,R136FE83BZEB3N,B00JA9EQMU,966915237,"Kocaso HP-500 Headphones, Black",Electronics,5,0,0,N,Y,Wonderful!,"Wow! I love them. I took them out as soon as I received it and have not taken them off since. Great sound, excellent definition and I only have it on 3. Nice, clear, deep base without it being distorted. Decided to try it to on Michael Jackson and I hear everything with great clarity. It is also nice that the cords are tangle free and plug in from both ends. The mic works well on my smartphone too! For the price, it was well worth every penny. Will order a second pair within a few weeks for my son as a gift.",2014-12-31
77285,US,20165378,R2S44U56RGPXVK,B00000JBRV,624246321,Labtec LVA7330 ClearVoice Head Microphone (Discontinued by Manufacturer),Electronics,5,0,0,N,Y,"Just a microphone, but a good one","My bad for not reading the details, but I thought I was buying a headset with headphones and microphone. Once I got over this disappointment, i was pleased with the noise-dampening feature.",2015-08-08
1115085,US,32146146,R140AHRJD42L78,B003DKL544,620199611,Philips In-Ear Headphones Mix and Match with 5 Sets of Interchangeable Caps,Electronics,5,0,0,N,Y,they don't make headphones like this anymore,"Get them while you can, they don't make headphones like this anymore.",2014-09-14


#to check te datatypes within each column

print(train_data.applymap(type))
data_types = train_data.applymap(type)

In [16]:
#the number of customers that gave multiple reviews
#may be valueable to investigate
one_file.groupby('customer_id').count()['marketplace'].sort_values(ascending = False).value_counts()

1     388070
2      34469
3       7139
4       2262
5        915
6        424
7        226
8        106
9         69
10        51
11        33
13        17
12        16
15        12
14         7
17         6
16         4
19         3
25         3
20         2
22         2
35         2
37         2
18         1
21         1
23         1
24         1
27         1
29         1
30         1
34         1
61         1
Name: marketplace, dtype: int64

In [17]:
one_file['star_rating'].value_counts()

5    305208
4     77251
1     55175
3     36149
2     26217
Name: star_rating, dtype: int64

Data Cleaning

In [18]:
# Import re for text cleaning purposes
import re

import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def df_cleaning(df, col):
    # Drop rows with na values
    df.dropna(inplace = True)
    
    new_col_name = 'new_' + col
    
    df[new_col_name] = df[col].copy() 
    
    # Remove unwanted formatting characters
    format_strs = dict.fromkeys(['<br /><br />', '&#34', 'br', '&quot', '<br />'], ' ')
    
    for key in format_strs:
        df[new_col_name] = df[new_col_name].apply(lambda review: review.replace(key, format_strs[key]))
    # removing quotes produces smthg like this --> 'The product has great ;sound; --> we must remove punctuation

    
    # Case normalization (lower case)
    df[new_col_name] = df[new_col_name].str.lower()
    
    remove_dict = {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "",
                   "(": "", ")":""}
    for key, val in remove_dict.items():
        df[new_col_name] = df[new_col_name].apply(
            lambda x: x.replace(key, val))
        
    # Remove stopwords
    stop_lst = stopwords.words('english')
    #stop_lst += (["can't", "i'm" "i'd", "i've", "i'll", "that's", "there's", "they're"])
    # ****Do we not have to take stopwords out BEFORE removing punctuation? Otherwise words with punct like “cant” remains there
    df[new_col_name] = df[new_col_name].apply(lambda text_body: " ".join([word for word in text_body.split() if word not in (stop_lst)]))
    
    # Removing Unicode Chars (punctuation, URL, @)
    df[new_col_name] = df[new_col_name].apply(lambda rev: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", rev))
    
    # Lemmatization
    word_lemmatizer = WordNetLemmatizer()
    df[new_col_name] = df[new_col_name].apply(lambda txt: " ".join([(word_lemmatizer.lemmatize(word)) for word in txt.split()]))
    
    return df

cleaned = df_cleaning(one_file)

In [20]:
cleaned2 = df_cleaning(one_file, 'review_body')

In [21]:
cleaned2.get(['review_body', 'new_review_body']).head()

Unnamed: 0,review_body,new_review_body
1235505,Espectacualr,espectacualr
1026107,Works fine. It was installed several months ago.,work fine installed several month ago
810438,"Wow! I love them. I took them out as soon as I received it and have not taken them off since. Great sound, excellent definition and I only have it on 3. Nice, clear, deep base without it being distorted. Decided to try it to on Michael Jackson and I hear everything with great clarity. It is also nice that the cords are tangle free and plug in from both ends. The mic works well on my smartphone too! For the price, it was well worth every penny. Will order a second pair within a few weeks for my son as a gift.",wow love them took soon received taken since great sound excellent definition nice clear deep base without distorted decided try michael jackson hear everything great clarity also nice cord tangle free plug end mic work well smartphone too price well worth every penny order second pair within week son gift
77285,"My bad for not reading the details, but I thought I was buying a headset with headphones and microphone. Once I got over this disappointment, i was pleased with the noise-dampening feature.",bad reading detail thought buying headset headphone microphone got disappointment pleased noisedampening feature
1115085,"Get them while you can, they don't make headphones like this anymore.",get can make headphone like anymore


In [22]:
cleaned2 = df_cleaning(one_file, 'review_headline')

In [23]:
cleaned2.get(['review_headline', 'new_review_headline']).head()

Unnamed: 0,review_headline,new_review_headline
1235505,Five Stars,five star
1026107,Works fine. It was installed several months ago,work fine installed several month ago
810438,Wonderful!,wonderful
77285,"Just a microphone, but a good one",microphone good one
1115085,they don't make headphones like this anymore,make headphone like anymore


In [24]:
cleaned2 = df_cleaning(one_file, 'product_title')

In [25]:
cleaned2.get(['product_title', 'new_product_title']).head()

Unnamed: 0,product_title,new_product_title
1235505,SanDisk Clip Jam MP3 Player,sandisk clip jam mp player
1026107,WKA12-7.5F Sealed Battery Replacement 12V 8ah F2,wkaf sealed battery replacement v ah f
810438,"Kocaso HP-500 Headphones, Black",kocaso hp headphone black
77285,Labtec LVA7330 ClearVoice Head Microphone (Discontinued by Manufacturer),labtec lva clearvoice head microphone discontinued manufacturer
1115085,Philips In-Ear Headphones Mix and Match with 5 Sets of Interchangeable Caps,philip inear headphone mix match set interchangeable cap


In [26]:
cleaned2.iloc[0].get('review_body')

'Espectacualr'

### Vader Sentiment Analysis

In [27]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def get_sentiment_scores(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    #print(f"{sentence} {str(snt)}")
    return snt

In [28]:
get_sentiment_scores(cleaned2.iloc[0].get('review_body')) #without lemmitization

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [29]:
get_sentiment_scores(cleaned2.iloc[0].get('new_review_body')) #with lemmitization

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [30]:
one_file = cleaned2

In [31]:
one_file['rev_dict'] = one_file['new_review_body'].apply(get_sentiment_scores)

In [32]:
def get_neg(review_dict):
    return review_dict['neg']

def get_neu(review_dict):
    return review_dict['neu']

def get_pos(review_dict):
    return review_dict['pos']

def get_compound(review_dict):
    return review_dict['compound']

def only_compound(x):
    dct = get_sentiment_scores(x)
    return dct['compound']

In [33]:
#get neg prop
one_file['neg_prop'] = one_file['rev_dict'].apply(get_neg)
#get neu prop
one_file['neu_prop'] = one_file['rev_dict'].apply(get_neu)
#get pos prop
one_file['pos_prop'] = one_file['rev_dict'].apply(get_pos)
#get compound prop
one_file['compound_prop'] = one_file['rev_dict'].apply(get_compound)

In [34]:
#save the dataframe as a csv file
one_file.to_csv('electronics_data_cleaned.csv')

### Sentiment Analysis

#df is all data other than rating 3
df = one_file[one_file['star_rating'] != 3]

df['star_rating'].value_counts()

df['positively_rated'] = np.where(df['star_rating'] > 3, 1, 0)

df['positively_rated'].value_counts()

sns.countplot(df['positively_rated'])

##### On all of the data

train_data, test_data = np.split(one_file.sample(frac=1, random_state=1729), [int(0.7 * len(one_file))])
print(train_data.shape, test_data.shape)

X_train = train_data['review_body']
X_train.iloc[0] #to_frame()

Y_train = train_data['star_rating']

X_test = test_data['review_body']
Y_test = test_data['star_rating']

#### TF-IDF - to get predictions of what star rating will be based on text of review

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer().fit(X_train)

len(vect.get_feature_names()) #unique words

X_train_vect = vect.transform(X_train)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_vect, Y_train)

predictions = model.predict(vect.transform(X_test))

len(predictions)

###### Confusion Matrix - shows probabilities for whether cat data is predicted correctly

In [35]:
def plot_confusion_matrix(cm, target_names,
                          fname, epoch,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True, target=None):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools
    plt.style.use('default')

    # # only true if it weren't normalized:
    # accuracy = np.trace(cm) / float(np.sum(cm))
    # misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm[np.isnan(cm)] = 0.0

    fig = plt.figure(figsize=(5, 4))
    ax = plt.axes()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    if target == "rule-based":
        plt.title(title + ' for rule-based PF')
    else:
        plt.title(title + ' for MLPF at epoch ' + str(epoch))

    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlim(-1, len(target_names))
    plt.ylim(-1, len(target_names))
    plt.xlabel('Predicted label')
    # plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    plt.savefig(fname + '.png')
    plt.savefig(fname + '.pdf')
    #plt.close(fig)

    return fig, ax

### SVM Notes

- **Bag of Words (BoW)**
        - BoW encodes an input sentence as the frequency of each word in the sentence. 
        - In this approach, all words contribute equally to the feature vectors.
- **Term Frequency - Inverse Document Frequency (TF-IDF)**
        - TF-IDF is a measure of how important each term is to a specific document, as compared to an overall corpus. 
        - TF-IDF encodes each word as its frequency in the document of interest, divided by a measure of how common the word is across all documents (the corpus).
        - Using this approach, each word contributes differently to the feature vectors.
        - The assumption behind using TF-IDF is that words that appear commonly everywhere are not that informative about what is specifically interesting about a document of interest, so it is tuned to representing a document in terms of the words it uses that are different from other documents. 

- To compare those 2 methods, we will first apply them on the same dataset to analyse sentiment (how positive or negative a text is). In order to make the comparison fair, an **SVM (support vector machine)** classifier will be used to classify positive reviews and negative reviews.

- SVM is a simple yet powerful and interpretable linear model. To use it as a classifier, we need to have at least 2 splits of the data: training data and test data. The training data is used to tune the weight parameters in the SVM to learn an optimal way to classify the training data. We can then test this trained SVM classifier on the test data, to see how well it works on data that the classifier has not seen before. 

We will now create a CountVectorizer object to transform the text data into vectors with numerical values.

To do so, we will initialize a CountVectorizer object, and name it as vectorizer.

4 arguments to initialize a CountVectorizer:

* analyzer: 'word'
 Specify to analyze data from word-level.
 
* max_features: 2000
 Set a max number of unique words.
 
* tokenizer: word_tokenize
 Set to tokenize the text data by using the word_tokenizer from NLTK .
 
* stop_words: stopwords.words('english')
 Set to remove all stopwords in English. We do this since they generally don't provide useful discriminative information.

## KNN

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [37]:
one_file.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'new_review_body',
       'new_review_headline', 'new_product_title', 'rev_dict', 'neg_prop',
       'neu_prop', 'pos_prop', 'compound_prop'],
      dtype='object')

In [38]:
def convert_to_id(x):
    alphabet = {'a': 1, 'c': 3, 'b': 2, 'e': 5, 'd': 4, 'g': 7, 'f': 6, 'i': 9, 'h': 8, 'k': 11, 'j': 10, 'm': 13, 'l': 12, 'o': 15, 'n': 14, 'q': 17, 'p': 16, 's': 19, 'r': 18, 'u': 21, 't': 20, 'w': 23, 'v': 22, 'y': 25, 'x': 24, 'z': 26}
    out = ''
    for i in x:
        if i.lower() in alphabet.keys():
            out += str(alphabet[i.lower()])
        else:
            out += i
    return out

In [39]:
one_file['product_id_convert'] = one_file.get("product_id").apply(convert_to_id)

In [40]:
one_file['product_category_convert'] = one_file.get("product_category").apply(convert_to_id)

In [41]:
one_file['prod_title_comp'] = one_file.get("new_product_title").apply(only_compound)

In [42]:
one_file['rev_title_comp'] = one_file.get("new_review_headline").apply(only_compound)

def help_prop(vals):
    if vals[1] == 0:
        return 0.5
    else:
        return vals[0] / vals[1]

In [43]:
#NEED TO THINK OF WHAT WOULD HAPPEN IF THE NUMBER OF TOTAL VOTES WOULD BE 0
one_file['help_prop'] = one_file.get("helpful_votes") / one_file.get("total_votes")
#one_file['help_prop'] = one_file.get("helpful_votes").apply(help_prop)

In [44]:
one_file.iloc[2].to_frame()

Unnamed: 0,810438
marketplace,US
customer_id,27204588
review_id,R136FE83BZEB3N
product_id,B00JA9EQMU
product_parent,966915237
product_title,"Kocaso HP-500 Headphones, Black"
product_category,Electronics
star_rating,5
helpful_votes,0
total_votes,0


In [45]:
def id_for_dictionary(dic):
    if len(dic) == 4:
        ind = list(dic.values()).index(max(list(dic.values())[0:-1])) #remove the compound
    else:
        ind = list(dic.values()).index(max(list(dic.values())))
        
    if ind == 1:
        return 0
    elif ind == 0:
        return -1
    else:
        return 1

In [46]:
def id_for_prop(prop):
    if prop < 0.45:
        return -1
    elif prop > 0.55:
        return 1
    else:
        return 0

In [47]:
one_file['rev_bod_id'] = one_file.get("rev_dict").apply(id_for_dictionary)

In [48]:
one_file['help_prop_id'] = one_file.get("help_prop").apply(id_for_prop)

find percentages for the body -- text and display those in a column -- act as labels for the percent positivity of the review body and header themselves

In [49]:
#imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'neg_prop', 'neu_prop', 'pos_prop', 'help_prop']]
imp_col = one_file[['verified_purchase', 'prod_title_comp', 'star_rating', 'rev_title_comp', 'rev_bod_id', 'help_prop_id']]

In [50]:
imp_col.dtypes

verified_purchase     object
prod_title_comp      float64
star_rating            int64
rev_title_comp       float64
rev_bod_id             int64
help_prop_id           int64
dtype: object

In [51]:
#need to replace review_body and review_headline with percentages for sentiment -- easier to use for classification
#vine needs to be converted to a yes/no binary column similar to how positively rated was determined
#use original dataset with rating 3 available

#X = imp_col.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]].values
X = imp_col.iloc[:, [1,2,3,4,5]].values #only taking in the categories that will be used as a dataframe
y = imp_col.iloc[:, 0].values

"""
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,1] = le.fit_transform(X[:,1])
"""

'\nfrom sklearn.preprocessing import LabelEncoder\nle = LabelEncoder()\nX[:,1] = le.fit_transform(X[:,1])\n'

In [52]:
X

array([[0.    , 5.    , 0.    , 0.    , 0.    ],
       [0.    , 4.    , 0.2023, 0.    , 0.    ],
       [0.    , 5.    , 0.5719, 1.    , 0.    ],
       ...,
       [0.    , 3.    , 0.    , 0.    , 0.    ],
       [0.    , 5.    , 0.4404, 1.    , 0.    ],
       [0.4215, 5.    , 0.    , 1.    , 0.    ]])

#Since our dataset containing character variables we have to encode it using LabelEncoder

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,2] = le.fit_transform(X[:,2])

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [54]:
# Next, we are doing feature scaling to the training and test set of independent variables for reducing the size to smaller values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [55]:
from sklearn.neighbors import KNeighborsClassifier

#we are using 
#5 neighborhood points are required for classifying a given point -- distance metric is using the minkonowski equation
knn_classifier = KNeighborsClassifier(n_neighbors = 20, metric = 'euclidean', p = 1)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=20, p=1)

In [56]:
y_pred = knn_classifier.predict(X_test)

In [57]:
#We can evaluate our model using the confusion matrix and accuracy score by comparing the predicted and actual test values

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [58]:
print(cm)

[[   23  9662]
 [   47 90262]]


In [59]:
print(ac)

0.902904174250455


In [60]:
#Can see the model performance and add more features accordingly -- 
#would be good if the performance is greater than 85%

### Test on a product review - Need to write a *function* for this for taking in the user input

The features that we are looking at!
* 'prod_title_comp', 
* 'star_rating', 
* 'rev_title_comp', 
* 'rev_bod_id', 
* 'help_prop_id'

In [61]:
review_body = "this is a good review"
product_title = "Sony Headphones"
review_title = 'Love the product!'

star_rating = 5
helpful_votes = 1
total_votes = 1

In [62]:
test = pd.DataFrame()
test['review_body'] = np.array([review_body])
test['review_title'] = np.array([review_title])
test['product_title'] = np.array([product_title])
test

Unnamed: 0,review_body,review_title,product_title
0,this is a good review,Love the product!,Sony Headphones


In [63]:
out = df_cleaning(test, 'review_body')
out = df_cleaning(out, 'review_title')
out = df_cleaning(out, 'product_title')
out

Unnamed: 0,review_body,review_title,product_title,new_review_body,new_review_title,new_product_title
0,this is a good review,Love the product!,Sony Headphones,good review,love product,sony headphone


In [64]:
out['review_body'][0]

'this is a good review'

def get_sentiment_proportions(review):
    """
    create new dataframe with just the proportions for each review
    four columns
    neg_prop, pos_prop, neu_prop, compound_prop and will contain these values
    obtained from the vator sentiment algorithm
    """
    snt = analyser.polarity_scores(review)
    #print(f"{sentence} {str(snt)}")
    neg = snt['neg']
    neu = snt['neu']
    pos = snt['pos']
    #compound = snt['compound']
    return neg, neu, pos

neg, neu, pos = get_sentiment_proportions(out.get("new_review_body").iloc[0])

product_category = convert_to_id(product_category)
product_title = only_compound(product_title)
rev_title = only_compound(review_title)

In [65]:
rev_bod_id = id_for_dictionary(analyser.polarity_scores(out['new_review_body'][0]))
help_prop_id = id_for_prop(helpful_votes / total_votes)
prod_title_comp = only_compound(out['new_review_title'][0])
rev_title_comp = only_compound(out['new_product_title'][0])

Predicted: 'verified_purchase'

User Input: 'prod_title_comp', 'product_category_convert', 'star_rating', 'helpful_votes', 'total_votes', 'neg_prop', 'neu_prop', 'pos_prop'
- 8 fields

In [66]:
rev_input_test = np.array([[prod_title_comp, star_rating, rev_title_comp, rev_bod_id, help_prop_id]])
rev_input_test

array([[0.6369, 5.    , 0.    , 1.    , 1.    ]])

In [67]:
prediction, probabilities = knn_classifier.predict(rev_input_test), knn_classifier.predict_proba(rev_input_test)[0]

In [68]:
prediction

array(['Y'], dtype=object)

In [69]:
probabilities

array([0.1, 0.9])

classifier?

In [70]:
def interpret_prediction(review, pred, proba):
    proba = [round(proba[0], 3), round(proba[1], 3)]
    if prediction[0] == 'Y':
        print(f'"{review}" is predicted to be a VERIFIED review, with {proba[1]*100}% probability of being VERIFIED and {proba[0]*100}% probability of being UNVERIFIED')
    if prediction[0] == 'N':
        print(f'"{review}" is predicted to be an UNVERIFIED review, with {proba[0]*100}% probability of being UNVERIFIED and {proba[1]*100}% probability of being VERIFIED')
        
interpret_prediction(review_test, prediction, probabilities)

NameError: name 'review_test' is not defined

In [None]:
from joblib import dump, load

In [None]:
knn_classifier

In [None]:
name = 'knn_working_model_2.joblib'
path = 'KNNModelFiles/'
dump(knn_classifier, path+name)

In [None]:
knn_classifier = load(path+name)