In [161]:
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
import numpy as np
import time
%matplotlib inline

In [2]:
snowball = SnowballStemmer('english')

In [9]:
df = pd.read_csv("s3://dogfaces/reviews/extract_breed_review.csv")

In [10]:
df.head()

Unnamed: 0,rating,review_id,review_time,toy_id,toy_name,user_name,breed_extract
0,5,185682550,"Sep 4, 2017",108574,mammoth-cottonblend-5-knot-dog-rope,Gunnerfavorite,
1,5,185591533,"Sep 1, 2017",108574,mammoth-cottonblend-5-knot-dog-rope,Lindsay,weimaraner
2,4,185522967,"Aug 29, 2017",108574,mammoth-cottonblend-5-knot-dog-rope,bcodpas3,"border terriers,border collie"
3,5,185487336,"Aug 28, 2017",108574,mammoth-cottonblend-5-knot-dog-rope,SickPup,
4,5,185484167,"Aug 28, 2017",108574,mammoth-cottonblend-5-knot-dog-rope,AussieMom,"australian cattle dogs,australian shepherd"


#### Score mechanism

input: a probability vector of dog breeds top 3:  
> toy -> breed score(averaged score for that breed)  
  return probability weighted review scores


In [11]:
# 52 base classes:
# source 2: classified dog names
breed_classes = pd.read_csv("s3://dogfaces/tensor_model/output_labels_20170907.txt",names=['breed'])
base_breeds = breed_classes['breed'].values

In [12]:
base_breeds

array(['rottweiler', 'bull mastiff', 'french bulldog', 'cairn',
       'yorkshire terrier', 'great dane', 'standard poodle', 'malinois',
       'wheaten terrier', 'collie', 'papillon', 'weimaraner', 'samoyed',
       'doberman', 'shetland sheepdog', 'whippet', 'american bulldog',
       'beagle', 'chihuahua', 'chesapeake bay retriever',
       'golden retriever', 'american pit bull terrier', 'english foxhound',
       'vizsla', 'great pyrenees', 'basset', 'miniature schnauzer',
       'siberian husky', 'pomeranian', 'english springer', 'pug',
       'bernese mountain dog', 'miniature poodle',
       'soft coated wheaten terrier', 'tibetan mastiff', 'havanese',
       'miniature pinscher', 'border collie', 'toy poodle', 'bloodhound',
       'german shorthaired', 'malamute', 'labrador retriever',
       'german shepherd', 'chow', 'scottish terrier', 'boxer', 'shih tzu',
       'basset hound', 'shiba inu', 'newfoundland', 'rhodesian ridgeback',
       'west highland white terrier'], dtype

In [13]:
with open('breed_lookup.pickle', 'rb') as handle:
    rev_to_breed = pickle.load(handle)
len(rev_to_breed)

604

In [14]:
with open('breed_dict.pickle', 'rb') as handle:
    breed_to_rev = pickle.load(handle)
len(breed_to_rev)

163

In [35]:
# sanity check
not_found = 0
for breed in base_breeds:
    if breed not in breed_to_rev:
        if snowball.stem(breed) in breed_to_rev:
            print "only need to stem "+breed
        elif snowball.stem(breed) in rev_to_breed:
            print "need to look up extened dict "+ breed +" : "+str(rev_to_breed[snowball.stem(breed)])
        else:
            print "not found " + breed
            not_found += 1
print not_found

0


#### Get each base breeds score

In [62]:
mini_set = df.sample(10).copy()
base_breeds_set = set(base_breeds)

In [63]:
# review_id, toy_id, breeds.....
def get_breed_score(df):
    score_df = []
    for idx, row in df.iterrows():
        score_row = {}
        score_row['review_id'] = row['review_id']
        score_row['toy_id'] = row['toy_id']
        score_row['rating'] = row['rating']
        try:
            breed_extract = row['breed_extract'].split(',')
            matched_item = {}
            
            for b in breed_extract:
                if b in base_breeds_set:
                    matched_item[b] = matched_item.get(b,0)+1
            max_p = max(matched_item.values())
            total_base = 0
            
            for k, v in matched_item.iteritems():
                if v== max_p:
                    total_base += 1
                   
            for k, v in matched_item.iteritems():
                if v == max_p:
                    score_row[k] = 1.0/total_base
        except:
            pass
        score_df.append(score_row) 
    return score_df

In [65]:
scored_lst = get_breed_score(df)

In [66]:
scored_df = pd.DataFrame(scored_lst)

In [71]:
scored_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61202 entries, 0 to 61201
Data columns (total 56 columns):
american bulldog               729 non-null float64
american pit bull terrier      2523 non-null float64
basset                         340 non-null float64
basset hound                   304 non-null float64
beagle                         336 non-null float64
bernese mountain dog           94 non-null float64
bloodhound                     35 non-null float64
border collie                  622 non-null float64
boxer                          844 non-null float64
bull mastiff                   46 non-null float64
cairn                          945 non-null float64
chesapeake bay retriever       502 non-null float64
chihuahua                      445 non-null float64
chow                           39 non-null float64
collie                         156 non-null float64
doberman                       510 non-null float64
english foxhound               443 non-null float64
english sp

In [72]:
scored_df.fillna(0, inplace=True)

In [73]:
scored_df.head()

Unnamed: 0,american bulldog,american pit bull terrier,basset,basset hound,beagle,bernese mountain dog,bloodhound,border collie,boxer,bull mastiff,...,standard poodle,tibetan mastiff,toy poodle,toy_id,vizsla,weimaraner,west highland white terrier,wheaten terrier,whippet,yorkshire terrier
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,108574,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,108574,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,108574,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,108574,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,108574,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
save_data = scored_df.to_csv(index=False)
s3_res = boto3.resource('s3')
s3_res.Bucket('dogfaces').put_object(Key='reviews/scored_breed_review.csv', Body=save_data)

NameError: name 'scored_df' is not defined

In [4]:
# sanity check
scored_df = pd.read_csv("s3://dogfaces/reviews/scored_breed_review.csv")

In [5]:
scored_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61202 entries, 0 to 61201
Data columns (total 56 columns):
american bulldog               61202 non-null float64
american pit bull terrier      61202 non-null float64
basset                         61202 non-null float64
basset hound                   61202 non-null float64
beagle                         61202 non-null float64
bernese mountain dog           61202 non-null float64
bloodhound                     61202 non-null float64
border collie                  61202 non-null float64
boxer                          61202 non-null float64
bull mastiff                   61202 non-null float64
cairn                          61202 non-null float64
chesapeake bay retriever       61202 non-null float64
chihuahua                      61202 non-null float64
chow                           61202 non-null float64
collie                         61202 non-null float64
doberman                       61202 non-null float64
english foxhound           

#### Model version 1: average

In [6]:
# calculating each toy's score
#df_scored = scored_df.copy()
df_scored = scored_df.copy()
df_scored.pop('review_id')
df_scored.pop('rating')
def non_zero_count(x):
    return np.sum(x[x>0])
df_breed_count = df_scored.groupby('toy_id').agg(non_zero_count).reset_index()
df_breed_count.head()

Unnamed: 0,toy_id,american bulldog,american pit bull terrier,basset,basset hound,beagle,bernese mountain dog,bloodhound,border collie,boxer,...,soft coated wheaten terrier,standard poodle,tibetan mastiff,toy poodle,vizsla,weimaraner,west highland white terrier,wheaten terrier,whippet,yorkshire terrier
0,38347,0.0,0.428571,0.5,0.5,0.0,0.0,0.0,2.0,2.0,...,0.428571,2.0,0.0,0.0,0.0,0.0,0.428571,0.428571,0.0,0.428571
1,38359,0.0,1.142857,0.0,0.0,0.5,0.0,0.0,3.0,2.0,...,0.142857,3.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,1.142857
2,38362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38365,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.5,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38371,2.0,17.087302,0.5,0.5,1.0,1.0,0.0,6.5,5.5,...,1.253968,4.333333,0.611111,0.333333,0.0,0.0,3.253968,1.253968,0.0,1.753968


In [7]:
breed_columns = [x for x in scored_df.columns if x not in ['toy_id', 'rating', 'review_id']]
mat_scored2 = scored_df[breed_columns].copy().values

In [9]:
mat_scored2 = scored_df['rating'].values.reshape((61202,1))*mat_scored2

In [11]:
df_scored_sum = pd.DataFrame(data=mat_scored2, columns=breed_columns)
df_scored_sum = pd.concat([scored_df['toy_id'].copy(), df_scored_sum], axis=1)

In [15]:
df_breed_wet_sum = df_scored_sum.groupby('toy_id').sum().reset_index()
df_breed_wet_sum.head()

Unnamed: 0,toy_id,american bulldog,american pit bull terrier,basset,basset hound,beagle,bernese mountain dog,bloodhound,border collie,boxer,...,soft coated wheaten terrier,standard poodle,tibetan mastiff,toy poodle,vizsla,weimaraner,west highland white terrier,wheaten terrier,whippet,yorkshire terrier
0,38347,0.0,2.142857,2.5,2.5,0.0,0.0,0.0,10.0,10.0,...,2.142857,9.0,0.0,0.0,0.0,0.0,2.142857,2.142857,0.0,2.142857
1,38359,0.0,5.714286,0.0,0.0,2.5,0.0,0.0,10.0,10.0,...,0.714286,15.0,0.0,0.0,0.0,0.0,0.714286,0.714286,0.0,5.714286
2,38362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38365,0.0,5.0,0.0,0.0,0.0,5.0,0.0,12.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38371,10.0,84.936508,2.5,2.5,5.0,5.0,0.0,32.5,27.5,...,6.269841,21.666667,3.055556,1.666667,0.0,0.0,16.269841,6.269841,0.0,8.769841


In [20]:
df_breed_wet_sum.sort_values(by='toy_id', axis=0, inplace=True)
df_breed_count.sort_values(by='toy_id', axis=0, inplace=True)

In [22]:
weighted_mat = df_breed_count[breed_columns].values
weighted_sum = df_breed_wet_sum[breed_columns].values
with np.errstate(divide='ignore', invalid='ignore'):
    res_mat = np.true_divide(weighted_sum, weighted_mat)
    res_mat[res_mat==np.inf]=0
    res_mat = np.nan_to_num(res_mat)

In [25]:
df_scored_finalscore = pd.DataFrame(data=res_mat, columns=breed_columns)
df_scored_finalscore = pd.concat([df_breed_count['toy_id'].copy(), df_scored_finalscore], axis=1)

In [26]:
df_scored_finalscore.head()

Unnamed: 0,toy_id,american bulldog,american pit bull terrier,basset,basset hound,beagle,bernese mountain dog,bloodhound,border collie,boxer,...,soft coated wheaten terrier,standard poodle,tibetan mastiff,toy poodle,vizsla,weimaraner,west highland white terrier,wheaten terrier,whippet,yorkshire terrier
0,38347,0.0,5.0,5.0,5.0,0.0,0.0,0.0,5.0,5.0,...,5.0,4.5,0.0,0.0,0.0,0.0,5.0,5.0,0.0,5.0
1,38359,0.0,5.0,0.0,0.0,5.0,0.0,0.0,3.333333,5.0,...,5.0,5.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,5.0
2,38362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38365,0.0,5.0,0.0,0.0,0.0,5.0,0.0,4.8,0.0,...,0.0,3.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38371,5.0,4.970739,5.0,5.0,5.0,5.0,0.0,5.0,5.0,...,5.0,5.0,5.0,5.0,0.0,0.0,5.0,5.0,0.0,5.0


In [31]:
df_toy = pd.read_csv("s3://dogfaces/reviews/toys.csv")
df_toy.head(3)

Unnamed: 0,cat_id,num_reviews,picture_link,price,toy_id,toy_link,toy_name
0,1,800,https://img.chewy.com/is/catalog/62758_MAIN._A...,$1.19,47728,https://www.chewy.com/kong-airdog-squeakair-ba...,kong-airdog-squeakair-ball-dog-toy
1,1,127,https://img.chewy.com/is/catalog/80753._AC_SS1...,$2.99,108582,https://www.chewy.com/mammoth-monkey-fist-bar-...,mammoth-monkey-fist-bar-dog-toy
2,1,292,https://img.chewy.com/is/catalog/62850._AC_SS1...,$3.39,47880,https://www.chewy.com/kong-squeakair-birthday-...,kong-squeakair-birthday-balls-dog


In [None]:
# make recommendations:
def getRecommendations(probs, score_df, toy_df, k, add_info=None):
    # probs is a dictionary
    keys = probs.keys()
    D = score_df.shape[1]-1
    prob_v = np.array(probs.values()).reshape((D,1))
    score_mat = score_df[keys].values
    fscore_mat = score_mat.dot(prob_v)
    top_ind = np.argsort(-fscore_mat[:,0])[:k]
    top_toy = score_df['toy_id'].values[top_ind]
    likely_ratings = pd.DataFrame({"likely rating":fscore_mat[:,0][top_ind]}, index=None)
    if not add_info:
        toy_info = toy_df[toy_df['toy_id'].isin(top_toy)][['toy_id','toy_name','price']].copy()
    else:
        add_info.extend(['toy_id','toy_name','price'])
        toy_info = toy_df[toy_df['toy_id'].isin(top_toy)][add_info].copy()
    return pd.concat([toy_info.reset_index(), likely_ratings], axis=1)
def getRecommendedToys():
    pass
def getToyDislie():
    pass

In [163]:
# get recommendations
for i in xrange(53):
    probs = [0]*53
    ind = i#np.random.randint(53)
    probs[ind]=1
    print breed_columns[ind]
    test_input = dict(zip(breed_columns, probs))
    print getRecommendations(test_input,df_scored_finalscore, df_toy, 3, ['toy_link'] )
    time.sleep(2)

american bulldog
   index                                           toy_link  toy_id  \
0     17  https://www.chewy.com/gnawsome-squeaker-footba...  156822   
1    193  https://www.chewy.com/chuckit-fumble-fetch-sma...   49740   
2    266  https://www.chewy.com/chuckit-fetch-ball-dog-t...   49747   

                             toy_name   price  likely rating  
0  gnawsome-squeaker-football-dog-toy   $2.99            5.0  
1          chuckit-fumble-fetch-small  $17.36            5.0  
2    chuckit-fetch-ball-dog-toy-color   $7.95            5.0  
american pit bull terrier
   index                                           toy_link  toy_id  \
0    209  https://www.chewy.com/jolly-pets-jolly-jumper-...  102390   
1    264  https://www.chewy.com/jw-pet-treat-puzzler-bal...   55977   
2    321  https://www.chewy.com/multipet-boingo-ball-dog...   52952   

                               toy_name  price  likely rating  
0  jolly-pets-jolly-jumper-ball-dog-toy  $6.59            5.0  
1     j

In [164]:
save_data = df_scored_finalscore.to_csv(index=False)
s3_res = boto3.resource('s3')
s3_res.Bucket('dogfaces').put_object(Key='reviews/scored_breed_toy.csv', Body=save_data)

s3.Object(bucket_name='dogfaces', key='reviews/scored_breed_toy.csv')