In [1]:
#import basic Libraries
import pandas as pd
import numpy as np
import random

#Importing Sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity 

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [3]:
#import of all files from the data folder.
aisles = pd.read_csv('./data/aisles.csv')
departments = pd.read_csv('./data/departments.csv')
products = pd.read_csv('./data/products.csv')
order_products__prior = pd.read_csv('./data/order_products__prior.csv')
order_products__train = pd.read_csv('./data/order_products__train.csv')
orders = pd.read_csv('./data/orders.csv')

In [4]:
opt = order_products__train.merge(products,how='left', on='product_id')
opt = opt.merge(departments,how='left', on='department_id')
opt = opt.merge(aisles,how='left', on='aisle_id')

# Recommender system

In [5]:
reorders = opt[opt['reordered'] == 1]

In [6]:
reorders['product_id'] = reorders['product_id'].astype('int64')

In [7]:
# get list of hi volume products (products that occurr more than 1 time)
hivol = reorders.copy()['product_id'].value_counts().sort_values(ascending=False)\
    [reorders.copy()['product_id'].value_counts().sort_values(ascending=False) > 1].index.tolist()

In [8]:
reorders = reorders[reorders['product_id'].isin(hivol)]

In [9]:
#filters the High demand items greater than.
reorders['hi_dem'] = (reorders.copy()['product_id'].value_counts().sort_values(ascending=False)>1)

In [11]:
hidem_ord = reorders[reorders['hi_dem'] == True]

In [12]:
user_orders = reorders.merge(orders)

In [13]:
user_orders['hi_dem'] = (user_orders.copy()['product_id'].value_counts().sort_values(ascending=False)>1)

In [14]:
hidem_ord = user_orders[user_orders['hi_dem'] == True]

In [19]:
#return the total items
users = hidem_ord.groupby(['user_id','product_name']).size().sort_values(ascending=False).unstack().fillna(0)

In [21]:
#creates a similiarity by users.
users_sim = pd.DataFrame(cosine_similarity(users),index=users.index,columns=users.index)

In [23]:
def next_prod(df, num_col):
    return df[df.columns[num_col]].drop(df.columns[num_col]).sort_values(ascending=False).head()

In [27]:
#returns similar users to this one.
pd.DataFrame(next_prod(users_sim, 56)).T

user_id,43254,48962,10453,202555,166997
1711,0.5,0.353553,0.188982,0.166667,0.158114


# Recommendations for Products by User ID

In [28]:
#return the total items in the basket from the aisles
products = hidem_ord.groupby(['product_name','user_id']).size().sort_values(ascending=False).unstack().fillna(0)

In [29]:
#creates a similiarity by users.
products_sim = pd.DataFrame(cosine_similarity(products),index=products.index,columns=products.index)

In [61]:
products.head()

user_id,55,56,66,79,110,215,223,249,283,285,...,206023,206040,206082,206111,206136,206140,206158,206162,206177,206186
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0 Calorie Fuji Apple Pear Water Beverage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0% Fat Free Organic Milk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0% Fat Organic Greek Vanilla Yogurt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0% Fat Strawberry Greek Yogurt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0% Greek Strained Yogurt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
#gives a recommendation for the last product added_to_cart
pd.DataFrame(next_prod(products_sim, 56)).T

product_name,Organic Maitake Mushrooms,Ruby Red Grapefruit Juice,Organic Heavy Cream,Organic Classic Rich Crackers,Thyme
100% Pomegranate Juice,0.5,0.353553,0.353553,0.288675,0.288675


In [56]:
profile = pd.Series(np.zeros(len(products.columns.tolist())),users_sim[55])

In [65]:
recommendations = np.dot(products.values,users_sim[56])

In [66]:
recommendations = pd.Series(recommendations, index=products.index)

In [67]:
recommendations.sort_values(ascending=False).head()

product_name
Large Lemon             48.067533
Organic Blackberries    14.086878
Banana                   6.675101
Organic Avocado          3.983752
Organic Baby Spinach     3.623921
dtype: float64

# Recommendations by Products

In [44]:
#return the total items
ords = hidem_ord.groupby(['order_id','product_name']).size().sort_values(ascending=False).unstack().fillna(0)

In [45]:
#creates a similiarity by users.
ords_sim = pd.DataFrame(cosine_similarity(ords),index=ords.index,columns=ords.index)

In [63]:
#displays which users are similar to the current user.
pd.DataFrame(next_prod(users_sim,56)).T

user_id,43254,48962,10453,202555,166997
1711,0.5,0.353553,0.188982,0.166667,0.158114


# Product Recommender by Order ID

In [52]:
baskets = hidem_ord.groupby(['product_name','order_id']).size().sort_values(ascending=False).unstack().fillna(0)

In [53]:
basket_sim = pd.DataFrame(cosine_similarity(baskets),columns=baskets.index,index=baskets.index)

In [68]:
basket_sim['Zucchini Squash'].sort_values(ascending=False).head(10)[1:]

product_name
Chocolate Cheerios Cereal                      0.577350
Organic Green Peas                             0.408248
Quick & Easy Steel Cut Irish Oatmeal           0.408248
Small Size Flour Tortillas                     0.333333
Raspberry on the Bottom Nonfat Greek Yogurt    0.235702
Sweet Onions                                   0.218218
Flat Parsley, Bunch                            0.174078
Green Beans                                    0.149071
Organic Kiwi                                   0.107211
Name: Zucchini Squash, dtype: float64

In [69]:
def Recommender_System(user_id):
    
    '''
    Input User Id to see recommendations for the User
    '''
    
    u = hidem_ord.groupby(['user_id','product_name']).size().sort_values(ascending=False).unstack().fillna(0)
    u_sim = pd.DataFrame(cosine_similarity(u),index=u.index,columns=u.index)

    p = hidem_ord.groupby(['product_name','user_id']).size().sort_values(ascending=False).unstack().fillna(0)
    products_sim = pd.DataFrame(cosine_similarity(p),index=p.index,columns=p.index)
    profile = pd.Series(np.zeros(len(u_sim.columns.tolist())),u_sim[user_id])
    
    recommendations = pd.Series(np.dot(p.values,users_sim[user_id]), index=p.index)
    return recommendations.sort_values(ascending=False).head(), 
    

In [70]:
#gives a random user ID to for input to the recommender system
random.sample(hidem_ord['user_id'].tolist(),1)

[57996]

In [71]:
Recommender_System(56)

(product_name
 Large Lemon             48.067533
 Organic Blackberries    14.086878
 Banana                   6.675101
 Organic Avocado          3.983752
 Organic Baby Spinach     3.623921
 dtype: float64,)