# Building a Recommendation System


## I would be trying these below methods for this dataset

1. Content Based Filtering
2. Collaberative Filtering
3. Hybrid Filtering

## Collaberative Based Filtering 
Recommends a product using User preference

In [4]:
# Importing Packages
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import sklearn.metrics.pairwise as pw

In [137]:
# Data Loading 
orders = pd.read_csv('orders_data.csv')
# Data Loading 
orders = orders[orders['eval_set']=='train']

## Selecting only 70 departments data since I cannot run for entire dataset

In [138]:
departments = pd.DataFrame(orders['aisle-department'].value_counts().sort_values(ascending=True)[:70])
departments = departments.reset_index()
departments.columns = ['aisle-department','count']
departments.head(5)

Unnamed: 0,aisle-department,count
0,beauty personal care,287
1,frozenjuice frozen,294
2,babyaccessories babies,306
3,babybathbodycare babies,328
4,kitchensupplies household,448


In [139]:
# removing data if the qty is less than 5
orders = orders[orders['add_to_cart_order']>=5]

In [140]:
# Filtering the data only for the above 70 departments

orders = pd.merge(orders,departments,on = 'aisle-department' )

In [141]:
len(orders)

85766

In [143]:
orders.tail(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,aisle-department,count
85761,841404,203566,train,34,5,17,15.0,49544.0,6.0,1.0,Cabernet Sauvignon Wine,28.0,5.0,redwines,alcohol,redwines alcohol,1243
85762,720314,204181,train,13,2,17,5.0,12013.0,6.0,1.0,Pinot Noir,28.0,5.0,redwines,alcohol,redwines alcohol,1243
85763,720314,204181,train,13,2,17,5.0,36419.0,5.0,1.0,Cashmere,28.0,5.0,redwines,alcohol,redwines alcohol,1243
85764,2599916,204209,train,10,5,11,17.0,21948.0,15.0,1.0,Pinot Noir Wine,28.0,5.0,redwines,alcohol,redwines alcohol,1243
85765,2223923,205403,train,7,2,14,9.0,7891.0,7.0,0.0,Pinot Noir California,28.0,5.0,redwines,alcohol,redwines alcohol,1243


In [148]:
# Reordered column has value 0 or 1, so I am updating the value to 1(no re-ordered) or 2(reordered) 
# in order to give weightage for the product that got reordered
orders['reordered'] = np.where(orders['reordered'].isnull(),0,orders['reordered'])
orders['reordered_upd'] = np.where(orders['reordered']== 0,1,2)

In [149]:
orders['reordered_upd'].value_counts()

1    47721
2    38045
Name: reordered_upd, dtype: int64

In [150]:
# I am combining the order_qty and user's reordering preference 
# to calculate the orders (orders of product that got reordered is doubled)

orders['orders'] = orders['add_to_cart_order'] * orders['reordered_upd']

### Creating sparse matrix for User and product orders

In [151]:
user_products = orders.pivot_table(index='user_id',columns= 'product_name', aggfunc={'orders':'sum'} , fill_value=0)


In [152]:
sparse_pivot = sparse.csr_matrix(user_products)

In [153]:
user_product_recommender = pw.cosine_similarity(sparse_pivot)

recommender_df = pd.DataFrame(user_product_recommender, 
                                  columns=user_products.index.values,
                                  index=user_products.index.values)

In [154]:
# Matrix of User_id

recommender_df.head()

Unnamed: 0,2,5,8,14,18,21,23,29,38,41,...,206171,206174,206181,206186,206188,206191,206195,206196,206200,206205
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
selected_user_id = 204181                 # 204181
selected_product_name = 'Pinot Noir Wine'    # Pinot Noir Wine

### Similar Users of similar products

In [159]:
cosine_df = pd.DataFrame(recommender_df[selected_user_id].sort_values(ascending=False))
cosine_df.reset_index(level=0, inplace=True)
cosine_df.columns = ['user_id','cosine_sim']
cosine_df[:6]

Unnamed: 0,user_id,cosine_sim
0,204181,1.0
1,159127,0.768221
2,132696,0.768221
3,191650,0.768221
4,191809,0.768221
5,26797,0.768221


In [160]:
orders[orders['user_id']==159127].T

Unnamed: 0,85632
order_id,2644769
user_id,159127
eval_set,train
order_number,4
order_dow,4
order_hour_of_day,9
days_since_prior_order,30
product_id,12013
add_to_cart_order,5
reordered,0


### Finding recommended products

In [168]:
similar_usr = list(cosine_df['user_id'][:5].values)
## Comparing reviews with similar users
similar_usr_df = user_products.T[[selected_user_id] + similar_usr].fillna(0)
similar_usr_df['mean_rev'] = similar_usr_df[similar_usr].mean(numeric_only=True,axis=1)
similar_usr_df.sort_values('mean_rev', ascending=False,inplace = True)

In [164]:
similar_usr

[204181, 159127, 132696, 191650, 191809]

In [None]:
For the selected product 'Pinot Noir' and for the user_id '204181', below are the recommendations 

In [167]:
similar_usr_df[:5]

Unnamed: 0_level_0,user_id,204181,204181,159127,132696,191650,191809,mean_rev
Unnamed: 0_level_1,product_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orders,Pinot Noir,12,12,5,12,16,10,16
orders,Cashmere,10,10,0,0,0,0,10
orders,#2 Mechanical Pencils,0,0,0,0,0,0,0
orders,Original Ultra Concentrated Dish Soap,0,0,0,0,0,0,0
orders,Original Ultra Concentrated Ultra Concentrated,0,0,0,0,0,0,0
