In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import ast
import pickle
import os

from helper import one_hot

In [2]:
# List the current working directory to confirm the pickled input files are present.
os.listdir()

['api.ipynb',
 'Supervised Learning.ipynb',
 'Recommender_prep.ipynb',
 '__pycache__',
 'possible_ids.p',
 'Exploration.ipynb',
 'cleaned_data.p',
 'helper.py',
 '.ipynb_checkpoints',
 'recommender_data.p',
 'Clustering.ipynb']

In [3]:
# Load the two pickled inputs. Context managers guarantee the file handles are
# closed (the bare open(...) calls left them open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles produced by a trusted source.
with open('cleaned_data.p', 'rb') as file:
    case = pickle.load(file)
with open('possible_ids.p', 'rb') as file:
    possible_IDs = pickle.load(file)

In [4]:
# Pull the product_ids column out as a plain Python list.
# Each entry is a string that is parsed into a list in the following cell.
list_of_ids = case['product_ids'].tolist()

In [5]:
# Parse each stored string (e.g. "[7881, 10834]") into a real list.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute arbitrary code hidden in the data.
prod_ids_list = [ast.literal_eval(x) for x in list_of_ids]

In [6]:
# Inspect the candidate product IDs loaded from possible_ids.p.
possible_IDs

[7881, 10834, 61603, 62040, 63288, 125830]

In [7]:
# Inspect the parsed product-ID lists (one list per entry of list_of_ids).
# NOTE(review): this displays the entire list; a slice such as prod_ids_list[:5]
# would keep the recorded output short.
prod_ids_list

[[7881, 10834, 61603, 62040, 63288, 125830],
 [10834, 61603, 62040, 63288, 125830],
 [63288, 125830],
 [7881, 10834],
 [7881, 10834, 62040, 63288, 125830],
 [7881, 10834, 61603],
 [10834, 61603, 62040, 63288, 125830],
 [10834, 61603, 62040, 63288, 125830],
 [63288, 125830],
 [7881, 10834],
 [7881, 10834, 61603, 62040, 63288, 125830],
 [7881, 10834, 61603],
 [10834, 61603, 62040, 63288, 125830],
 [63288, 125830],
 [7881, 10834],
 [7881, 10834, 62040, 63288, 125830],
 [7881, 10834, 61603],
 [63288, 125830],
 [7881, 10834],
 [7881, 10834, 62040, 63288, 125830],
 [7881, 10834, 61603],
 [7881, 10834, 61603, 62040, 63288, 125830],
 [7881, 10834, 61603, 62040, 63288, 125830],
 [10834, 61603, 62040, 63288, 125830],
 [63288, 125830],
 [7881, 10834],
 [10834, 61603, 62040, 63288, 125830],
 [10834, 61603, 62040, 63288, 125830],
 [63288, 125830],
 [7881, 10834],
 [7881, 10834, 62040, 63288, 125830],
 [7881, 10834, 61603],
 [7881, 10834, 61603, 62040, 63288, 125830],
 [10834, 61603, 62040, 63288, 1

In [8]:
# Encode the product-ID lists with the project helper one_hot (from helper.py).
# NOTE(review): the helper is not shown here — judging by the displayed result it
# returns a 0/1 DataFrame with one row per list and one column per ID in
# possible_IDs; confirm against helper.py.
id_df = one_hot(prod_ids_list, possible_IDs)

In [9]:
# Show the candidate IDs again for comparison with the columns of id_df.
possible_IDs

[7881, 10834, 61603, 62040, 63288, 125830]

In [10]:
# Preview the encoded frame returned by one_hot.
id_df

Unnamed: 0,7881,10834,61603,62040,63288,125830
0,1,1,1,1,1,1
1,0,1,1,1,1,1
2,0,0,0,0,1,1
3,1,1,0,0,0,0
4,1,1,0,1,1,1
...,...,...,...,...,...,...
95,0,0,0,0,1,1
96,0,0,0,0,1,1
97,1,1,0,0,0,0
98,1,1,0,1,1,1


In [48]:
# Turn each row vector into a unit vector (Euclidean length 1).
magnitude = np.sqrt(np.square(id_df).sum(axis=1))

# A row with no purchased items has magnitude 0; dividing by it would give
# 0/0 = NaN, which then propagates into the similarity matrix. Dividing by 1
# instead leaves such a row as all zeros.
magnitude = magnitude.replace(0, 1)

id_df = id_df.divide(magnitude, axis='index')

id_df

Unnamed: 0,7881,10834,61603,62040,63288,125830
0,0.408248,0.408248,0.408248,0.408248,0.408248,0.408248
1,0.000000,0.447214,0.447214,0.447214,0.447214,0.447214
2,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107
3,0.707107,0.707107,0.000000,0.000000,0.000000,0.000000
4,0.447214,0.447214,0.000000,0.447214,0.447214,0.447214
...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107
96,0.000000,0.000000,0.000000,0.000000,0.707107,0.707107
97,0.707107,0.707107,0.000000,0.000000,0.000000,0.000000
98,0.447214,0.447214,0.000000,0.447214,0.447214,0.447214


### Find the similarity of items based on purchase habits

$\mathrm{SIMILARITY}(A,B) = \frac{\mathbf{A}\cdot \mathbf{B}}{\|\mathbf{A}\| \, \|\mathbf{B}\|}$

In [253]:
# Item-item cosine similarity: the 100x6 matrix is transposed into a 6x100 matrix so
# that every item becomes one vector, and each pair of item vectors is compared via
# their dot product normalized by the product of their magnitudes.
# The result is a 6x6 similarity matrix. Cosine similarity ranges from -1 to 1 in
# general (1 = same direction, -1 = opposite direction, 0 = orthogonal); for
# non-negative inputs such as purchase indicators it stays within [0, 1].

def calculate_similarity(id_df):
    """Calculate the column-wise cosine similarity.

    Implemented with numpy so the cell does not depend on a name
    (``cosine_similarity``) that is never imported in this notebook.

    Parameters
    ----------
    id_df : pandas.DataFrame
        Numeric frame; every column is treated as one vector.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are ``id_df.columns``; entry
        (a, b) is the cosine similarity between columns a and b. A column
        that is entirely zero has similarity 0 with every column, itself
        included.
    """
    # One row per original column.
    vectors = id_df.to_numpy(dtype=float).T
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Guard against 0/0 for all-zero columns: dividing by 1 keeps them at zero.
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms
    # Dot products of unit vectors are exactly the pairwise cosines.
    similarities = unit_vectors @ unit_vectors.T
    sim = pd.DataFrame(data=similarities, index=id_df.columns, columns=id_df.columns)
    return sim

In [255]:
# Build the item-item similarity matrix from the columns of id_df.
data_matrix = calculate_similarity(id_df)

In [292]:
# Inspect the similarity matrix (rows and columns are both product IDs).
data_matrix

Unnamed: 0,7881,10834,61603,62040,63288,125830
7881,1.0,0.917914,0.563749,0.420621,0.299054,0.299054
10834,0.917914,1.0,0.732423,0.634297,0.450973,0.450973
61603,0.563749,0.732423,1.0,0.621279,0.441718,0.441718
62040,0.420621,0.634297,0.621279,1.0,0.710981,0.710981
63288,0.299054,0.450973,0.441718,0.710981,1.0,1.0
125830,0.299054,0.450973,0.441718,0.710981,1.0,1.0


In [291]:
# Save the recommender dataframe
# Persist the similarity matrix to disk as a pickle.
with open('recommender_data.p', mode='wb') as out_file:
    pickle.dump(data_matrix, out_file)

### Case where no items have been purchased: fall back to the most popular items

In [27]:
# Popularity score per product: the share of rows in which the product appears.
# Non-zero entries are counted (rather than values summed) so the score is the same
# whether id_df still holds the raw 0/1 indicators or has already been scaled to
# unit-length rows. The recorded execution counts show this cell originally ran
# before the normalization cell, so it would otherwise change on a top-to-bottom re-run.
# len(id_df) replaces the hard-coded row count of 100.
popular_items = (
    id_df.gt(0)
    .sum(axis=0)
    .div(len(id_df))
    .reset_index()
    .rename(columns={'index': 'product_id', 0: 'score'})
)

# Highest score first, exported as a plain dict for pickling.
popular_items_dict = (
    popular_items.sort_values(by='score', ascending=False)
    .reset_index(drop=True)
    .to_dict()
)

In [28]:
# Persist the popularity ranking to disk as a pickle.
with open('popular_items.p', mode='wb') as out_file:
    pickle.dump(popular_items_dict, out_file)