In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import pickle as pkl

<font color="red">  
    <h2>1. Load the train and test datasets</h2>
</font>  

In [3]:
def load_dataset():
    """
    This function will be used to load the train and
    test datasets.
    """

    train_df = pd.read_csv('data/prepared_datasets_for_training_and_evaluation/reco_assignment_training_merged_duplicates.csv', parse_dates=['Tran_dt'])
    test_df = pd.read_csv('data/prepared_datasets_for_training_and_evaluation/reco_assignment_holdout_merged_duplicates.csv', parse_dates=['Tran_dt'])
    
    return train_df, test_df

#X_test_holdout is a holdout dataset, shall be only used for final evaluation and should not be part of the training methodologies
X_train, X_test_holdout = load_dataset()

In [4]:
#Display top 5 rows in X_train
X_train.head()

Unnamed: 0,Customer_num,Tran_dt,Product_num,Total_Tran_qty
0,C_203152,2022-01-02,P_3365,0.135
1,C_1607154,2022-01-02,P_58,1.0
2,C_84518,2022-01-02,P_21295,3.0
3,C_2553711,2022-01-02,P_20576,1.0
4,C_2376922,2022-01-02,P_1095,2.0


In [5]:
#Display top 5 rows in X_test_holdout
X_test_holdout.head()

Unnamed: 0,Customer_num,Tran_dt,Product_num,Total_Tran_qty
0,C_1967485,2022-01-02,P_4321,1.0
1,C_62367,2022-01-02,P_11092,1.0
2,C_2459818,2022-01-02,P_12854,2.0
3,C_851377,2022-01-02,P_9160,1.0
4,C_2351826,2022-01-02,P_17207,1.0


<font color="red">  
    <h1>2. Generate Product-Rankings</h1>
</font>  

# 2-(A). Get top selling products based on purchase history across all products


In [6]:
#Find the no of units solds for each product
top_selling_items_df = X_train.groupby('Product_num').agg({'Total_Tran_qty':'sum'})

#Reset the index by converting the Product_num into a column
top_selling_items_df.reset_index(inplace=True)

#Rank the product by most Total_Tran_qty purchased
top_selling_items_df['Top_Selling_Rank'] = top_selling_items_df['Total_Tran_qty'].rank(method='min',ascending=False).astype(int)
top_selling_items_df.head()

Unnamed: 0,Product_num,Total_Tran_qty,Top_Selling_Rank
0,P_10,1.0,14341
1,P_1000,40.134,2526
2,P_10002,1.38,14316
3,P_10007,3.0,10955
4,P_10008,62.0,1738


In [7]:
#List the top 5 products purchased (Can be used for Cold Start)
top_selling_items_df.sort_values('Total_Tran_qty',ascending=False).head(5)

Unnamed: 0,Product_num,Total_Tran_qty,Top_Selling_Rank
14415,P_4822,3388.715,1
6049,P_2342,2657.0,2
6048,P_2341,2629.0,3
3350,P_16717,2538.0,4
1391,P_12777,2207.96,5


# 2-(B). Get most popular products based on purchase history across all products


In [8]:
#Find the no of orders placed and the unique no of customers placed orders, of each product
most_popular_items_df = X_train.groupby('Product_num').agg({'Tran_dt':'count', 'Customer_num':'nunique'})
most_popular_items_df.columns=['No_of_Orders','No_of_Customers']

#Reset the index by converting the Product into a column
most_popular_items_df.reset_index(inplace=True)

#Products with high no of orders can be considered as most frequently purchased items
#To find the most popular items, we ill include the no of customers purchased and provide  more weightage to products purchased by more customers

#Weighted No_of_Orders (W) = O * (C / M)
#O = No_of_Orders
#C = No_of_Customers purchased the product
#M = Maximum no of customers made transactions in the entire period

O = most_popular_items_df['No_of_Orders']
C = most_popular_items_df['No_of_Customers']
M = most_popular_items_df['No_of_Customers'].max()

most_popular_items_df['Weighted_No_of_Orders'] = O * (C / M)

#Ranking the product by weighted no of orders
most_popular_items_df['Popularity_Rank'] = most_popular_items_df['Weighted_No_of_Orders'].rank(method='min',ascending=False).astype(int)
most_popular_items_df.head()

Unnamed: 0,Product_num,No_of_Orders,No_of_Customers,Weighted_No_of_Orders,Popularity_Rank
0,P_10,1,1,0.000623,13936
1,P_1000,209,195,25.392523,177
2,P_10002,2,2,0.002492,11821
3,P_10007,3,3,0.005607,10358
4,P_10008,8,8,0.039875,6589


In [9]:
#List of top 5 most popular products purchased
most_popular_items_df.sort_values('Popularity_Rank',ascending=True).head(5)

Unnamed: 0,Product_num,No_of_Orders,No_of_Customers,Weighted_No_of_Orders,Popularity_Rank
3350,P_16717,2077,1605,2077.0,1
16968,P_9168,2207,1237,1700.97134,2
16214,P_7819,1777,1302,1441.528972,3
1391,P_12777,1633,1167,1187.358879,4
2290,P_14546,1371,1039,887.519626,5


# 2-(C). Merge all the Ranks


In [10]:
#Merge Top Selling Items Rank and Popularity Rank dataframes
product_rankings_df = pd.merge(top_selling_items_df, most_popular_items_df, how='inner', on='Product_num')

# Get only the Product, Price and Rank columns
product_rankings_df = product_rankings_df[['Product_num','Top_Selling_Rank','Popularity_Rank']]

product_rankings_df.head()

Unnamed: 0,Product_num,Top_Selling_Rank,Popularity_Rank
0,P_10,14341,13936
1,P_1000,2526,177
2,P_10002,14316,11821
3,P_10007,10955,10358
4,P_10008,1738,6589


In [11]:
# List the top Product Rankings
product_rankings_df.sort_values('Popularity_Rank',ascending=True).head(5)

Unnamed: 0,Product_num,Top_Selling_Rank,Popularity_Rank
3350,P_16717,4,1
16968,P_9168,14,2
16214,P_7819,9,3
1391,P_12777,5,4
2290,P_14546,13,5


# 2-(D). Save the results


In [12]:
#Save the Product Rankings into a .csv file
product_rankings_df.to_csv('models/Product-Rankings.csv',index=False)

#Create a pickle file with the Product Rankings dataframe
pkl.dump(product_rankings_df, open('models/prod_ranking_model.pkl','wb'))

<font color="red">  
    <h1>3. Generate Customer-Product-Rankings</h1>
</font>  


# 3-(A). Products a Customer purchased the most

In [13]:
# Find the no of units sold of each product by customer
top_sell_cust_items_df = X_train.groupby(['Customer_num','Product_num']).agg({'Total_Tran_qty':'sum'})

#Reset the index by converting the Customer_num and Product_num into a column
top_sell_cust_items_df.reset_index(inplace=True)

#Rank the product by most Qty sold, at Customer level
customer_col = top_sell_cust_items_df['Customer_num']
qty_col = top_sell_cust_items_df['Total_Tran_qty'].astype(str)
top_sell_cust_items_df['Top_Selling_Rank'] = (customer_col + qty_col).rank(method='min',ascending=False).astype(int)

top_sell_cust_items_df.head()

Unnamed: 0,Customer_num,Product_num,Total_Tran_qty,Top_Selling_Rank
0,C_100082,P_10088,1.0,303994
1,C_100082,P_12661,1.0,303994
2,C_100082,P_12663,1.0,303994
3,C_100082,P_12932,1.0,303994
4,C_100082,P_13610,2.0,303991


In [14]:
# List the top 5 items purchased
top_sell_cust_items_df.sort_values('Top_Selling_Rank',ascending=True).head(5)

Unnamed: 0,Customer_num,Product_num,Total_Tran_qty,Top_Selling_Rank
304000,C_999487,P_7974,6.0,1
303928,C_999487,P_11914,4.0,2
303987,C_999487,P_456,4.0,2
303930,C_999487,P_12806,4.0,2
303993,C_999487,P_5930,4.0,2


# 3-(B). Products a Customer frequently purchased

Products with high no of orders are considered as most frequently purchased items

In [15]:
#Find the no of orders placed and the unique no of customers placed orders, of each product
freq_items_df = X_train.groupby(['Customer_num','Product_num']).agg({'Tran_dt':'count'})
freq_items_df.columns=['No_of_Orders']

# Reset the index by converting the Customer_num and Product_num into columns
freq_items_df.reset_index(inplace=True)

# Rank the product by No of Orders, at Customer Level
customer_col = freq_items_df['Customer_num']
ord_count_col = freq_items_df['No_of_Orders'].astype(str)
freq_items_df['Popularity_Rank'] = (customer_col + ord_count_col).rank(method='min',ascending=False).astype(int)

In [16]:
#List of top 5 most frequently purchased products
freq_items_df.sort_values('Popularity_Rank',ascending=True).head(5)

Unnamed: 0,Customer_num,Product_num,No_of_Orders,Popularity_Rank
303997,C_999487,P_713,3,1
303994,C_999487,P_5933,3,1
303939,C_999487,P_14838,3,1
303935,C_999487,P_14546,2,4
304000,C_999487,P_7974,2,4


# 3-(C). Merge all the Ranks

In [17]:
#Merge Top Selling Items Rank and Popularity Rank dataframes
cust_prod_rankings_df = pd.merge(top_sell_cust_items_df, freq_items_df,how='inner', on=['Customer_num','Product_num'])

#Get only the Customer, Product, and Rank columns
cust_prod_rankings_df = cust_prod_rankings_df[['Customer_num','Product_num','Total_Tran_qty','Top_Selling_Rank','No_of_Orders','Popularity_Rank']]

#Display Product Rankings DF
cust_prod_rankings_df.head(5)

Unnamed: 0,Customer_num,Product_num,Total_Tran_qty,Top_Selling_Rank,No_of_Orders,Popularity_Rank
0,C_100082,P_10088,1.0,303994,1,303989
1,C_100082,P_12661,1.0,303994,1,303989
2,C_100082,P_12663,1.0,303994,1,303989
3,C_100082,P_12932,1.0,303994,1,303989
4,C_100082,P_13610,2.0,303991,1,303989


In [18]:
# List the Product Rankings
cust_prod_rankings_df.sort_values('Popularity_Rank',ascending=True).head(5)

Unnamed: 0,Customer_num,Product_num,Total_Tran_qty,Top_Selling_Rank,No_of_Orders,Popularity_Rank
303997,C_999487,P_713,3.0,6,3,1
303994,C_999487,P_5933,1.545,26,3,1
303939,C_999487,P_14838,3.0,6,3,1
303935,C_999487,P_14546,3.0,6,2,4
304000,C_999487,P_7974,6.0,1,2,4


# 3-(D). Save the results

In [19]:
#Save the Customer-Product-Rankings into a .csv file
cust_prod_rankings_df.to_csv('models/Customer-Product-Rankings.csv',index=False)

#Save the Customer-Product-Rankings into a .pkl file
pkl.dump(cust_prod_rankings_df, open('models/cust_prod_ranking_model.pkl','wb'))

<font color="red">  
    <h1>4. Build Correlation Matrix for the Customer-Product relations (User-User based recommendation)</h1>
</font>  

# 4-(A). Preparing the corelation matrix

In [20]:
#Find the total qty purchased by each customer of each product
prod_cust_qty_df = X_train.groupby(['Product_num','Customer_num']).agg({'Total_Tran_qty':'sum'})

#Reset the index by converting the Customer_num and Product_num into columns
prod_cust_qty_df.reset_index(inplace=True)

#Find the no of unique customers purchased each product
prod_cust_count_df = X_train.groupby(['Product_num']).agg({'Customer_num':'nunique'})

#Set the customer count column
prod_cust_count_df.columns=['No_of_Customers']

#Reset the index by converting the Customer_num and Product_num into columns
prod_cust_count_df.reset_index(inplace=True)

#Merge the unique customer count and qty purchased of each product
prod_cust_df = pd.merge(prod_cust_qty_df,prod_cust_count_df,how='inner',on='Product_num')

#Create a pivot table with all Customers on columns and Products on rows, and Qty as values
prod_cust_pivot_df = prod_cust_df.pivot(index='Product_num',columns='Customer_num',values='Total_Tran_qty').fillna(0)

#Find the correlation between every two customers and build a correlation matrix using corr() method
#Used Spearman method in identifying the correlation. Pearson was not providing better results and Kendall is taking a long time for execution.
cust_correlation_df = prod_cust_pivot_df.corr(method='spearman',min_periods=5)

#Display the customer-customer corelation
cust_correlation_df.head()

Customer_num,C_100082,C_1001004,C_1001197,C_1001322,C_1001329,C_1001363,C_1001411,C_1001612,C_100187,C_1002010,...,C_998085,C_998307,C_998572,C_998684,C_998829,C_998908,C_999117,C_999155,C_999171,C_999487
Customer_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_100082,1.0,0.041741,-0.000345,-0.000913,0.02438,-0.001063,0.037296,-0.002044,-0.001892,-0.001444,...,-0.000879,-0.001244,-0.000422,-0.00117,-0.001582,-0.001291,-0.001063,-0.001035,-0.001144,-0.002172
C_1001004,0.041741,1.0,-0.000445,0.047689,0.017802,0.040651,0.058657,0.019137,0.044977,0.029109,...,-0.001136,0.070104,-0.000545,-0.001511,0.02609,-0.001667,0.124441,-0.001337,0.037492,0.038557
C_1001197,-0.000345,-0.000445,1.0,-0.000304,-0.000724,-0.000354,-0.000962,-0.000681,-0.00063,-0.000481,...,-0.000293,-0.000415,-0.000141,-0.00039,-0.000527,-0.00043,-0.000354,-0.000345,-0.000381,-0.000724
C_1001322,-0.000913,0.047689,-0.000304,1.0,-0.001915,-0.000938,0.065483,-0.001802,-0.001668,-0.001273,...,-0.000775,-0.001097,-0.000372,-0.001032,-0.001395,-0.001139,-0.000938,-0.000913,-0.001009,-0.001915
C_1001329,0.02438,0.017802,-0.000724,-0.001915,1.0,-0.002231,0.013134,0.009314,-0.00397,0.016043,...,0.029393,0.019537,-0.000886,0.020957,-0.00332,-0.00271,0.023706,-0.002172,-0.002401,0.046356


# 4-(B). Save the corelation matrix

In [21]:
#Save the Customer-Customer-Correlation-Matrix into a .csv file
cust_correlation_df.to_csv('models/Customer-Customer-Correlation-Matrix.csv')

#Save the Customer-Customer-Correlation model into a .pkl file
pkl.dump(cust_correlation_df, open('models/cust_correlation_model.pkl','wb'))

<font color="red">  
    <h1>5. Build Correlation Matrix for the Customer-Product relations (Item-Item based recommendation)</h1>
</font>  

# 5-(A). Preparing the corelation matrix

In [24]:
#Find the total qty purchased by each customer of each product
prod_cust_qty_df = X_train.groupby(['Product_num','Customer_num']).agg({'Total_Tran_qty':'sum'})

#Reset the index by converting the Party and Product into columns
prod_cust_qty_df.reset_index(inplace=True)

#Find the no of unique customers purchased each product
prod_cust_count_df = X_train.groupby(['Product_num']).agg({'Customer_num':'nunique'})

#Set the customer count column
prod_cust_count_df.columns=['No_of_Customers']

#Reset the index by converting the Party and Product into columns
prod_cust_count_df.reset_index(inplace=True)

#Merge the unique customer count and qty purchased of each product
prod_cust_df = pd.merge(prod_cust_qty_df,prod_cust_count_df,how='inner',on='Product_num')

#Create a pivot table with all Products on columns and Customers on rows, and Qty as values
prod_cust_pivot_df = prod_cust_df.pivot(index='Customer_num',columns='Product_num',values='Total_Tran_qty').fillna(0)

#Find the correlation between every two products and build a correlation matrix using corr() method
#Used Spearman method in identifying the correlation. Pearson was not providing better results and Kendall is taking a long time for execution.
prod_correlation_df = prod_cust_pivot_df.corr(method='spearman',min_periods=5)

#Display the product-product corelation matrix
prod_correlation_df.head()

Product_num,P_10,P_1000,P_10002,P_10007,P_10008,P_10009,P_1001,P_10010,P_10011,P_10012,...,P_9978,P_9979,P_998,P_9980,P_9983,P_9984,P_9988,P_999,P_9991,P_9992
Product_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P_10,1.0,0.072309,-0.000144,-0.000176,-0.000288,-0.000102,-0.000144,-0.000367,-0.000668,-0.000305,...,-0.000144,-0.000102,0.084663,-0.000144,-0.000102,-0.000102,-0.000176,-0.000102,-0.00062,-0.000585
P_1000,0.072309,1.0,-0.002028,-0.002484,-0.004058,-0.001434,-0.002028,-0.005174,-0.009424,-0.004304,...,-0.002028,-0.001434,0.068631,0.048283,-0.001434,-0.001434,-0.002484,-0.001434,0.015262,0.004382
P_10002,-0.000144,-0.002028,1.0,-0.000249,-0.000407,0.706999,-0.000203,-0.000519,-0.000945,-0.000432,...,-0.000203,-0.000144,0.058643,-0.000203,-0.000144,-0.000144,-0.000249,-0.000144,-0.000876,-0.000828
P_10007,-0.000176,-0.002484,-0.000249,1.0,-0.000498,-0.000176,-0.000249,-0.000635,-0.001158,-0.000529,...,-0.000249,-0.000176,-0.002107,-0.000249,-0.000176,-0.000176,-0.000305,-0.000176,-0.001073,-0.001014
P_10008,-0.000288,-0.004058,-0.000407,-0.000498,1.0,-0.000288,-0.000407,-0.001038,-0.001891,-0.000864,...,-0.000407,-0.000288,-0.003441,-0.000407,-0.000288,-0.000288,-0.000498,-0.000288,0.114393,-0.001656


# 5-(B). Save the corelation matrix

In [27]:
#Save the Product-Product-Correlation-Matrix into a .csv file
prod_correlation_df.to_csv('models/Product-Product-Correlation-Matrix.csv')

#Save the Product-Product-Correlation model into a .pkl file
pkl.dump(prod_correlation_df, open('models/prod_correlation_model.pkl','wb'))

<font color="red">  
    <h1>6. Assignment Tasks</h1>
</font>  

# Load the models.

In [29]:
prod_ranking_model = pkl.load(open('models/prod_ranking_model.pkl','rb'))
cust_prod_ranking_model = pkl.load(open('models/cust_prod_ranking_model.pkl','rb'))
cust_correlation_model = pkl.load(open('models/cust_correlation_model.pkl','rb'))
prod_correlation_model = pkl.load(open('models/prod_correlation_model.pkl','rb'))

# Most Popular and Top Selling Products

In [32]:
def most_popular_products(k):
    """
    This function will return the top K most popular products across all customers.
    """
    most_popular_prods = prod_ranking_model.sort_values('Popularity_Rank',ascending=True)[['Product_num']].head(k).reset_index(drop=True)
    return most_popular_prods


most_popular_prods = most_popular_products(3)
most_popular_prods.head()

Unnamed: 0,Product_num
0,P_16717
1,P_9168
2,P_7819


In [37]:
def top_selling_products(k):
    """
    This function will return the top K most purchased products across all customers.
    """
    top_sell_prods = prod_ranking_model.sort_values('Top_Selling_Rank',ascending=True)[['Product_num']].head(k).reset_index(drop=True)
    return top_sell_prods

top_sell_prods = top_selling_products(3)
top_sell_prods.head()

Unnamed: 0,Product_num
0,P_4822
1,P_2342
2,P_2341


# Customer Frequently Purchased and Purchased the Most Products

In [39]:
def cust_most_popular_table(cust_name, k):
    """
    This function will return the top K most popular products for a particular customer.
    """
    cust_most_popular_prods = cust_prod_ranking_model[cust_prod_ranking_model['Customer_num'] == cust_name]
    cust_most_popular_prods = cust_most_popular_prods.sort_values('Popularity_Rank',ascending=True)[['Product_num']].head(k).reset_index(drop=True)
    return cust_most_popular_prods

cust_name = 'C_100082'
k = 3
cust_most_popular_prods = cust_most_popular_table(cust_name, k)
cust_most_popular_prods.head()

Unnamed: 0,Product_num
0,P_10088
1,P_6437
2,P_44347


In [40]:
def cust_top_sell_table(cust_name, k):
    """
    This function will return the top K most purchased products for a particular customer.
    """
    cust_top_sell_prods = cust_prod_ranking_model[cust_prod_ranking_model['Customer_num'] == cust_name]
    cust_top_sell_prods = cust_top_sell_prods.sort_values('Top_Selling_Rank',ascending=True)[['Product_num']].head(k).reset_index(drop=True)
    return cust_top_sell_prods

cust_name = 'C_100082'
k= 3
cust_top_sell_prods = cust_top_sell_table(cust_name, k)
cust_top_sell_prods.head()

Unnamed: 0,Product_num
0,P_44347
1,P_17520
2,P_33622


## 1. What are the next 3 products to be purchased by each customer?

In [44]:
# This function performs the below functionality for the input customer
# - get the list of customers with similar purchasing pattern and correlation coefficient
# - for each customer from the list,
#   - get the products purchased
#   - multiply the purchased qty with customer correlation coefficient
# - aggregate the qty_corr by product
# - ignore the products already purchased by the input customer
# - sort them by the qty_corr
# - calls the html_code_table function to create a .html file for top 10 products customer may like

def recommend_prod_cust(cust_name, k):
    similar_custs_corr = cust_correlation_model.loc[cust_name].sort_values(ascending=False)
    
    prod_by_similar_custs = pd.DataFrame()
    
    # get the products purchased by each customer and multiply with the customer correlation coefficient
    for i in range(len(similar_custs_corr)):
        if similar_custs_corr.index[i] != cust_name:
            cust_top_sell_prods = cust_prod_ranking_model[cust_prod_ranking_model['Customer_num'] == similar_custs_corr.index[i]]
            cust_top_sell_prods = cust_top_sell_prods[['Product_num','Total_Tran_qty']].reset_index(drop=True)
            cust_top_sell_prods['Total_Tran_qty_Corr'] = cust_top_sell_prods['Total_Tran_qty'] * similar_custs_corr.iloc[i]
            prod_by_similar_custs = pd.concat([cust_top_sell_prods,prod_by_similar_custs])
    
    # aggregate the Qty Correlation by Product
    prod_by_similar_custs = prod_by_similar_custs.groupby('Product_num').agg({'Total_Tran_qty_Corr':'sum'})
    prod_by_similar_custs.reset_index(inplace=True)
    #print(prod_by_similar_custs.head(20))
    
    # ignore the products already purchased by the input customer
    # merge prod_by_similar_custs and customer purchased products and drop the rows with No_of_orders being Not Null
    input_cust_top_sell_prods = cust_prod_ranking_model[cust_prod_ranking_model['Customer_num'] == cust_name]
    df_merge = pd.merge(prod_by_similar_custs,input_cust_top_sell_prods[['Product_num','No_of_Orders']],how='left',on='Product_num')
    prod_recommend_to_cust = df_merge[df_merge['No_of_Orders'].isnull()]
    
    # sort the dataframe on Total_Tran_qty_Corr
    prod_recommend_to_cust = prod_recommend_to_cust.sort_values('Total_Tran_qty_Corr',ascending=False)[['Product_num']].head(k).reset_index(drop=True)
    return prod_recommend_to_cust
    
cust_name = 'C_1607154'
k = 3
recommend_prod_cust(cust_name, k)

Unnamed: 0,Product_num
0,P_4822
1,P_2342
2,P_1926


In [42]:
X_train

Unnamed: 0,Customer_num,Tran_dt,Product_num,Total_Tran_qty
0,C_203152,2022-01-02,P_3365,0.135
1,C_1607154,2022-01-02,P_58,1.000
2,C_84518,2022-01-02,P_21295,3.000
3,C_2553711,2022-01-02,P_20576,1.000
4,C_2376922,2022-01-02,P_1095,2.000
...,...,...,...,...
338110,C_1095013,2022-03-31,P_8370,3.000
338111,C_2533816,2022-03-31,P_38048,1.000
338112,C_243717,2022-03-31,P_17304,1.000
338113,C_2472562,2022-03-31,P_15654,1.000


## 2. Who are the top 5 similar customers to each customer?

In [51]:
cust_name = 'C_118160'
similar_cust_corr = cust_correlation_model.loc[cust_name].sort_values(ascending=False)

In [None]:
similar_custs = pd.merge(similar_prods_corr,prod_ranking_model[['Product_num']],how='left',on='Product_num')

In [None]:
drop_index = similar_prods[similar_prods['Product_num'] == prod_name].index
similar_prods.drop(index=drop_index,inplace=True)

similar_prods = similar_prods[['Product_num']].head(k).reset_index(drop=True)

In [50]:
def similar_prods(prod_name, k):
    similar_prods_corr = prod_correlation_model.loc[prod_name].sort_values(ascending=False)
    
    similar_prods = pd.merge(similar_prods_corr,prod_ranking_model[['Product_num']],how='left',on='Product_num')
    
    drop_index = similar_prods[similar_prods['Product_num'] == prod_name].index
    similar_prods.drop(index=drop_index,inplace=True)
    
    similar_prods = similar_prods[['Product_num']].head(k).reset_index(drop=True)
    
    return similar_prods

prod_name = 'P_38048'
k = 5
similar_prods = similar_prods(prod_name, k)
similar_prods.head()

Unnamed: 0,Product_num
0,P_31962
1,P_33356
2,P_34961
3,P_7539
4,P_31053


In [48]:
# This function performs the below functionality for the input product
# - get the list of products with similar purchasing pattern and correlation coefficient
# - get the price of each product from prod_ranking_model
# - drop the product in view from the list
# - sort them by the correlation coefficient
# - calls the html_code_table function to create a .html file for top 10 products similar to the product in view


def similar_customers(cust_name,k):
    similar_customers_corr = cust_correlation_model.loc[cust_name].sort_values(ascending=False)
    
    similar_customers = pd.merge(similar_customers_corr,cust_ranking_model[['Customer_num']],how='left',on='Customer_num')
    
    drop_index = similar_customers[similar_customers['Customer_num'] == cust_name].index
    similar_customers.drop(index=drop_index,inplace=True)
    
    similar_customers = similar_customers[['Customer_num']].head(k).reset_index(drop=True)
    return similar_customers

cust_name = 'C_2533816'
k = 5
similar_customers = similar_customers(cust_name,k)

NameError: name 'cust_ranking_model' is not defined

## 3. Recall calculation on the basis of the shared holdout set.