**Import the necessary libraries**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

**Load data into pandas dataframe**

In [2]:
recom_df = pd.read_csv(r'D:\MACHINE LEARNING\Mentorship program\ML Project2\recom.csv',index_col=0)

In [3]:
recom_df.head()

Unnamed: 0,Main_ID,Transaction_ID,Date,Price,Code_Product,Amount,ItemKey
0,90fada91,264f7a69,2022-10-07 20:53:49.153,125.0,5002.0,1.0,5002.0
1,9006f9ac,45c7d853,2022-09-17 15:54:57.187,19.0,35012.0,1.0,
2,32270891,61ad76dd,2022-11-28 13:51:55.667,141.0,5005.0,1.0,5005.0
3,97e03e47,41ee09f6,2022-09-12 16:20:22.110,4.5,35078.5,1.0,
4,41949228,244fe6d8,2022-10-14 18:53:43.933,129.5,49291.5,5.0,


In [4]:
# To check the dataset for each user(Main_ID) separately
recom_df.sort_values(by='Main_ID')

Unnamed: 0,Main_ID,Transaction_ID,Date,Price,Code_Product,Amount,ItemKey
31628,00024de6,9016611c,2022-09-03 15:12:52.607,6.0,35097.0,2.0,
37021,00084856,3a8c5a03,2022-10-21 16:40:07.373,24.0,49292.0,1.0,
37004,00084856,cb1b492d,2022-11-05 15:08:11.743,22.0,49291.5,1.0,
18346,0008e848,382c1a6b,2022-09-13 19:34:10.853,13.0,40002.0,1.0,
3927,00096930,3bc7a8f5,2022-11-25 20:13:57.747,48.0,49291.5,2.0,
...,...,...,...,...,...,...,...
39799,fff905d0,09b9a492,2022-10-17 13:45:20.843,24.0,49292.0,1.0,
13259,fff9726b,de2af9b8,2022-11-10 11:35:57.950,66.0,45001.5,1.0,45001.5
38966,fff9726b,4ca53dbb,2022-09-23 15:50:50.903,9.0,30003.0,1.0,30003.0
35259,fffa332b,0ff3f909,2022-09-25 11:31:19.700,58.0,49556.0,1.0,


In [5]:
recom_df.duplicated().sum()

171

In [6]:
recom_df['Transaction_ID'].value_counts()

9e0891a8    3
e2fdce56    3
78b64973    3
4fd75519    3
e71b5fae    3
           ..
96a11862    1
adc7d3dc    1
9a510f4e    1
0f1112fd    1
0ff8b41f    1
Name: Transaction_ID, Length: 48403, dtype: int64

In [7]:
recom_df[recom_df['Transaction_ID']=='e2fdce56']

Unnamed: 0,Main_ID,Transaction_ID,Date,Price,Code_Product,Amount,ItemKey
14083,4e1ade63,e2fdce56,2022-10-28 11:58:18.400,531.5,5009.0,1.0,5009.0
14586,4e1ade63,e2fdce56,2022-10-28 11:58:18.400,531.5,5009.0,1.0,5009.0
26563,4e1ade63,e2fdce56,2022-10-28 11:58:18.400,531.5,10023.0,1.0,10023.0


In [8]:
recom_df.drop_duplicates(inplace=True)

In [9]:
recom_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49829 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Main_ID         49829 non-null  object 
 1   Transaction_ID  49829 non-null  object 
 2   Date            49829 non-null  object 
 3   Price           49829 non-null  float64
 4   Code_Product    49829 non-null  float64
 5   Amount          49829 non-null  float64
 6   ItemKey         28512 non-null  float64
dtypes: float64(4), object(3)
memory usage: 3.0+ MB


In [10]:
# Since ItemKey and Code_Product columns are same with some of the values are missing in ItemKey. 
# Hence we kept the Code+Product and dropped the ItemKey.
# recom_df['ItemKey'].unique()

In [11]:
# recom_df['ItemKey1']=recom_df['ItemKey'].fillna(recom_df['Code_Product'])

In [12]:
# recom_df['ItemKeydiff']=recom_df['ItemKey1']-recom_df['Code_Product']

In [13]:
# recom_df['ItemKeydiff'].unique()

In [14]:
recom_df.drop('ItemKey',axis=1,inplace=True)

In [15]:
recom_df

Unnamed: 0,Main_ID,Transaction_ID,Date,Price,Code_Product,Amount
0,90fada91,264f7a69,2022-10-07 20:53:49.153,125.0,5002.0,1.0
1,9006f9ac,45c7d853,2022-09-17 15:54:57.187,19.0,35012.0,1.0
2,32270891,61ad76dd,2022-11-28 13:51:55.667,141.0,5005.0,1.0
3,97e03e47,41ee09f6,2022-09-12 16:20:22.110,4.5,35078.5,1.0
4,41949228,244fe6d8,2022-10-14 18:53:43.933,129.5,49291.5,5.0
...,...,...,...,...,...,...
49995,bb127ffb,4e0eb5ab,2022-09-24 21:48:20.847,111.5,45004.0,4.0
49996,a8bc484a,c9946c16,2022-11-18 19:49:01.973,34.0,49292.0,1.0
49997,c983862a,d1a35c5c,2022-11-24 20:02:43.023,178.0,5001.5,1.0
49998,8821da12,66f9b474,2022-11-06 13:07:01.423,26.0,49291.5,1.0


In [16]:
recom_df.sort_values(by='Main_ID',inplace=True)

In [17]:
recom_df.reset_index(drop=True)

Unnamed: 0,Main_ID,Transaction_ID,Date,Price,Code_Product,Amount
0,00024de6,9016611c,2022-09-03 15:12:52.607,6.0,35097.0,2.0
1,00084856,3a8c5a03,2022-10-21 16:40:07.373,24.0,49292.0,1.0
2,00084856,cb1b492d,2022-11-05 15:08:11.743,22.0,49291.5,1.0
3,0008e848,382c1a6b,2022-09-13 19:34:10.853,13.0,40002.0,1.0
4,00096930,3bc7a8f5,2022-11-25 20:13:57.747,48.0,49291.5,2.0
...,...,...,...,...,...,...
49824,fff905d0,09b9a492,2022-10-17 13:45:20.843,24.0,49292.0,1.0
49825,fff9726b,de2af9b8,2022-11-10 11:35:57.950,66.0,45001.5,1.0
49826,fff9726b,4ca53dbb,2022-09-23 15:50:50.903,9.0,30003.0,1.0
49827,fffa332b,0ff3f909,2022-09-25 11:31:19.700,58.0,49556.0,1.0


In [18]:
# recom_df[recom_df['Code_Product']==5002.5]

In [19]:
# recom_df.set_index('Main_ID')

In [20]:
d= pd.pivot_table(index='Main_ID',columns='Code_Product',values='Amount', data=recom_df) 
d

Code_Product,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,5007.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
Main_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00024de6,,,,,,,,,,,...,,,,,,,,,,
00084856,,,,,,,,,,,...,,,,,,,,,,
0008e848,,,,,,,,,,,...,,,,,,,,,,
00096930,,,,,,,,,,,...,,,,,,,,,,
000c66b7,,,,1.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff8b1c4,,,,,,,,,,,...,,,,,,,,,,
fff905d0,,,,,,,,,,,...,,,,,,,,,,
fff9726b,,,,,,,,,,,...,,,,,,,,,,
fffa332b,,,,,,,,,,,...,,,,,,,,,,


**Calculating the pearson correlation**

In [21]:
d.corr(method='pearson')

Code_Product,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,5007.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
Code_Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5000.5,1.000000,0.041565,-0.086369,0.259041,,,,-0.161382,-0.076829,-0.292770,...,,,,,,,,,,
5001.0,0.041565,1.000000,0.124233,-0.111430,,,,-0.301398,0.569109,-0.333333,...,,,,,,,,,,
5001.5,-0.086369,0.124233,1.000000,-0.097222,,,,,,1.000000,...,,,,,,,,,,
5002.0,0.259041,-0.111430,-0.097222,1.000000,,,,0.496139,,,...,,,,,,,,,,
5002.5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200045.5,,,,,,,,,,,...,,,,,,1.0,,,,
200046.0,,,,,,,,,,,...,,,,,,,,,,
200046.5,,,,,,,,,,,...,,,,,,,,,,
200047.0,,,,,,,,,,,...,,,,,,,,,,


In [22]:
#A list of 20 products to recommend to a user who has purchased'5002.0'
# d.corr(method='pearson')[5002.0].sort_values(ascending=False).iloc[:20]

**We have to recommend multiple items for each user**<br>A list is created containing users and the items purchased by the user




In [23]:
# lst1 = list(zip(recom_df['Main_ID'], recom_df['Code_Product']))
lst=list(set(list(zip(recom_df['Main_ID'], recom_df['Code_Product']))))
# lst

**Converting the list into dictionary containing multiple items purchased by each user**

In [24]:
Code_Product_dict = dict()

for line in lst:
    if line[0] in Code_Product_dict:
        # append the new number to the existing array at this slot
        Code_Product_dict[line[0]].append(line[1])
    else:
        # create a new array in this slot
        Code_Product_dict[line[0]] = [line[1]]

In [25]:
Code_Product_dict

{'f3d03d7a': [5000.5, 30026.5, 48556.5, 25003.0, 5011.5],
 '34d5fd08': [49292.0, 10023.0],
 'e2637be7': [10023.0, 10015.0],
 'c7e95da6': [45004.0, 5009.0, 5002.0, 49427.5],
 '29a921bc': [5025.0],
 'ea785e04': [35005.0, 5000.5, 49129.0],
 '89726537': [10015.0, 45003.5, 30008.0, 45000.5, 45004.0],
 'a180aff9': [35000.5],
 '9a2886b4': [25003.0],
 'db555906': [10032.5],
 'c0f5501d': [49291.5, 45001.5],
 '310fc119': [49291.5, 5000.5],
 'ea248aa6': [40028.5],
 'ec55ffc8': [40009.5, 10004.0, 40020.5, 45001.0, 5001.0, 10023.0],
 'e709f216': [5027.5,
  5011.5,
  45000.5,
  45004.0,
  48513.0,
  10022.5,
  49291.5,
  35001.0],
 '0032f70b': [5009.0],
 '6827331a': [48504.5, 5011.5, 45004.0, 49291.5, 35078.5],
 '8bfeeb44': [10004.0, 45004.0, 49556.0, 49292.0],
 '9be83098': [5000.5, 5025.0],
 '180e5ee3': [49291.5],
 'f404d05d': [10004.0],
 '3392c111': [5000.5, 49356.0, 5009.0, 5013.0],
 '74790659': [49291.5],
 '856e70a6': [45001.0, 5009.0, 35000.5, 30003.5, 40019.0],
 '7cf424e6': [40002.0, 40028.5, 

In [26]:
# len(Code_Product_dict)

In [27]:
# list(Code_Product_dict.keys())[0]

In [28]:
# def product_recommend(name):
#     if name in Code_Product_dict.keys():
#         index = list(Code_Product_dict).index(name)
#         print(index)
#         for j in range(len(list(Code_Product_dict.values())[index])):
#             dfr=pd.DataFrame(d.corr(method='pearson')[list(Code_Product_dict.values())[index][j]].sort_values(ascending=False).iloc[:11])
#             dfr.reset_index(inplace=True)
#             print('\n')
#             print(list(Code_Product_dict.keys())[index])
#             dfr_filtered = dfr[dfr['Code_Product'] != list(Code_Product_dict.values())[index][j]]
    
#             print(dfr_filtered)
#     else:
#         print('user does not exist')

code_pcoeff: empty list containing code_product and their pearson coefficients.<br> Function created to pass Main_Id as a parameter to obtain recommendation for given Main_Id(User).<br> Check initiated to accept only valid Main_Id via first If else statement.<br> First for loop:loop through each code_product for a given user and provide recommendation for each code_product<br>len(list(Code_Product_dict.values())[index]:contains length of the list of code_products against the index of Main_ID<br>dfr is dataframe containing correlation of code_products, all the NaN values are filled with zero<br> Reset_index is done to decrease the multilevel indexing<br>dfr_filtered: exclude the correlation of a code_product with itself<br> Second for loop: loop through the first two recommended code_products against each code_product purchased by Main_ID and append the corresponding code_products with their pearson cofficients in  code_pcoeff.<br>Seconf If else statement: To provide maximum of 10 recommendations to any user.<br>first_data_code_pcoeff: only extarct the code_product not the pearson coeffient.

In [29]:
def product_recommend(name):
    code_pcoeff=[]
    
    if name in Code_Product_dict.keys():
        index = list(Code_Product_dict).index(name)
        print(index)
        for j in range(len(list(Code_Product_dict.values())[index])):
            dfr=pd.DataFrame(d.corr(method='pearson').fillna(0)[list(Code_Product_dict.values())[index][j]].round(3).sort_values(ascending=False).iloc[:5])
            dfr.reset_index(inplace=True)
#             print('\n')
#             print(list(Code_Product_dict.keys())[index])
            dfr_filtered = dfr[dfr['Code_Product'] != list(Code_Product_dict.values())[index][j]]
            dfr_filtered.reset_index(drop=True, inplace=True)
            for i in range(2):
                cp=tuple(list(dfr_filtered.iloc[i]))
                code_pcoeff.append(cp)
    else:
        return 'user does not exist'
        
        
    if len(code_pcoeff)>10:
        code_pcoeff=code_pcoeff[slice(10)]
    else:
        code_pcoeff=code_pcoeff 
        
        
#     print(code_pcoeff)   
    first_data_code_pcoeff = [t[0] for t in code_pcoeff]   
    return 'List of items purchased by user',list(Code_Product_dict.values())[index],'List of items recommended to user', first_data_code_pcoeff

In [30]:
product_recommend('361e3b9f')

15699


('List of items purchased by user',
 [48513.0, 45001.0],
 'List of items recommended to user',
 [30026.5, 15002.0, 15004.0, 30009.5])

In [31]:
# recom_df[recom_df['Main_ID']=='361e3b9f']

In [32]:
# dfr=pd.DataFrame(d.corr(method='pearson').fillna(0)[48513.0].sort_values(ascending=False).iloc[:5])
# dfr.reset_index(inplace=True)

In [33]:
# list(set(list(Code_Product_dict.values())[12]))[0]

In [34]:
# recom_df[recom_df['Main_ID']=='021bfefb']

In [35]:
# dd=pd.DataFrame(np.diag(d2), index=[d2.index, d2.columns])
# dd

In [36]:
# dd[0].sum()