In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
recom_df = pd.read_csv('recom.csv')

# **Exploratory Data Analysis**

In [4]:
recom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      50000 non-null  int64  
 1   Main_ID         50000 non-null  object 
 2   Transaction_ID  50000 non-null  object 
 3   Date            50000 non-null  object 
 4   Price           50000 non-null  float64
 5   Code_Product    50000 non-null  float64
 6   Amount          50000 non-null  float64
 7   ItemKey         28597 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 3.1+ MB


In [5]:
recom_df = recom_df.drop('Unnamed: 0', axis = 1) # this is just a duplicate of index column and we will drop it

In [6]:
recom_df.sample(20)

Unnamed: 0,Main_ID,Transaction_ID,Date,Price,Code_Product,Amount,ItemKey
31619,a11c3386,2b6c8f0c,2022-08-29 16:09:17.640,21.0,49291.5,1.0,
14598,976c2ff8,ab70f614,2022-09-25 16:40:11.907,7.0,40009.5,2.0,
5680,c159f4fd,a1446aaf,2022-10-07 11:56:16.517,52.0,10013.0,1.0,10013.0
18297,ca30093c,01418ba8,2022-11-20 22:09:20.783,165.5,48513.0,1.0,
19841,9a61779a,7476131b,2022-11-12 14:08:19.590,89.0,5025.0,1.0,5025.0
9983,c2afcc39,7eb3b61c,2022-10-24 12:18:30.617,13.0,45004.0,1.0,45004.0
7623,babc1d56,e8ac5807,2022-12-01 14:57:45.500,18.5,45004.0,1.0,45004.0
4324,d530d3df,12b1df66,2022-10-09 16:10:19.440,11.5,40021.5,1.0,40021.5
43458,6cdb187d,785c2a6f,2022-11-04 16:00:37.907,52.5,49292.0,1.0,
39576,e987cc20,fcfb7439,2022-09-08 15:56:34.590,36.0,30026.5,1.0,30026.5


In [7]:
recom_df['Main_ID'].nunique() # number of unique users

28514

In [8]:
recom_df['Transaction_ID'].nunique() # multiple products may have been bougth with one transaction. 
                                    # But most transactions probably have only one product

48403

In [9]:
recom_df.groupby('Transaction_ID')['Code_Product'].count().describe()

count    48403.000000
mean         1.032994
std          0.184651
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          3.000000
Name: Code_Product, dtype: float64

**It seems at least 75% of transactions have bought one product with smallers fractions of transactions having up to 3 products. This generates co-purchase signals. However, with an average of 1.03, co-purchases are relatively rare**

### **Does one product map to multiple item keys?**

In [10]:
recom_df['Code_Product'].nunique()

333

In [11]:
recom_df['ItemKey'].nunique()

152

In [12]:
recom_df[['Code_Product', 'ItemKey']].drop_duplicates().head(10)

Unnamed: 0,Code_Product,ItemKey
0,5002.0,5002.0
1,35012.0,
2,5005.0,5005.0
3,35078.5,
4,49291.5,
6,5012.0,5012.0
7,49292.0,
9,5011.5,5011.5
10,5025.0,5025.0
11,10032.5,10032.5


In [13]:
recom_df.groupby('Code_Product')['ItemKey'].nunique().describe() 

count    333.000000
mean       0.456456
std        0.498850
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: ItemKey, dtype: float64

In [14]:
recom_df = recom_df.drop('ItemKey', axis = 1) #It seems 'ItemKey' is simply a partial copy of 'Code_Product'

**Exploratory analysis revealed that 'ItemKey' mirrors 'Code_Product' where available but contains a substantial number of missing values. Therefore, Code_Product was used as the unique item identifier to ensure full data coverage**

### **Creating a new dataframe with purchase count for products**

In [15]:
user_item_count_df = recom_df.groupby(['Main_ID','Code_Product'])['Transaction_ID']\
                            .count()\
                            .reset_index(name='Purchase_Count')

In [16]:
user_item_count_df.sample(10)

Unnamed: 0,Main_ID,Code_Product,Purchase_Count
6975,264248be,5000.5,1
10081,36ae9fa1,49292.0,1
12778,45f3b09a,49291.5,3
7838,2b1e2259,30014.0,1
23379,81cd4d76,49291.5,1
34437,c1ec0fd4,5027.0,1
6630,24919e7e,10003.0,1
10327,37fbea85,49356.0,1
15115,53774a49,49129.0,1
9156,321b7eb6,40069.5,1


### **Pivot Table**

In [17]:
pivot_table = user_item_count_df.pivot_table(index = 'Main_ID',
                                             columns = 'Code_Product',
                                             values = 'Purchase_Count',
                                             fill_value = 0)

In [18]:
pivot_table

Code_Product,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,5007.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
Main_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00024de6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00084856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0008e848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00096930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000c66b7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff8b1c4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fff905d0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fff9726b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffa332b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Cosine Similarity**

In [19]:
item_similarity = cosine_similarity(pivot_table.T)

In [20]:
item_similarity_df = pd.DataFrame(item_similarity, 
                                  index = pivot_table.columns, 
                                  columns = pivot_table.columns)

In [21]:
item_similarity_df

Code_Product,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,5007.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
Code_Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5000.5,1.000000,0.191281,0.132201,0.170314,0.0,0.013488,0.003334,0.066335,0.065775,0.043908,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5001.0,0.191281,1.000000,0.061417,0.133634,0.0,0.006876,0.000000,0.041405,0.105375,0.039498,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5001.5,0.132201,0.061417,1.000000,0.053683,0.0,0.015038,0.011152,0.013584,0.031428,0.025916,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5002.0,0.170314,0.133634,0.053683,1.000000,0.0,0.012020,0.000000,0.043429,0.046053,0.013810,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5002.5,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200045.5,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.25,0.0,0.0,1.0,0.0,0.0,0.0,0.0
200046.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.091287,0.0,0.00,0.0,0.0,0.0,1.0,0.0,0.0,0.0
200046.5,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0
200047.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
def recommend_items(user_id, table, similarity_df, n):
    user_purchases = table.loc[user_id]
    bought_items = user_purchases[user_purchases > 0].index
    scores = similarity_df[bought_items].sum(axis = 1)
    scores = scores.drop(bought_items)
    return scores.sort_values(ascending = False).head(n)

In [23]:
user_id = pivot_table.index[0]

In [24]:
recommend_items(user_id, pivot_table, item_similarity_df, 5)

Code_Product
35049.0    0.179605
35001.0    0.153596
35096.5    0.145919
30026.5    0.081931
35079.5    0.074836
dtype: float64

In [25]:
recommendations = {}

for user in pivot_table.index:
    recommendations[user] = recommend_items(user, 
                                            pivot_table, 
                                            item_similarity_df, 
                                            5).index.tolist()

In [26]:
recommendations_df = pd.DataFrame.from_dict(recommendations, 
                                            orient = 'index', 
                                            columns = ['rec1', 'rec2', 'rec3', 'rec4', 'rec5'])

In [27]:
recommendations_df

Unnamed: 0,rec1,rec2,rec3,rec4,rec5
00024de6,35049.0,35001.0,35096.5,30026.5,35079.5
00084856,5000.5,45004.0,48513.0,25003.0,10013.0
0008e848,48523.5,5000.5,48522.5,35071.5,15000.5
00096930,5000.5,49292.0,45004.0,48513.0,25003.0
000c66b7,5000.5,45004.0,10015.0,5001.0,25003.0
...,...,...,...,...,...
fff8b1c4,35076.0,35106.0,40001.5,35078.0,35096.5
fff905d0,49291.5,5000.5,45004.0,10013.0,25003.0
fff9726b,5000.5,45004.0,5001.0,5009.0,25003.0
fffa332b,5000.5,48513.0,25003.0,49291.5,49292.0


In [28]:
recommendations_df.reset_index(inplace = True)

In [29]:
recommendations_df.rename(columns = {'index':'Main_ID'}, inplace = True)

In [30]:
recommendations_df

Unnamed: 0,Main_ID,rec1,rec2,rec3,rec4,rec5
0,00024de6,35049.0,35001.0,35096.5,30026.5,35079.5
1,00084856,5000.5,45004.0,48513.0,25003.0,10013.0
2,0008e848,48523.5,5000.5,48522.5,35071.5,15000.5
3,00096930,5000.5,49292.0,45004.0,48513.0,25003.0
4,000c66b7,5000.5,45004.0,10015.0,5001.0,25003.0
...,...,...,...,...,...,...
28509,fff8b1c4,35076.0,35106.0,40001.5,35078.0,35096.5
28510,fff905d0,49291.5,5000.5,45004.0,10013.0,25003.0
28511,fff9726b,5000.5,45004.0,5001.0,5009.0,25003.0
28512,fffa332b,5000.5,48513.0,25003.0,49291.5,49292.0


In [31]:
Since explicit ratings were unavailable, an implicit feedback approach was adopted. Purchase frequency was used as a proxy for user preference by counting the number of transactions per user–item pair.
Code_Product was selected as the item identifier since it contains no missing values and uniquely represents each product, ensuring data completeness and stable item representation in the recommendation system.

SyntaxError: invalid character '–' (U+2013) (3544863800.py, line 1)