In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Load customer and transaction data
customers = pd.read_csv('C:/Users/sures/Downloads/Customers.csv')  # Assuming customers.csv contains customer data
transactions = pd.read_csv('C:/Users/sures/Downloads/Transactions.csv')  # Assuming transactions.csv contains customer transaction data

In [24]:
# Handling missing data (if needed)
customers = customers.fillna(0)  # For example, filling missing values with 0

# Feature engineering
# Example: creating a transaction matrix based on product categories
transaction_matrix = pd.pivot_table(transactions, index='CustomerID', columns='ProductID', aggfunc='sum', fill_value=0)

# Normalize transaction data
scaler = StandardScaler()
normalized_data = scaler.fit_transform(transaction_matrix)

In [25]:
# Calculate the cosine similarity between all customers
similarity_matrix = cosine_similarity(normalized_data)

# Convert the similarity matrix into a DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=transaction_matrix.index, columns=transaction_matrix.index)

In [26]:
similarity_matrix

array([[ 1.        , -0.05084889, -0.06130396, ..., -0.04040134,
         0.28199452, -0.06851341],
       [-0.05084889,  1.        , -0.03604631, ..., -0.00895378,
        -0.03668435, -0.04178098],
       [-0.06130396, -0.03604631,  1.        , ..., -0.02656805,
        -0.04736234, -0.04930843],
       ...,
       [-0.04040134, -0.00895378, -0.02656805, ...,  1.        ,
        -0.01464664,  0.33446638],
       [ 0.28199452, -0.03668435, -0.04736234, ..., -0.01464664,
         1.        , -0.05460612],
       [-0.06851341, -0.04178098, -0.04930843, ...,  0.33446638,
        -0.05460612,  1.        ]])

In [27]:
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.050849,-0.061304,-0.079125,-0.052660,-0.064371,0.161627,-0.075040,-0.046351,-0.051970,...,0.068045,-0.058740,-0.059086,0.339435,-0.080742,-0.052822,-0.053926,-0.040401,0.281995,-0.068513
C0002,-0.050849,1.000000,-0.036046,-0.052885,-0.025248,-0.035412,-0.024578,0.273161,-0.014135,-0.029158,...,-0.040793,-0.030546,-0.033482,-0.050928,-0.051263,-0.028170,-0.026714,-0.008954,-0.036684,-0.041781
C0003,-0.061304,-0.036046,1.000000,0.053436,0.236492,-0.045712,-0.033978,0.182628,-0.031099,-0.036997,...,-0.049894,-0.041431,-0.042116,-0.058103,-0.058433,-0.037369,-0.037823,-0.026568,-0.047362,-0.049308
C0004,-0.079125,-0.052885,0.053436,1.000000,0.089804,-0.065855,-0.053722,-0.002890,-0.072055,0.041711,...,-0.066018,-0.063929,-0.058331,0.031303,-0.067767,-0.055967,-0.061571,-0.070104,0.132798,-0.061229
C0005,-0.052660,-0.025248,0.236492,0.089804,1.000000,-0.033107,-0.020320,-0.058004,0.001966,-0.028018,...,-0.041404,0.227374,-0.032608,0.156290,-0.057070,-0.025149,-0.021030,0.009359,-0.034287,-0.044525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.052822,-0.028170,-0.037369,-0.055967,-0.025149,0.219080,-0.024795,-0.052681,-0.012300,-0.029976,...,-0.042255,-0.030950,-0.034485,-0.053463,-0.053824,1.000000,-0.026798,-0.006627,0.197934,-0.043582
C0197,-0.053926,-0.026714,-0.037823,-0.061571,-0.021030,0.253587,-0.022140,-0.057739,-0.002294,-0.029257,...,-0.042618,-0.028236,-0.033928,-0.056987,0.115046,-0.026798,1.000000,0.004752,-0.036072,-0.045270
C0198,-0.040401,-0.008954,-0.026568,-0.070104,0.009359,-0.014174,0.000570,-0.064650,0.053781,-0.014639,...,-0.029113,-0.002975,-0.018512,-0.055707,-0.056319,-0.006627,0.004752,1.000000,-0.014647,0.334466
C0199,0.281995,-0.036684,-0.047362,0.132798,-0.034287,-0.047004,-0.033038,0.009684,-0.021127,-0.038584,...,-0.053637,-0.040913,-0.044239,0.188789,0.097450,0.197934,-0.036072,-0.014647,1.000000,-0.054606


In [28]:
lookalike_dict = {}

# For each customer, get the top 3 most similar customers
for cust_id in range(1, 21):  # For customers C0001 to C0020
    similar_customers = similarity_df.loc[f'C{cust_id:04d}'].sort_values(ascending=False).iloc[1:4]
    lookalike_dict[f'C{cust_id:04d}'] = [(cust, score) for cust, score in zip(similar_customers.index, similar_customers.values)]

# Convert the dictionary to a DataFrame
lookalike_df = pd.DataFrame([(key, value[0][0], value[0][1]) for key, value in lookalike_dict.items()], columns=["CustomerID", "LookalikeID", "SimilarityScore"])
lookalike_df.to_csv('Lookalike.csv', index=False)


In [29]:
lookalike_df

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0020,0.372517
1,C0002,C0030,0.346793
2,C0003,C0181,0.451287
3,C0004,C0070,0.308543
4,C0005,C0096,0.465014
5,C0006,C0040,0.441139
6,C0007,C0079,0.542171
7,C0008,C0091,0.277383
8,C0009,C0083,0.549309
9,C0010,C0094,0.490772


In [30]:
lookalike_df.to_csv

<bound method NDFrame.to_csv of    CustomerID LookalikeID  SimilarityScore
0       C0001       C0020         0.372517
1       C0002       C0030         0.346793
2       C0003       C0181         0.451287
3       C0004       C0070         0.308543
4       C0005       C0096         0.465014
5       C0006       C0040         0.441139
6       C0007       C0079         0.542171
7       C0008       C0091         0.277383
8       C0009       C0083         0.549309
9       C0010       C0094         0.490772
10      C0011       C0135         0.423595
11      C0012       C0164         0.398012
12      C0013       C0169         0.398822
13      C0014       C0128         0.811582
14      C0015       C0073         0.650129
15      C0016       C0187         0.543028
16      C0017       C0071         0.472096
17      C0018       C0115         0.318400
18      C0019       C0064         0.395094
19      C0020       C0026         0.563001>