In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import datetime as dt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the three source tables (customers, products, transactions) from CSV
Customer_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
Transaction_df = pd.read_csv("Transactions.csv")

# Preview the first rows of each table, one per line group, divided by a row of asterisks
print(
    Customer_df.head(),
    "*****************",
    products_df.head(),
    "*****************",
    Transaction_df.head(),
    sep="\n",
)


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
*****************
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
*****************
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-

In [5]:
# Attach customer attributes, then product attributes, to every transaction row.
# Both joins use pandas' default join type on the shared ID column.
Data = pd.merge(
    pd.merge(Transaction_df, Customer_df, on='CustomerID'),
    products_df,
    on='ProductID',
)

In [7]:
# Feature engineering: one row per customer, one column per product category,
# each cell holding the summed TotalValue (0 where the customer bought nothing in that category)
pivot_data = pd.pivot_table(
    Data,
    values='TotalValue',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0,
)


In [14]:
# Standardise every category column, keeping the customer index and category labels
scaler = StandardScaler().fit(pivot_data)
pivot_data_scaled = pd.DataFrame(
    data=scaler.transform(pivot_data),
    index=pivot_data.index,
    columns=pivot_data.columns,
)
pivot_data_scaled

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,-0.842360,-0.899788,1.995322,-0.435631
C0002,-0.955802,0.205200,-0.944670,0.099592
C0003,-0.955802,-0.767938,0.495742,0.579208
C0004,0.913582,-0.899788,0.465108,1.704505
C0005,-0.955802,-0.899788,0.282758,0.120489
...,...,...,...,...
C0196,0.341615,0.808522,-0.944670,1.674496
C0197,-0.955802,-0.899788,0.006718,0.321982
C0198,-0.955802,0.075226,-0.916604,-0.955679
C0199,-0.955802,-0.899788,-0.326599,0.789789


In [31]:
# Pairwise cosine similarity between the scaled customer feature rows
cosine_sim = cosine_similarity(pivot_data_scaled)

# Spot-check: similarity between the first and second customer in the matrix
print(cosine_sim[0, 1])

# Label both axes with customer IDs so rows/columns can be looked up by ID
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=pivot_data.index,
    columns=pivot_data.index,
)

-0.40221486386819344


In [18]:
# Preview the first five rows of the customer-by-customer similarity matrix
cosine_sim_df.head(n=5)

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,-0.402215,0.64835,0.043313,0.661203,-0.960708,0.637812,-0.268011,0.171019,-0.381244,...,-0.059019,0.830892,-0.38249,0.576188,-0.030011,-0.719949,0.461473,-0.173126,0.165667,-0.756913
C0002,-0.402215,1.0,0.175482,-0.446094,0.257825,0.235584,0.166689,0.470266,0.588281,0.70398,...,-0.527737,-0.050379,-0.374197,0.036706,0.426021,0.311911,0.409603,0.762741,0.522735,0.116516
C0003,0.64835,0.175482,1.0,0.328565,0.932178,-0.73467,0.996881,0.202597,0.198752,-0.3721,...,-0.448319,0.462407,-0.605579,0.316061,0.610345,-0.14611,0.919496,-0.06446,0.840167,-0.867465
C0004,0.043313,-0.446094,0.328565,1.0,0.092857,-0.005891,0.347577,0.112209,-0.725347,-0.913367,...,0.068221,-0.451726,0.104261,-0.37971,0.422922,0.433344,0.165504,-0.839495,0.330655,-0.335804
C0005,0.661203,0.257825,0.932178,0.092857,1.0,-0.814067,0.945695,-0.065768,0.508871,-0.144782,...,-0.240979,0.657526,-0.431373,0.22012,0.34906,-0.392847,0.967956,0.21394,0.817298,-0.918261


In [24]:
# Function to get the top-N lookalikes (3 by default) for a given customer
def get_lookalikes(customer_id, top_n=3, sim_df=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : label
        Column label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    sim_df : pandas.DataFrame, optional
        Similarity matrix whose rows and columns are labelled by customer.
        When omitted, the notebook-level ``cosine_sim_df`` is used, so
        existing calls ``get_lookalikes(customer_id)`` keep working.

    Returns
    -------
    tuple of (list, list)
        The lookalike labels and their similarity scores, both ordered from
        most to least similar. The customer itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity matrix.
    """
    if sim_df is None:
        sim_df = cosine_sim_df

    if customer_id not in sim_df.columns:
        raise KeyError(
            f"Customer {customer_id!r} is not present in the similarity matrix"
        )

    # Remove the customer's own entry before ranking so it cannot be returned.
    other_scores = sim_df[customer_id].drop(customer_id)

    # A stable sort keeps the ordering of tied scores reproducible between runs.
    ranked = other_scores.sort_values(ascending=False, kind="mergesort")
    similar_customers = ranked.head(top_n)
    return similar_customers.index.tolist(), similar_customers.values.tolist()

# Generate Lookalike recommendations for customers C0001 to C0020
lookalikes = {}
for customer_id in Customer_df['CustomerID'][:20]:  # First 20 customers
    # The similarity matrix only contains customers that survived the merge and
    # pivot steps, so a customer with no transactions has no column to look up.
    if customer_id not in cosine_sim_df.index:
        print(f"Skipping {customer_id}: not present in the similarity matrix.")
        continue
    lookalikes[customer_id] = get_lookalikes(customer_id)

# Store the results in a DataFrame for output.
# All rows are collected first and the frame is built once, instead of
# appending row by row through the private DataFrame._append method.
lookalike_df = pd.DataFrame(
    [
        {
            'CustomerID': customer_id,
            'Lookalike_CustomerIDs': ', '.join(lookalike_ids),
            'Similarity_Scores': ', '.join(map(str, scores)),
        }
        for customer_id, (lookalike_ids, scores) in lookalikes.items()
    ],
    # Explicit columns keep the CSV header stable even if no rows were produced.
    columns=['CustomerID', 'Lookalike_CustomerIDs', 'Similarity_Scores'],
)

# Save the lookalike results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model generated successfully! Check 'Lookalike.csv' for results.")


Lookalike model generated successfully! Check 'Lookalike.csv' for results.
