### Task 2: Lookalike Model 

#### Step 1 : Load and Merge Data

In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

###### Pandas and NumPy: Used for data manipulation and numerical operations.
###### MinMaxScaler: Normalizes data to a specific range, ensuring consistent feature scales.
###### Cosine Similarity: Measures similarity between vectors, useful for recommendation systems.

In [45]:
#Load Datasets
customers=pd.read_csv('Customers.csv')
transactions=pd.read_csv('Transactions.csv')
products=pd.read_csv('Products.csv')

###### Combines customer, transaction, and product data into a unified dataset for analysis.


In [46]:
#Merge datasets
data=transactions.merge(customers,on='CustomerID').merge(products,on='ProductID')

In [47]:
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [48]:
# Aggregate data for customer profiles
customer_profiles = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Region': 'first',
    'Category': lambda x: x.mode()[0]
}).reset_index()

###### Group by Customer:
###### Group all the data by CustomerID so you can summarize each customer's information.

###### Summarize Columns:
###### TotalValue: 'sum': Add up all the purchases to find how much money the customer spent.
###### Quantity: 'sum': Add up all the items the customer bought.
###### Region: 'first': Use the first region for each customer (assuming one region per customer).
###### Category: lambda x: x.mode()[0]: Find the product category the customer bought most often.
###### Reset Index:
###### After grouping, make sure the result looks like a normal table by resetting the index.

In [49]:
customer_profiles

Unnamed: 0,CustomerID,TotalValue,Quantity,Region,Category
0,C0001,3354.52,12,South America,Electronics
1,C0002,1862.74,10,Asia,Clothing
2,C0003,2725.38,14,South America,Home Decor
3,C0004,5354.88,23,South America,Books
4,C0005,2034.24,7,Asia,Electronics
...,...,...,...,...,...
194,C0196,4982.88,12,Europe,Home Decor
195,C0197,1928.65,9,Europe,Electronics
196,C0198,931.83,3,Europe,Clothing
197,C0199,1979.28,9,Europe,Electronics


In [50]:
#Here we having two Categorical features we need to convert them into numerical features

### Encode categorical Feautures

In [51]:
# Encode categorical features
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region', 'Category'], drop_first=True)

In [52]:
customer_profiles.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,12,False,False,True,False,True,False
1,C0002,1862.74,10,False,False,False,True,False,False
2,C0003,2725.38,14,False,False,True,False,False,True
3,C0004,5354.88,23,False,False,True,False,False,False
4,C0005,2034.24,7,False,False,False,False,True,False


#### Step 2 : Compute Similarities

In [53]:
# Normalize numerical features
scaler = MinMaxScaler()
numerical_features = ['TotalValue', 'Quantity']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])


##### Scaling Features : Scaling ensures features are on a similar scale, which is crucial for cosine similarity computations.
##### MinMaxScaler: Scales values to a range of 0–1, maintaining relative distances while avoiding dominance of features with larger scales.

In [54]:
# Compute similarity
similarity_matrix = cosine_similarity(customer_profiles.drop(columns=['CustomerID']))
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])

##### Cosine Similarity: Measures how similar two vectors (customers) are, based on their interaction patterns.
Similarity Matrix: A square matrix where each cell 
(
𝑖
,
𝑗
)
(i,j) represents the similarity between customer 
𝑖
i and 
𝑗
j.

In [55]:
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,0.098565,0.549797,0.712620,0.729625,0.739684,0.735388,0.143225,0.022072,0.079445,...,0.740941,0.993223,0.315662,0.152349,0.561693,0.117923,0.530714,0.022557,0.531184,0.172923
C0002,0.098565,1.000000,0.103737,0.207545,0.079841,0.141507,0.094806,0.145629,0.689873,0.730012,...,0.122131,0.063764,0.312713,0.150995,0.129948,0.112051,0.068236,0.690158,0.068734,0.963348
C0003,0.549797,0.103737,1.000000,0.718135,0.082111,0.736697,0.097560,0.569494,0.021799,0.085402,...,0.740904,0.528297,0.321395,0.155183,0.995333,0.552443,0.070033,0.022189,0.070555,0.175065
C0004,0.712620,0.207545,0.718135,1.000000,0.167239,0.966528,0.199424,0.300042,0.044796,0.169385,...,0.950230,0.655903,0.651958,0.314735,0.756138,0.238163,0.140844,0.045675,0.142028,0.355976
C0005,0.729625,0.079841,0.082111,0.167239,1.000000,0.120277,0.998434,0.116275,0.018162,0.064000,...,0.100652,0.721249,0.257833,0.124426,0.105073,0.097225,0.721902,0.018578,0.722199,0.141441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,0.117923,0.112051,0.552443,0.238163,0.097225,0.177448,0.117592,0.575350,0.488047,0.533849,...,0.145544,0.075615,0.372943,0.179908,0.571459,1.000000,0.529184,0.488695,0.529969,0.205602
C0197,0.530714,0.068236,0.070033,0.140844,0.721902,0.097570,0.721698,0.098562,0.501712,0.526791,...,0.083432,0.521616,0.213654,0.103146,0.088274,0.529184,1.000000,0.501930,0.999995,0.116592
C0198,0.022557,0.690158,0.022189,0.045675,0.018578,0.033849,0.022445,0.031584,0.999997,0.978965,...,0.027847,0.014472,0.071352,0.034422,0.028754,0.488695,0.501930,1.000000,0.501914,0.629253
C0199,0.531184,0.068734,0.070555,0.142028,0.722199,0.098672,0.722147,0.099343,0.501689,0.526927,...,0.084235,0.521835,0.215715,0.104138,0.089033,0.529969,0.999995,0.501914,1.000000,0.117764


#### Step 3 : Get Top 3 Lookalikes

In [56]:
# Get top 3 similar customers for each target customer
lookalike_results = {}

for customer in customer_profiles['CustomerID'][:20]:  # For C0001 - C0020
    similar_customers = similarity_df[customer].sort_values(ascending=False)[1:4]
    lookalike_results[customer] = [(similar_customer, score) for similar_customer, score in zip(similar_customers.index, similar_customers.values)]

# Save results to CSV
lookalike_df = pd.DataFrame({
    'cust_id': lookalike_results.keys(),
    'List<cust_id,score>': [str(v) for v in lookalike_results.values()]
})
lookalike_df.to_csv('Lookalike.csv', index=False)


##### The top 3 most similar customers for each of the first 20 customers in the dataset based on a similarity matrix. For each target customer, it retrieves the top 3 customers with the highest similarity scores (excluding themselves), stores these results in a dictionary, and converts the dictionary into a DataFrame. Finally, it saves the DataFrame as a CSV file named Lookalike.csv, where each row contains a customer ID and their top similar customers with corresponding similarity scores.

In [58]:
# It stores the data in the Lookalike.csv