# Lookalike Model

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [7]:
# Read the three raw tables, in the same order as before:
# customers, then transactions, then products.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/Customers.csv',
        'Data/Transactions.csv',
        'Data/Products.csv',
    )
)


In [8]:
# Preview the first rows of each input table; one print call, one table per block.
print(customers.head(), products.head(), transactions.head(), sep='\n')

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

## Load and Preprocess the Data

In [9]:
# Enrich every transaction with its customer attributes, then its product attributes.
# Both joins are left joins, so each transaction row is kept even without a match.
merged = pd.merge(
    pd.merge(transactions, customers, how='left', on='CustomerID'),
    products,
    how='left',
    on='ProductID',
)

In [10]:
# Collapse the transaction-level rows into one summary row per customer:
# Quantity and TotalValue are summed, Price_y is averaged.
customer_profile = (
    merged
    .groupby('CustomerID')
    .agg(
        Quantity=('Quantity', 'sum'),
        TotalValue=('TotalValue', 'sum'),
        Price_y=('Price_y', 'mean'),
        # Keep every purchased category as one comma-separated string.
        Category=('Category', lambda names: ','.join(names)),
    )
    .reset_index()
)

In [11]:
# Add region and signup year from Customers.csv.
# Both attributes are joined on CustomerID. Assigning a column taken straight from
# `customers` would align on the positional row index instead, and customer_profile
# only has rows for customers that appear in the transactions, so the two indexes
# are not guaranteed to describe the same customer.
customer_profile = customer_profile.merge(
    customers[['CustomerID', 'Region', 'SignupDate']], on='CustomerID', how='left'
)
# pop() removes the raw date column so only the derived year stays on the profile.
customer_profile['SignupYear'] = pd.to_datetime(customer_profile.pop('SignupDate')).dt.year

## Feature Engineering

In [12]:
# Region -> indicator columns named "Region_<value>".
region_encoded = pd.get_dummies(customer_profile['Region'], prefix='Region')

# The aggregated Category strings are comma-joined, so splitting on ',' yields
# one indicator column per distinct category.
category_encoded = customer_profile['Category'].str.get_dummies(sep=',')

# Final feature matrix: numeric aggregates first, then both indicator blocks.
features = pd.concat(
    [
        customer_profile.loc[:, ['Quantity', 'TotalValue', 'Price_y', 'SignupYear']],
        region_encoded,
        category_encoded,
    ],
    axis='columns',
)



### Normalize Features:

In [13]:
# Standardize every feature column; the fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

## Calculate Similarities

In [14]:
# Pairwise cosine similarity between all customer feature vectors.
similarity_matrix = cosine_similarity(features_scaled)

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_profile['CustomerID'],
    columns=customer_profile['CustomerID'],
)


## Get Top 3 Similar Customers for Each Customer:

In [15]:
def get_top_3_similar(customers_df, customer_id, top_n=3):
    """Return the highest-scoring other customers for one customer.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Similarity scores; the row labelled ``customer_id`` is read, and its
        column labels identify the candidate customers.
    customer_id : hashable
        Row label to look up; the entry with this same label is excluded.
    top_n : int, default 3
        Number of entries to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` scores in descending order, indexed by candidate label.
    """
    ranked = customers_df.loc[customer_id].sort_values(ascending=False)
    # Drop the self-match before truncating to the requested length.
    others = ranked.loc[ranked.index != customer_id]
    return others.iloc[:top_n]

In [16]:
# Build the lookalike map for the first 20 rows of customer_profile
# (these are C0001 to C0020 only if each of those customers has a profile row).
lookalike_map = {}
for customer_id in customer_profile['CustomerID'].head(20):
    top_3 = get_top_3_similar(similarity_df, customer_id)
    # Store each match as an (other customer id, similarity score) pair.
    lookalike_map[customer_id] = [
        (other_id, score) for other_id, score in zip(top_3.index, top_3.values)
    ]

In [17]:
# Tabulate the map: one row per customer, with the match list stored as its string form.
lookalike_df = pd.DataFrame(
    {
        'CustomerID': list(lookalike_map),
        'Lookalikes': list(map(str, lookalike_map.values())),
    }
)


In [18]:
# Write the lookalike table to disk without the positional row index.
lookalike_df.to_csv(path_or_buf='Bhavik_Nagre_Lookalike.csv', index=False)

## Validate and Test

In [19]:
# Preview the first five rows of the exported table.
print(lookalike_df.head(n=5))

  CustomerID                                         Lookalikes
0      C0001  [('C0152', 0.9920393918649758), ('C0174', 0.96...
1      C0002  [('C0159', 0.9865445042779168), ('C0134', 0.94...
2      C0003  [('C0031', 0.9576068305908076), ('C0195', 0.94...
3      C0004  [('C0148', 0.8708532445230792), ('C0113', 0.78...
4      C0005  [('C0007', 0.971691831475196), ('C0140', 0.945...
