In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the three raw input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [5]:
# Merge datasets
# Attach product attributes (including Category) to each transaction row.
# The left join keeps transactions whose ProductID has no match in `products`;
# those rows end up with a missing Category.
# The guard keeps the cell safe to re-run: merging a second time would create
# suffixed Category_x / Category_y columns and break the aggregation below.
if 'Category' not in transactions.columns:
    transactions = transactions.merge(products, on='ProductID', how='left')


def _join_categories(categories):
    """Return one customer's purchased categories as a space-separated string.

    Missing categories are skipped so the join cannot fail on NaN, and spaces
    inside a category name are replaced with underscores so that a later
    split on ' ' treats a multi-word category as a single token.
    """
    return ' '.join(str(cat).replace(' ', '_') for cat in categories.dropna())


customer_transactions = transactions.groupby('CustomerID').agg({
    'Category': _join_categories,      # Combine product categories for each customer
    'TotalValue': 'sum',               # Total spending by customer
    'TransactionID': 'count'           # Number of transactions
}).reset_index()

In [6]:
# Join every customer profile with its aggregated transaction summary.
# A left join keeps customers that have no transactions at all; their summary
# columns are filled with an empty category string and zero totals.
customer_data = (
    customers
    .merge(customer_transactions, how='left', on='CustomerID')
    .fillna(value={'Category': '', 'TotalValue': 0, 'TransactionID': 0})
)

In [7]:
# Encode product categories and region
# Category holds a space-separated list of purchased categories per customer;
# str.get_dummies turns it into one 0/1 indicator column per distinct token.
category_encoded = customer_data['Category'].str.get_dummies(sep=' ')
# Request integer indicators explicitly: newer pandas versions return boolean
# dummies by default, and mixing bool with numeric columns makes the combined
# frame fall back to object dtype when it is converted to an array.
region_encoded = pd.get_dummies(customer_data['Region'], prefix="region", dtype=int)

# Combine all features
# Column-wise concat; all three pieces share customer_data's row index.
features = pd.concat([
    customer_data[['TotalValue', 'TransactionID']],
    category_encoded,
    region_encoded
], axis=1)

In [8]:
# Standardise the two numeric columns so they are on a comparable scale
# to the 0/1 indicator columns. The scaled values overwrite the originals
# in `features`.
scaler = StandardScaler().fit(features[['TotalValue', 'TransactionID']])
features[['TotalValue', 'TransactionID']] = scaler.transform(
    features[['TotalValue', 'TransactionID']]
)

In [9]:
# Pairwise cosine similarity between all customer feature rows.
similarity_matrix = cosine_similarity(features)

# Label both axes with CustomerID so a column can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)


In [10]:
# Generate Lookalike Recommendations
# Maps each of the first 20 customers to a list of (other_customer_id, score)
# pairs for its 3 most similar customers, best match first.
lookalike_results = {}
for customer_id in customer_data['CustomerID'].head(20):  # Limit to the first 20 customers
    # Remove the customer's own entry by label rather than assuming it sorts
    # first: another customer can tie at similarity 1.0, in which case a
    # positional [1:4] slice may keep the customer itself and drop a real match.
    similar_customers = similarity_df[customer_id].drop(customer_id).nlargest(3)
    # Convert scores to built-in floats so the stringified list written to CSV
    # later does not depend on how NumPy scalars are rendered.
    lookalike_results[customer_id] = [
        (sim_cust, round(float(score), 4))
        for sim_cust, score in similar_customers.items()
    ]

In [11]:
# Save results to CSV
# One row per customer; the list of (lookalike_id, score) tuples is stored as
# its string representation in the Lookalikes column.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [str(value) for value in lookalike_results.values()]
})
csv_file_path = 'Divyanshu_lookalike.csv'
lookalike_df.to_csv(csv_file_path, index=False)

# Report the path that was actually written, so the message cannot drift from
# the real file name.
print(f"{csv_file_path} has been generated successfully.")

Divyanshu.csv has been generated successfully.


In [12]:
# Plain-text preview of the first five rows of the saved lookalike table.
print(lookalike_df.head(n=5))

  CustomerID                                         Lookalikes
0      C0001  [('C0152', 1.0), ('C0174', 0.9938), ('C0085', ...
1      C0002  [('C0159', 0.9799), ('C0134', 0.9595), ('C0043...
2      C0003  [('C0031', 0.9871), ('C0129', 0.981), ('C0158'...
3      C0004  [('C0012', 0.9889), ('C0102', 0.9403), ('C0113...
4      C0005  [('C0007', 0.9922), ('C0140', 0.987), ('C0177'...


In [13]:
# Print the whole lookalike table (one row per customer processed above) as
# plain text. Long Lookalikes strings are shortened with "..." in this view;
# the CSV written earlier holds the full values.
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [('C0152', 1.0), ('C0174', 0.9938), ('C0085', ...
1       C0002  [('C0159', 0.9799), ('C0134', 0.9595), ('C0043...
2       C0003  [('C0031', 0.9871), ('C0129', 0.981), ('C0158'...
3       C0004  [('C0012', 0.9889), ('C0102', 0.9403), ('C0113...
4       C0005  [('C0007', 0.9922), ('C0140', 0.987), ('C0177'...
5       C0006  [('C0187', 0.9771), ('C0048', 0.8925), ('C0076...
6       C0007  [('C0005', 0.9922), ('C0140', 0.9786), ('C0177...
7       C0008  [('C0109', 0.9831), ('C0098', 0.9538), ('C0194...
8       C0009  [('C0198', 0.9866), ('C0132', 0.785), ('C0074'...
9       C0010  [('C0132', 0.9774), ('C0061', 0.9699), ('C0074...
10      C0011  [('C0107', 0.9996), ('C0126', 0.9542), ('C0192...
11      C0012  [('C0004', 0.9889), ('C0148', 0.9368), ('C0102...
12      C0013  [('C0087', 0.995), ('C0155', 0.9931), ('C0099'...
13      C0014  [('C0060', 0.9994), ('C0089', 0.9205), ('C0097...
14      C0015  [('C0131',