In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime


In [3]:
# Read the three raw tables from the current working directory
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Parse the date columns into datetime64 so they support date arithmetic
customers = customers.assign(SignupDate=pd.to_datetime(customers['SignupDate']))
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions['TransactionDate'])
)

# Left-join product attributes, then customer attributes, onto every transaction.
# 'Price_y' is dropped afterwards -- presumably the Products-side duplicate of a
# 'Price' column present in both tables (pandas suffixes clashes with _x/_y).
merged = pd.merge(transactions, products, how='left', on='ProductID')
final_merged_df = pd.merge(merged, customers, how='left', on='CustomerID').drop(
    columns=['Price_y']
)


In [5]:

# Work on a copy of the merged transaction table so this cell never writes
# through to `final_merged_df` and can be re-run safely.
df = final_merged_df.copy()

# Ensure both date columns are datetime64 (a no-op when they already are)
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['SignupDate'] = pd.to_datetime(df['SignupDate'])

# Feature Engineering: build one row of behavioural features per customer
# 1. Total Transactions (count of non-null TransactionID values per customer)
total_transactions = df.groupby('CustomerID')['TransactionID'].count().reset_index()
total_transactions.columns = ['CustomerID', 'TotalTransactions']

# 2. Total Spending (sum of TotalValue per customer)
total_spending = df.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spending.columns = ['CustomerID', 'TotalSpending']

# 3. Recency (whole days between a customer's last purchase and the reference date)
last_purchase = df.groupby('CustomerID')['TransactionDate'].max().reset_index()
last_purchase.columns = ['CustomerID', 'LastPurchaseDate']
# Anchor recency to the most recent transaction in the data rather than the wall
# clock: with datetime.now() the truncated day counts depend on the time of day the
# notebook is run, so the scaled features (and the exported lookalikes) would not be
# reproducible between runs.
reference_date = df['TransactionDate'].max()
last_purchase['Recency'] = (reference_date - last_purchase['LastPurchaseDate']).dt.days

# 4. Average Purchase Value (mean of TotalValue per customer)
average_purchase_value = df.groupby('CustomerID')['TotalValue'].mean().reset_index()
average_purchase_value.columns = ['CustomerID', 'AveragePurchaseValue']

# Merge all customer features into one DataFrame (inner joins on CustomerID).
# Resulting column order: CustomerID, TotalTransactions, TotalSpending, Recency,
# AveragePurchaseValue.
customer_features = pd.merge(total_transactions, total_spending, on='CustomerID')
customer_features = pd.merge(customer_features, last_purchase[['CustomerID', 'Recency']], on='CustomerID')
customer_features = pd.merge(customer_features, average_purchase_value, on='CustomerID')

# Create customer-product matrix: rows are customers, columns are products, cells are
# the summed Quantity (0 where the customer never bought the product)
customer_product_matrix = df.pivot_table(index='CustomerID', columns='ProductID', values='Quantity', aggfunc='sum', fill_value=0)

# Standardize each column of both tables to zero mean / unit variance.
# The same StandardScaler object is refit for the second table, so after this cell
# `scaler` holds the statistics of the customer-level features only.
scaler = StandardScaler()
customer_product_matrix_scaled = scaler.fit_transform(customer_product_matrix)
customer_features_scaled = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Convert scaled arrays back to DataFrames indexed by CustomerID
customer_product_matrix_scaled_df = pd.DataFrame(
    customer_product_matrix_scaled,
    index=customer_product_matrix.index,
    columns=[f"Product_{i}" for i in customer_product_matrix.columns]
)

# Feature_0..Feature_3 follow the column order of `customer_features` without
# CustomerID: TotalTransactions, TotalSpending, Recency, AveragePurchaseValue.
customer_features_scaled_df = pd.DataFrame(
    customer_features_scaled,
    index=customer_features['CustomerID'],
    columns=[f"Feature_{i}" for i in range(customer_features_scaled.shape[1])]
)

# Combine customer-product matrix and customer features (left join on the shared
# CustomerID index)
combined_features = customer_product_matrix_scaled_df.join(customer_features_scaled_df)

# Calculate pairwise cosine similarity between the customers' combined feature rows
similarity_matrix = cosine_similarity(combined_features)

# Convert similarity matrix to a DataFrame labelled by CustomerID on both axes
similarity_df = pd.DataFrame(similarity_matrix, index=combined_features.index, columns=combined_features.index)

# Function to get top 3 similar customers
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the query customer; must be a column of ``similarity_df``.
    similarity_df : pandas.DataFrame
        Customer-by-customer similarity matrix whose index and columns carry
        the customer labels.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by customer label, sorted from most to
        least similar and never containing ``customer_id`` itself. Holds
        fewer than ``top_n`` entries when not enough other customers exist.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    scores = similarity_df[customer_id]
    # Exclude the query customer by label instead of skipping the first sorted
    # position: when another customer ties with (or, through floating-point
    # error, exceeds) the self-similarity, position 0 is not guaranteed to be
    # the query customer, and a positional skip would return the customer as
    # their own lookalike while discarding a genuine match.
    others = scores.drop(labels=customer_id, errors='ignore')
    # A stable sort keeps tied scores in their original row order, so the
    # result is deterministic.
    return others.sort_values(ascending=False, kind='mergesort').head(top_n)

# Limit the export to the first 20 customer IDs, in feature-matrix row order
first_20_customers = combined_features.head(20).index

# Map each of those customers to a list of (lookalike ID, similarity score) pairs
lookalike_map = {}
for customer_id in first_20_customers:
    similar_customers = get_top_similar_customers(customer_id, similarity_df)
    lookalike_map[customer_id] = [
        (other_id, score)
        for other_id, score in zip(similar_customers.index, similar_customers.values)
    ]

# One row per customer; the 'Lookalikes' cell holds that customer's whole list
lookalike_records = list(lookalike_map.items())
lookalike_df = pd.DataFrame(lookalike_records, columns=['CustomerID', 'Lookalikes'])

# Write the table to disk without the positional index
lookalike_df.to_csv('Pratyush_Lal_Lookalike.csv', index=False)

# Show the first five rows as plain text
print(lookalike_df.head(5))

  CustomerID                                         Lookalikes
0      C0001  [(C0194, 0.40256658199067696), (C0104, 0.36422...
1      C0002  [(C0030, 0.4138229466127133), (C0091, 0.377435...
2      C0003  [(C0134, 0.46363017428728054), (C0181, 0.46192...
3      C0004  [(C0070, 0.3411582139711988), (C0175, 0.331691...
4      C0005  [(C0096, 0.4568793686981275), (C0023, 0.448645...
