In [None]:
# Install necessary libraries
!pip install pandas matplotlib seaborn scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score



In [None]:
# Source locations of the three CSV files (Google Drive links)
customers_url = "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
products_url = "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
transactions_url = "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

# Read each CSV into its own DataFrame
customers = pd.read_csv(customers_url)
products = pd.read_csv(products_url)
transactions = pd.read_csv(transactions_url)

# Preview the first rows of every table
for frame in (customers, products, transactions):
    print(frame.head())

# Report the number of missing values per column for every table
for header, frame in (
    ("Missing values in Customers:\n", customers),
    ("Missing values in Products:\n", products),
    ("Missing values in Transactions:\n", transactions),
):
    print(header, frame.isna().sum())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [4]:
## Task 2: Lookalike Model
# Customer x product matrix: each cell is the summed TotalValue for that
# customer/product pair, with 0 where the pair has no rows in merged_data.
pivot_table = pd.pivot_table(
    merged_data,
    values='TotalValue',
    index='CustomerID',
    columns='ProductName',
    aggfunc='sum',
    fill_value=0,
)

# Pairwise cosine similarity between the customer rows of the matrix,
# wrapped in a DataFrame labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(pivot_table)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=pivot_table.index,
    columns=pivot_table.index,
)

# Function to get top n similar customers
def get_top_lookalikes(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up; must be a column of the
        similarity table.
    n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield an
        empty list.
    similarity : pandas.DataFrame, optional
        Square similarity table labelled by customer on both axes.
        Defaults to the notebook-level ``similarity_df``.

    Returns
    -------
    list of (label, score) tuples
        Other customers ordered by descending similarity score. The
        queried customer is never part of the result.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity table.
    """
    if similarity is None:
        similarity = similarity_df
    if customer_id not in similarity.columns:
        raise KeyError(f"{customer_id!r} is not present in the similarity matrix")
    if n <= 0:
        return []

    scores = similarity[customer_id]
    # Remove the customer by label instead of assuming it sorts first:
    # another customer can tie at the top score, and a customer whose own
    # similarity is 0 would not be ranked first at all.
    others = scores[scores.index != customer_id]
    # A stable sort keeps tied scores in their original index order, so
    # the result is deterministic.
    top = others.sort_values(ascending=False, kind='mergesort').head(n)
    return list(top.items())

# Generate lookalike recommendations for the first 20 customers listed in
# the customers table.
lookalike_recommendations = {}
for customer in customers['CustomerID'][:20]:
    if customer in similarity_df.index:
        lookalike_recommendations[customer] = get_top_lookalikes(customer)
    else:
        # similarity_df only contains customers that appear in the pivot
        # table; looking up anyone else would raise KeyError and abort the
        # whole export. Keep the customer with no recommendations instead.
        lookalike_recommendations[customer] = []

# Convert to DataFrame: one row per customer with LookalikeK / ScoreK columns
# (customers without recommendations get empty cells).
lookalike_data = []
for customer_id, recommendations in lookalike_recommendations.items():
    row = {'CustomerID': customer_id}
    for rank, (lookalike_id, score) in enumerate(recommendations, start=1):
        row[f'Lookalike{rank}'] = lookalike_id
        row[f'Score{rank}'] = score
    lookalike_data.append(row)

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations saved.")


Lookalike recommendations saved.
