In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [2]:
# Attach Google Drive to the Colab runtime; the CSV inputs are read from, and
# the results written to, paths underneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [3]:
# Read the three source tables (customers, products, transactions) from the
# mounted Drive folder, in that order.
customers, products, transactions = map(
    pd.read_csv,
    (
        "/content/drive/MyDrive/Customers.csv",
        "/content/drive/MyDrive/Products.csv",
        "/content/drive/MyDrive/Transactions.csv",
    ),
)

In [6]:
# Attach product details, then customer details, to every transaction.
# Both joins are left joins, so each transaction row is kept even when the
# lookup table has no matching key.
data = (
    transactions
    .merge(products, how='left', on='ProductID')
    .merge(customers, how='left', on='CustomerID')
)

# Debugging aid: list the merged columns so the suffixed Price columns show up
print(data.columns)

# 'Price' exists in both transactions and products, so the first merge
# suffixes them: Price_x (left side, transactions) and Price_y (right side,
# products). Keep the products value under the plain name 'Price' and discard
# the transactions copy if it is there.
data = (
    data
    .rename(columns={'Price_y': 'Price'})
    .drop(columns=['Price_x'], errors='ignore')
)

# One row per customer: how each source column is summarised
per_customer_aggregations = {
    'Quantity': 'sum',               # total units bought
    'TotalValue': 'sum',             # total amount spent
    'Price': 'mean',                 # mean price over the customer's transactions
    'ProductID': pd.Series.nunique,  # count of distinct products
    'Category': pd.Series.nunique,   # count of distinct categories
    'Region': 'first',               # customer's region (taken from the first row)
}
customer_features = (
    data
    .groupby('CustomerID')
    .agg(per_customer_aggregations)
    .reset_index()
)

# Preview the aggregated table
print(customer_features.head())


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')
  CustomerID  Quantity  TotalValue       Price  ProductID  Category  \
0      C0001        12     3354.52  278.334000          5         3   
1      C0002        10     1862.74  208.920000          4         2   
2      C0003        14     2725.38  195.707500          4         3   
3      C0004        23     5354.88  240.636250          8         3   
4      C0005         7     2034.24  291.603333          3         2   

          Region  
0  South America  
1           Asia  
2  South America  
3  South America  
4           Asia  


In [7]:
# One-hot encode Region.
# - The guard makes this cell safe to re-run: once 'Region' has been replaced
#   by its dummy columns, a second get_dummies call would raise a KeyError.
# - Every region keeps its own indicator column (no drop_first). Dropping the
#   first level encodes that region as all zeros, so two customers from it
#   would share no region signal in the cosine similarity computed later,
#   while customers from any other region would.
# - dtype=float keeps the indicators numeric instead of bool, so the feature
#   matrix has a single numeric dtype.
if 'Region' in customer_features.columns:
    customer_features = pd.get_dummies(customer_features, columns=['Region'], dtype=float)

# Standardise the numeric aggregates (zero mean, unit variance) so that
# large-valued columns such as TotalValue do not dominate the similarity.
# The scaled values go into a copy; customer_features keeps the raw values.
scaler = StandardScaler()
numerical_features = ['Quantity', 'TotalValue', 'Price', 'ProductID', 'Category']
customer_features_scaled = customer_features.copy()
customer_features_scaled[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

In [10]:
# Compute the customer-by-customer cosine similarity matrix.
# Features are every column except the identifier, selected by name rather
# than by position so a change in column layout cannot silently shift which
# columns are used.
customer_ids = customer_features_scaled['CustomerID']
feature_matrix = customer_features_scaled.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Recommend top 3 lookalikes for each of the first 20 customers (C0001-C0020)
lookalikes = {}
for customer_id in customer_ids.iloc[:20]:
    # Remove the customer's own entry by label before ranking. Slicing [1:4]
    # after a sort assumes "self" always sorts first, which fails when another
    # customer ties at the top score or when round-off puts the self-score
    # slightly below a neighbour's.
    other_scores = similarity_df[customer_id].drop(index=customer_id)
    top_matches = other_scores.nlargest(3)
    lookalikes[customer_id] = [
        (other_id, round(score, 4)) for other_id, score in top_matches.items()
    ]

# Flatten to one row per (customer, lookalike) pair for the CSV export
lookalike_output = [
    {'CustomerID': customer_id, 'LookalikeID': rec_customer_id, 'SimilarityScore': score}
    for customer_id, recommendations in lookalikes.items()
    for rec_customer_id, score in recommendations
]

lookalike_df = pd.DataFrame(lookalike_output)

# Specify the file path in Google Drive
output_path = '/content/drive/My Drive/Lookalike.csv'

# Save the Lookalike.csv file to the specified path
lookalike_df.to_csv(output_path, index=False)

print(f"Lookalike.csv has been saved to {output_path}")

print("Lookalike.csv has been created with the top 3 lookalikes for each of the first 20 customers.")


Lookalike.csv has been saved to /content/drive/My Drive/Lookalike.csv
Lookalike.csv has been created with the top 3 lookalikes for each of the first 20 customers.
