In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Step 1: Load Data
# Read the three source tables (customers, products, transactions) in one pass.
customers, products, transactions = map(
    pd.read_csv,
    (
        "/content/Customers.csv",
        "/content/Products.csv",
        "/content/Transactions.csv",
    ),
)


In [None]:
# Step 2: Merge Data
# Start from the transactions and left-join customer and product attributes,
# so every transaction row is kept even when a lookup has no match.
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
)


In [None]:
# Step 3: Feature Engineering
# Recency = whole days between each transaction and the newest transaction in
# the data set. Anchoring on the data itself (instead of the wall clock) makes
# the feature identical on every run of the notebook.
merged_data['TransactionDate'] = pd.to_datetime(merged_data['TransactionDate'])
reference_date = merged_data['TransactionDate'].max()
merged_data['Recency'] = (reference_date - merged_data['TransactionDate']).dt.days


In [None]:
# Aggregate transactional features: one row per customer
def most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    return values.value_counts().idxmax()


customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        Recency=('Recency', 'min'),                     # most recent transaction
        Frequency=('TransactionID', 'count'),           # number of transactions
        Monetary=('TotalValue', 'sum'),                 # total spending
        FavoriteCategory=('Category', most_frequent),   # favorite product category
        ProductDiversity=('ProductID', 'nunique'),      # number of distinct products bought
    )
    .reset_index()
)


In [None]:
# Attach the customer's region (demographic feature) to the aggregated features
region_lookup = customers[['CustomerID', 'Region']]
customer_features = customer_features.merge(region_lookup, on='CustomerID', how='left')


In [None]:
# One-hot encode the categorical features and densify the result
categorical_columns = ['FavoriteCategory', 'Region']
encoder = OneHotEncoder()
encoder.fit(customer_features[categorical_columns])
encoded_categories = encoder.transform(customer_features[categorical_columns]).toarray()


In [None]:
# Rescale the numerical features with a min-max scaler
numeric_columns = ['Recency', 'Frequency', 'Monetary', 'ProductDiversity']
scaler = MinMaxScaler()
scaler.fit(customer_features[numeric_columns])
scaled_features = scaler.transform(customer_features[numeric_columns])


In [None]:
# Place the scaled numeric columns and the one-hot columns side by side
final_features = np.concatenate([scaled_features, encoded_categories], axis=1)


In [None]:
# Step 4: Clustering
# Use KMeans to group similar customers into 5 clusters.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise change the clusters (and the pair labels
# derived from them) depending on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(final_features)


In [None]:
# Step 5: Dimensionality Reduction with PCA
# Aim for 10 components, but never ask for more than the data can provide:
# PCA rejects n_components larger than min(n_samples, n_features).
n_components = min(10, *final_features.shape)
pca = PCA(n_components=n_components)
reduced_features = pca.fit_transform(final_features)


In [None]:
# Step 6: Generate pairwise data for supervised learning
# Every unordered pair of customers (i < j) becomes one training example.
n_customers = len(customer_features)

# Positional row indices of all pairs; np.triu_indices(k=1) walks the strict
# upper triangle row by row, i.e. in the same order as
# itertools.combinations(range(n_customers), 2).
first_idx, second_idx = np.triu_indices(n_customers, k=1)
pairs = list(zip(first_idx.tolist(), second_idx.tolist()))

# Work on a plain array so lookups are positional and do not depend on the
# DataFrame's index labels.
cluster_labels = customer_features['Cluster'].to_numpy()

# Features: element-wise absolute difference of the two customers' PCA vectors
pairwise_data = np.abs(reduced_features[first_idx] - reduced_features[second_idx])

# Labels: 1 if both customers fall in the same cluster, else 0
pairwise_labels = (cluster_labels[first_idx] == cluster_labels[second_idx]).astype(int)

In [None]:
# Step 7: Train Random Forest Classifier
# Hold out 20% of the customer pairs; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    pairwise_data,
    pairwise_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the pair classifier on the training pairs
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Report mean accuracy on the held-out pairs so the quality of the similarity
# model is visible before it is used to rank lookalikes.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out pair accuracy: {test_accuracy:.4f}")


In [None]:
# Step 8: Generate Lookalike Recommendations
N_TARGET_CUSTOMERS = 20  # recommendations are produced for the first 20 customers
TOP_K = 3                # number of lookalikes kept per customer

customer_ids = customer_features['CustomerID'].to_numpy()

# Column of predict_proba that holds P(pair is similar); looked up from the
# fitted classes instead of assuming it is column 1.
positive_column = list(model.classes_).index(1)

lookalike_results = {}

for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Score this customer against every customer in one batched call
    # (one predict_proba per target instead of one per pair).
    diffs = np.abs(reduced_features - reduced_features[idx])
    scores = model.predict_proba(diffs)[:, positive_column]

    # Stable descending sort: ties keep their original customer order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the customer itself, then keep the best TOP_K candidates.
    ranked = ranked[customer_ids[ranked] != customer_id][:TOP_K]

    # Store plain Python floats so downstream formatting is NumPy-agnostic.
    lookalike_results[customer_id] = [
        (customer_ids[j], float(scores[j])) for j in ranked
    ]


**Cosine Similarity Matrix**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# `final_features` is expected to hold the scaled and encoded customer features.
# Cosine similarity between every pair of customer feature rows:
cosine_sim_matrix = cosine_similarity(final_features)

# Wrap the matrix in a DataFrame labelled by CustomerID on both axes for readability
customer_index = customer_features['CustomerID']
cosine_sim_df = pd.DataFrame(
    data=cosine_sim_matrix,
    index=customer_index,
    columns=customer_index,
)

# Preview the first rows of the similarity matrix
print(cosine_sim_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.140798  0.575716  0.619134  0.546139  0.583463   
C0002       0.140798  1.000000  0.549323  0.202675  0.536970  0.122220   
C0003       0.575716  0.549323  1.000000  0.566025  0.103078  0.575516   
C0004       0.619134  0.202675  0.566025  1.000000  0.151819  0.940268   
C0005       0.546139  0.536970  0.103078  0.151819  1.000000  0.106122   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.551047  0.297231  0.102469  0.141796  ...  0.590151  0.978354   
C0002       0.532674  0.561611  0.074302  0.105272  ...  0.136154  0.113224   
C0003       0.135356  0.557051  0.101167  0.121282  ...  0.561100  0.581254   
C0004       0.158042  0.440744  0.132328  0.200227  ...  0.968488  0.552478   
C0005  

In [None]:
# Step 9: Save Results to Lookalike.csv
import csv

# One record per customer: its ID and a list of (lookalike_id, score) tuples.
# Scores are cast to plain Python floats before rounding: the list is written
# as its string form, and NumPy >= 2 scalars would otherwise appear as
# "np.float64(0.93)" inside the CSV.
lookalike_data = [
    {
        'cust_id': cust_id,
        'lookalikes': [
            (other_id, round(float(score), 4)) for other_id, score in lookalikes
        ],
    }
    for cust_id, lookalikes in lookalike_results.items()
]

# newline='' lets the csv module control line endings itself
with open('Lookalike.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['cust_id', 'lookalikes'])
    writer.writerows([row['cust_id'], row['lookalikes']] for row in lookalike_data)