In [1]:
# Dependencies for this cell: pandas parses the CSVs, the Colab helper exposes Drive.
import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the files below become readable.
drive.mount('/content/drive')

# Locations of the three input files (edit these to match your own Drive layout).
customers_path = '/content/drive/MyDrive/Customers.csv'
products_path = '/content/drive/MyDrive/Products.csv'
transactions_path = '/content/drive/MyDrive/Transactions.csv'

# Read all three CSVs; the order of the paths matches the names being assigned.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (customers_path, products_path, transactions_path)
)


Mounted at /content/drive


In [2]:
# Step 1: enrich every transaction row with the columns of its product.
# pd.merge performs an inner join by default, so transactions whose ProductID
# has no match in `products` are dropped.
# NOTE(review): `transactions` is rebound to the merged frame here, so running
# this cell a second time merges `products` onto already-merged data.
transactions = pd.merge(transactions, products, on='ProductID')

# Step 2: attach the customer attributes to obtain one wide table that later
# cells aggregate per customer.
full_data = pd.merge(transactions, customers, on='CustomerID')

In [3]:
# Summarise each customer's purchase history into one row.
# Named aggregation yields flat, already-labelled columns, so no separate
# column-renaming step is required.
features = (
    full_data
    .groupby('CustomerID')
    .agg(
        TotalSpent=('TotalValue', 'sum'),            # total transaction value
        AverageSpent=('TotalValue', 'mean'),         # average transaction value
        UniqueProducts=('ProductID', pd.Series.nunique),  # distinct products bought
        # Most frequently purchased category; when several categories tie,
        # the first entry returned by mode() is taken.
        FrequentCategory=('Category', lambda purchases: purchases.mode()[0]),
    )
    .reset_index()
)

# One-hot encode the categorical column so every feature is numeric-like.
features = pd.get_dummies(features, columns=['FrequentCategory'])

In [4]:
from sklearn.preprocessing import StandardScaler

# Only the continuous columns are standardised; the one-hot category columns
# produced earlier are left as they are.
numeric_columns = ['TotalSpent', 'AverageSpent', 'UniqueProducts']

# Fit the scaler on these columns and overwrite them with the scaled values.
scaler = StandardScaler()
features[numeric_columns] = scaler.fit_transform(features[numeric_columns])

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature matrix for the similarity computation: every column except the ID.
# The dtype is requested explicitly because the one-hot columns may be boolean
# (the pandas >= 2.0 default for get_dummies); mixing bool and float columns
# would otherwise produce an object-dtype array and rely on scikit-learn to
# coerce it implicitly.
feature_matrix = features.drop(columns='CustomerID').to_numpy(dtype=float)

# Pairwise cosine similarity between all customers (square matrix, one
# row/column per customer, in the row order of `features`).
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by ID.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=features['CustomerID'],
    columns=features['CustomerID'],
)

In [6]:
def get_top_lookalikes(customer_id, num_lookalikes=3):
    """Return the customers most similar to ``customer_id`` as a DataFrame.

    Reads the module-level ``similarity_df`` (square similarity table labelled
    by customer ID on both axes).

    Args:
        customer_id: Label of the customer to find lookalikes for; must be
            present in ``similarity_df``.
        num_lookalikes: How many of the highest-scoring customers to return.

    Returns:
        DataFrame with columns ``CustomerID`` and ``SimilarityScore``, ordered
        from most to least similar. The customer itself is never included.
    """
    # Take this customer's row, discard the self-comparison entry, and keep
    # the highest scores.
    neighbours = (
        similarity_df.loc[customer_id]
        .drop(customer_id)
        .nlargest(num_lookalikes)
    )

    result = pd.DataFrame({
        'CustomerID': neighbours.index,
        'SimilarityScore': neighbours.values
    })
    return result

# Demonstration: look up the three closest matches for customer 'C0001'
# and print the result.
top_lookalikes_c0001 = get_top_lookalikes(customer_id='C0001', num_lookalikes=3)
print(top_lookalikes_c0001)

  CustomerID  SimilarityScore
0      C0072         0.946213
1      C0190         0.941674
2      C0069         0.910715


In [10]:
def get_top_lookalikes(customer_id, num_lookalikes=3):
    """Return the closest matches for ``customer_id`` as a list of tuples.

    NOTE: this definition shadows the earlier ``get_top_lookalikes`` in this
    notebook, which returned a DataFrame; after this cell runs, the name
    refers to this list-returning version.

    Reads the module-level ``similarity_df`` (square similarity table labelled
    by customer ID on both axes).

    Args:
        customer_id: Label of the customer to find lookalikes for; must be
            present in ``similarity_df``.
        num_lookalikes: How many of the highest-scoring customers to return.

    Returns:
        List of ``(BaseCustomerID, LookalikeCustomerID, SimilarityScore)``
        tuples, ordered from most to least similar.
    """
    # This customer's scores against everyone else (self-comparison removed).
    candidates = similarity_df.loc[customer_id].drop(customer_id)
    best = candidates.nlargest(num_lookalikes)

    # Emit one tuple per match, tagged with the base customer.
    rows = []
    for lookalike_id, score in best.items():
        rows.append((customer_id, lookalike_id, score))
    return rows

# Accumulator for (base, lookalike, score) tuples across all base customers.
lookalike_data = []

# NOTE(review): `selected_customers` is not defined in any cell shown above
# (execution counts jump from 6 to 10) — confirm where it is set, otherwise
# this cell fails on a fresh kernel.
for customer in selected_customers:
    lookalike_data += get_top_lookalikes(customer)

# Turn the collected tuples into a table, one row per (base, lookalike) pair.
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['BaseCustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

# Persist the table (without the row index) to the working directory.
lookalike_df.to_csv('Lookalike.csv', index=False)