In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

**Load Data**

In [3]:
# Directory that holds the three input CSVs. Kept in one place so the notebook
# can be pointed at another location without editing every path; the default
# is the directory the notebook originally read from.
DATA_DIR = '/content'

# Load the three input tables; they are joined on CustomerID / ProductID below.
customers_df = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products_df = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions_df = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

**Merge Data**

In [4]:
# Attach customer and product attributes to every transaction.
# - how='left' keeps every transaction even if its customer/product record is
#   missing from the lookup table.
# - validate='many_to_one' makes pandas raise MergeError when a CustomerID or
#   ProductID appears more than once in its lookup table; without it such
#   duplicates would silently duplicate transaction rows and inflate the
#   per-customer totals aggregated from merged_df.
merged_df = (
    transactions_df
    .merge(customers_df, on='CustomerID', how='left', validate='many_to_one')
    .merge(products_df, on='ProductID', how='left', validate='many_to_one')
)

**Feature Engineering, Clustering, and Lookalike Scoring: Customer transaction behavior**

In [6]:
# Build per-customer behavioural features, segment customers with KMeans, score
# every customer pair, and write the top lookalikes for the first customers to
# 'Lookalike.csv'.

# Columns of customer_features that feed scaling, clustering and similarity.
FEATURE_COLUMNS = ['ProductID', 'TotalValue', 'Quantity', 'Region', 'TransactionDate']
MAX_CLUSTERS = 5          # upper bound for KMeans; capped by the number of customers
N_TARGET_CUSTOMERS = 20   # lookalikes are produced for the first N customers
N_LOOKALIKES = 3          # lookalikes reported per customer
COSINE_WEIGHT = 0.7       # weight of cosine similarity in the combined score
DISTANCE_WEIGHT = 0.3     # weight of the rescaled, inverted Euclidean distance

# Fixed output schema: Lookalike1, Score1, Lookalike2, Score2, ...
LOOKALIKE_COLUMNS = [
    f'{prefix}{rank}'
    for rank in range(1, N_LOOKALIKES + 1)
    for prefix in ('Lookalike', 'Score')
]


def euclidean_distance(v1, v2):
    """Return the Euclidean distance between ``v1`` and ``v2``.

    The inputs are converted to float arrays and broadcast against each other;
    the squared differences are summed over the last axis. Two 1-D vectors
    therefore yield a single scalar, while a 2-D matrix against a 1-D vector
    yields one distance per matrix row.
    """
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return np.sqrt(np.sum(diff ** 2, axis=-1))


if not merged_df.empty:
    customer_features = merged_df.groupby('CustomerID').agg({
        'ProductID': 'nunique',     # Unique products bought
        'TotalValue': 'sum',        # Total spending
        'Quantity': 'sum',          # Total quantity purchased
        'Region': 'first',          # Region (for simplicity, we take first)
        'TransactionDate': 'count'  # Number of transactions
    }).reset_index()

    # Encode Region as integer category codes (missing regions become -1).
    customer_features['Region'] = customer_features['Region'].astype('category').cat.codes

    # Standardise every feature so that no single column (e.g. total spend)
    # dominates the similarity measures purely because of its scale.
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(customer_features[FEATURE_COLUMNS])

    n_customers = len(customer_features)

    # Segment customers. KMeans requires n_clusters <= n_samples, so cap it.
    # The Cluster column is kept as a segmentation label; lookalike candidates
    # below are drawn from all customers, not only from the same cluster.
    kmeans = KMeans(n_clusters=min(MAX_CLUSTERS, n_customers), n_init=10, random_state=42)
    customer_features['Cluster'] = kmeans.fit_predict(normalized_features)

    # Pairwise cosine similarity on the standardised feature vectors.
    cosine_sim = cosine_similarity(normalized_features)

    customer_ids = customer_features['CustomerID'].to_numpy()
    # A customer cannot be their own lookalike, so at most n_customers - 1 exist.
    n_lookalikes = min(N_LOOKALIKES, n_customers - 1)

    lookalike_dict = {}
    for customer_idx in range(min(N_TARGET_CUSTOMERS, n_customers)):
        # Distances are taken on the standardised features (same space as the
        # cosine similarity), in one broadcasted call instead of a Python loop.
        euclidean_distances = euclidean_distance(
            normalized_features, normalized_features[customer_idx]
        )

        # Rescale distances to [0, 1] and invert so that larger means closer.
        # If every customer is identical the maximum is 0; treat all as
        # maximally close instead of dividing by zero.
        max_distance = euclidean_distances.max()
        if max_distance > 0:
            distance_similarity = 1 - euclidean_distances / max_distance
        else:
            distance_similarity = np.ones_like(euclidean_distances)

        # Combine both measures (higher weight to cosine similarity).
        combined_similarity = (
            COSINE_WEIGHT * cosine_sim[customer_idx]
            + DISTANCE_WEIGHT * distance_similarity
        )

        # Exclude the customer explicitly rather than assuming they sort first,
        # which is not guaranteed when another customer ties with them.
        combined_similarity[customer_idx] = -np.inf

        # Highest score first; the stable sort breaks ties by row order.
        similar_customers = np.argsort(-combined_similarity, kind='stable')[:n_lookalikes]

        # Store the results in the dictionary
        lookalikes = {}
        for rank, similar_idx in enumerate(similar_customers, start=1):
            lookalikes[f'Lookalike{rank}'] = customer_ids[similar_idx]
            lookalikes[f'Score{rank}'] = combined_similarity[similar_idx]
        lookalike_dict[customer_ids[customer_idx]] = lookalikes

    # Convert lookalike_dict to a DataFrame indexed by CustomerID. reindex keeps
    # the column layout fixed even when fewer than N_LOOKALIKES candidates
    # exist (the missing entries are left empty).
    lookalike_df = (
        pd.DataFrame.from_dict(lookalike_dict, orient='index')
        .reindex(columns=LOOKALIKE_COLUMNS)
    )

    # Save the lookalike map to CSV
    lookalike_df.to_csv('Lookalike.csv')

    print("Lookalike model created and saved to 'Lookalike.csv'")
else:
    print("Merged dataset is empty. Please check the input CSV files.")


Lookalike model created and saved to 'Lookalike.csv'
