In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load datasets
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Merge data: one row per transaction, enriched with customer and product columns.
merged_data = transactions_df.merge(customers_df, on='CustomerID', how='left')
merged_data = merged_data.merge(products_df, on='ProductID', how='left')

# Recency is measured against the most recent transaction in the data rather than
# the wall clock, so the feature (and everything derived from it) is identical on
# every run of the notebook.
reference_date = pd.to_datetime(merged_data['TransactionDate']).max()

# Feature Engineering: Create aggregated customer features
customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'nunique'),
    avg_transaction_value=('TotalValue', 'mean'),
    unique_products=('ProductName', 'nunique'),
    unique_categories=('Category', 'nunique'),
    recency=('TransactionDate', lambda x: (reference_date - pd.to_datetime(x).max()).days)
).reset_index()

# Fill missing values and scale features
customer_features = customer_features.fillna(0)
feature_columns = ['total_spent', 'total_transactions', 'avg_transaction_value',
                   'unique_products', 'unique_categories', 'recency']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_columns])

# Calculate similarity matrix (row i / column j = similarity of customer i to customer j)
similarity_matrix = cosine_similarity(scaled_features)

# Lookalike Recommendations
lookalike_dict = {}
top_n = 200  # Keep up to 200 lookalikes per customer (fewer if there are fewer other customers)

customer_ids = customer_features['CustomerID'].to_numpy()

for customer_idx, customer_id in enumerate(customer_ids):
    similarities = similarity_matrix[customer_idx]
    # Sort by descending similarity; the stable sort breaks ties by row position,
    # which keeps the ranking deterministic.
    ranked_idx = np.argsort(-similarities, kind='stable')
    # Exclude self explicitly by position: the customer is not guaranteed to be the
    # first entry (ties with identical customers, or an all-zero scaled vector).
    similar_customers_idx = ranked_idx[ranked_idx != customer_idx][:top_n]

    # Create recommendations with scores
    lookalike_dict[customer_id] = list(
        zip(customer_ids[similar_customers_idx], similarities[similar_customers_idx])
    )

# Flatten into a DataFrame (one row per customer / lookalike pair, rank starts at 1)
flattened_lookalike_data = [
    {
        'CustomerID': customer_id,
        'Lookalike_CustomerID': lookalike_id,
        'Score': score,
        'Recommendation_Rank': rank
    }
    for customer_id, recommendations in lookalike_dict.items()
    for rank, (lookalike_id, score) in enumerate(recommendations, start=1)
]

# Passing the columns explicitly keeps the schema stable even when there are no rows.
lookalike_df = pd.DataFrame(
    flattened_lookalike_data,
    columns=['CustomerID', 'Lookalike_CustomerID', 'Score', 'Recommendation_Rank']
)

# Save and display the lookalike recommendations
lookalike_df.to_csv('Enhanced_Lookalike.csv', index=False)
lookalike_df.head()


Unnamed: 0,CustomerID,Lookalike_CustomerID,Score,Recommendation_Rank
0,C0001,C0086,0.92256,1
1,C0001,C0056,0.911202,2
2,C0001,C0190,0.906501,3
3,C0001,C0189,0.904509,4
4,C0001,C0174,0.891723,5


In [13]:
# Preview a longer slice: the first 100 recommendation rows.
lookalike_df.iloc[:100]

Unnamed: 0,CustomerID,Lookalike_CustomerID,Score,Recommendation_Rank
0,C0001,C0086,0.922560,1
1,C0001,C0056,0.911202,2
2,C0001,C0190,0.906501,3
3,C0001,C0189,0.904509,4
4,C0001,C0174,0.891723,5
...,...,...,...,...
95,C0001,C0068,0.155243,96
96,C0001,C0049,0.153240,97
97,C0001,C0022,0.140774,98
98,C0001,C0046,0.140062,99


In [14]:

# Restrict the recommendations to the first 20 customers (C0001 to C0020).
target_customers = [f'C{i:04}' for i in range(1, 21)]
lookalike_df_filtered = lookalike_df[lookalike_df['CustomerID'].isin(target_customers)]

# Number of lookalikes reported per customer.
top_k = 3

# Sort once by descending score (stable, so tied rows keep their existing order) and
# keep the first `top_k` rows of each customer instead of re-filtering per customer.
top_lookalikes = (
    lookalike_df_filtered
    .sort_values(by='Score', ascending=False, kind='stable')
    .groupby('CustomerID', sort=False)
    .head(top_k)
)

# Map each customer to its [(lookalike_id, score), ...] list, best match first.
# Keys are created in order of first appearance in the filtered frame.
lookalike_dict = {customer: [] for customer in lookalike_df_filtered['CustomerID'].unique()}
for customer, lookalike, score in zip(top_lookalikes['CustomerID'],
                                      top_lookalikes['Lookalike_CustomerID'],
                                      top_lookalikes['Score']):
    lookalike_dict[customer].append((lookalike, score))

# Prepare the columns based on the top 3 lookalikes and scores:
# Lookalike_1, Score_1, ..., Lookalike_3, Score_3
columns = [label for rank in range(1, top_k + 1)
           for label in (f'Lookalike_{rank}', f'Score_{rank}')]
lookalike_data = []

for lookalikes in lookalike_dict.values():
    # Flatten the list of lookalikes and their scores
    flat_data = []
    for lookalike, score in lookalikes:
        flat_data.extend([lookalike, round(score, 2)])  # Add Lookalike ID and Score to the list
    # Pad customers with fewer than `top_k` lookalikes so every row matches `columns`;
    # without this the DataFrame constructor fails when all rows are short.
    flat_data.extend([None] * (len(columns) - len(flat_data)))
    lookalike_data.append(flat_data)

# Convert the lookalike data into a DataFrame indexed by customer ID
lookalike_df_final = pd.DataFrame(lookalike_data, index=list(lookalike_dict), columns=columns)

# Save the result as Lookalike.csv; label the index so the first CSV column has a header
lookalike_df_final.to_csv('Lookalike.csv', header=True, index_label='CustomerID')

print("Lookalike.csv has been created successfully.")


Lookalike.csv has been created successfully.
