In [55]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge transactions with customers and products for enriched data
transactions = transactions.merge(customers, on='CustomerID', how='left')
transactions = transactions.merge(products, on='ProductID', how='left')

# Create a customer profile feature set
customer_profiles = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',          # Total spending
    'Quantity': 'sum',            # Total quantity purchased
    'ProductID': 'count',         # Product diversity (proxy for frequency)
    'Region': 'first'             # Region information
}).reset_index()

# Rename 'ProductID' aggregation for clarity
customer_profiles.rename(columns={'ProductID': 'ProductCount'}, inplace=True)

# One-hot encode the 'Region' column
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], drop_first=True)

# Standardise numerical features for similarity calculation
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity', 'ProductCount']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

# Compute Cosine Similarity
# Extract features for similarity calculation
features = customer_profiles.drop(columns=['CustomerID']).values
similarity_matrix = cosine_similarity(features)

# Build the Lookalike Recommendations
def get_top_3_lookalikes(customer_id, customer_profiles, similarity_matrix):
    # Get the index of the given customer
    customer_index = customer_profiles[customer_profiles['CustomerID'] == customer_id].index[0]
    
    # Fetch similarity scores for this customer
    similarity_scores = list(enumerate(similarity_matrix[customer_index]))
    
    # Exclude the customer themselves and sort by similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = [score for score in similarity_scores if score[0] != customer_index]
    
    # Get the top 3 similar customers
    top_3 = similarity_scores[:3]
    top_3_customers = [
        (customer_profiles.iloc[entry[0]]['CustomerID'], entry[1]) for entry in top_3
    ]
    return top_3_customers

# Generate lookalikes for the first 20 customers
lookalike_results = {}
for customer_id in customer_profiles['CustomerID'][:20]:
    lookalike_results[customer_id] = get_top_3_lookalikes(customer_id, customer_profiles, similarity_matrix)

# Create a Lookalike.csv file
lookalike_data = []
for customer_id, lookalikes in lookalike_results.items():
    lookalike_data.append({
        'cust_id': customer_id,
        'lookalike1_id': lookalikes[0][0],
        'score1': lookalikes[0][1],
        'lookalike2_id': lookalikes[1][0],
        'score2': lookalikes[1][1],
        'lookalike3_id': lookalikes[2][0],
        'score3': lookalikes[2][1],
    })

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the Lookalike DataFrame
print(lookalike_df)


   cust_id lookalike1_id    score1 lookalike2_id    score2 lookalike3_id  \
0    C0001         C0107  0.989362         C0137  0.987831         C0191   
1    C0002         C0142  0.990076         C0043  0.975826         C0186   
2    C0003         C0190  0.917195         C0133  0.912769         C0174   
3    C0004         C0113  0.994264         C0165  0.985470         C0102   
4    C0005         C0123  0.999781         C0078  0.998505         C0097   
5    C0006         C0168  0.953841         C0048  0.907481         C0187   
6    C0007         C0140  0.997960         C0092  0.997904         C0078   
7    C0008         C0084  0.926548         C0109  0.926376         C0090   
8    C0009         C0198  0.987412         C0060  0.972610         C0014   
9    C0010         C0166  0.964472         C0199  0.941327         C0073   
10   C0011         C0107  0.985363         C0048  0.985288         C0001   
11   C0012         C0102  0.987787         C0155  0.985784         C0104   
12   C0013  