#### Import Necessary Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

#### Load Datasets

In [2]:
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

#### Datasets Structure

In [3]:
customers_df.head(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
products_df.head(5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
transactions_df.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


#### Create customer feature vectors

In [6]:
def create_customer_features(customers_df, transactions_df, products_df):
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
    
    # Calculate customer transaction features
    transaction_features = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',  
        'TotalValue': ['sum', 'mean', 'std'],  
        'Quantity': ['sum', 'mean'] 
    }).fillna(0)
    
    # Flatten column names
    transaction_features.columns = ['_'.join(col).strip() for col in transaction_features.columns.values]
    
    # Calculate category preferences
    # Merge transactions with products to get categories
    trans_products = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
    category_preferences = pd.crosstab(
        trans_products['CustomerID'], 
        trans_products['Category'],
        values=trans_products['TotalValue'],
        aggfunc='sum'
    ).fillna(0)
    
    # Calculate recency and account age
    latest_transaction = transactions_df.groupby('CustomerID')['TransactionDate'].max()
    first_transaction = transactions_df.groupby('CustomerID')['TransactionDate'].min()
    
    recency = (datetime.now() - latest_transaction).dt.days
    account_age = (latest_transaction - first_transaction).dt.days
    
    # Create region dummies
    region_dummies = pd.get_dummies(customers_df['Region'], prefix='region')
    
    # Combine all features
    customer_features = pd.concat([
        transaction_features,
        category_preferences,
        pd.DataFrame({
            'recency': recency,
            'account_age': account_age
        }),
        region_dummies
    ], axis=1).fillna(0)
    return customer_features

#### calculate  cosine similarity scores between target customer and all other customers

In [7]:
def calculate_similarity_scores(customer_features, target_customer_id, n_recommendations=3):
    # Standardize features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(customer_features)
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(features_scaled)
    
    # Get target customer index
    target_idx = customer_features.index.get_loc(target_customer_id)
    
    # Get similarity scores for target customer
    similarity_scores = similarity_matrix[target_idx]
    
    # Create DataFrame with customer IDs and similarity scores
    similar_customers = pd.DataFrame({
        'CustomerID': customer_features.index,
        'SimilarityScore': similarity_scores
    })
    
    # Remove the target customer
    similar_customers = similar_customers[similar_customers['CustomerID'] != target_customer_id]
    
    # Sort by similarity score and get top N recommendations
    top_similar = similar_customers.nlargest(n_recommendations, 'SimilarityScore')
    return top_similar

### Create customer features

In [8]:
customer_features = create_customer_features(customers_df, transactions_df, products_df)

### Generate lookalike recommendations for first 20 customers

In [9]:
lookalike_results = []
for customer_id in customers_df['CustomerID'][:20]:  # First 20 customers
    similar_customers = calculate_similarity_scores(customer_features, customer_id)
    recommendations = []
    for _, row in similar_customers.iterrows():
        recommendations.append(f"{row['CustomerID']}({row['SimilarityScore']:.3f})")
    
    lookalike_results.append({
        'CustomerID': customer_id,
        'Recommendations': ' | '.join(recommendations)
    })

### Create final output DataFrame

In [10]:
lookalike_df = pd.DataFrame(lookalike_results)

### Display lookalikes for the first 20 customers

In [13]:
print(lookalike_df.head(3))

  CustomerID                             Recommendations
0      C0001  C0069(0.970) | C0183(0.918) | C0072(0.916)
1      C0002  C0036(0.920) | C0134(0.895) | C0133(0.869)
2      C0003  C0166(0.947) | C0007(0.932) | C0026(0.894)


#### Save to CSV

In [12]:
lookalike_df.to_csv('Lookalike.csv', index=False)