**Import Dependencies**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

**Load and Preprocess Data**

In [None]:

# Read the three source tables from the working directory and convert the
# date columns to datetimes as part of loading.
customers = pd.read_csv('Customers.csv').assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate'])
)
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv').assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'])
)

# Quick sanity report: one line per table with its row count.
print(
    "Data loaded successfully!",
    f"Number of customers: {customers.shape[0]}",
    f"Number of products: {products.shape[0]}",
    f"Number of transactions: {transactions.shape[0]}",
    sep="\n",
)

Data loaded successfully!
Number of customers: 200
Number of products: 100
Number of transactions: 1000


**Feature Engineering**

In [None]:
def create_customer_features(customers, transactions):
    """Build one row of purchase-behaviour features per customer.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain a ``CustomerID`` column; every row is kept in the result.
    transactions : pandas.DataFrame
        Must contain ``CustomerID``, ``TransactionID``, ``Quantity``,
        ``TotalValue`` and a datetime ``TransactionDate`` column.

    Returns
    -------
    pandas.DataFrame
        ``customers`` left-joined with these per-customer columns:
        ``transaction_count``, ``total_quantity``, ``avg_quantity``,
        ``std_quantity``, ``total_value``, ``avg_value``, ``std_value``,
        ``recency`` (whole days between the customer's last transaction and
        the latest transaction in the data set) and ``avg_time_between``
        (mean whole-day gap between consecutive transactions, 0 when the
        customer has fewer than two). All missing values are filled with 0.
    """

    def calc_avg_time_between(dates):
        """Mean gap in whole days between consecutive dates (0 if < 2 dates)."""
        if len(dates) < 2:
            return 0
        dates = sorted(dates)
        return np.mean([(dates[i+1] - dates[i]).days for i in range(len(dates)-1)])

    by_customer = transactions.groupby('CustomerID')

    # Keep CustomerID as the index while the per-customer columns are
    # assembled. Every Series below is also indexed by CustomerID, and pandas
    # aligns on index labels during column assignment; resetting the index
    # first would make every assigned value NaN (later masked by fillna(0)).
    transaction_features = by_customer.agg({
        'TransactionID': 'count',
        'Quantity': ['sum', 'mean', 'std'],
        'TotalValue': ['sum', 'mean', 'std']
    })
    transaction_features.columns = [
        'transaction_count',
        'total_quantity', 'avg_quantity', 'std_quantity',
        'total_value', 'avg_value', 'std_value'
    ]

    # Recency is measured against the most recent transaction in the data,
    # not against "today", so the result is reproducible.
    latest_transaction = by_customer['TransactionDate'].max()
    overall_latest = latest_transaction.max()
    transaction_features['recency'] = (overall_latest - latest_transaction).dt.days

    transaction_dates = by_customer['TransactionDate'].agg(list)
    transaction_features['avg_time_between'] = transaction_dates.apply(calc_avg_time_between)

    # Only now turn CustomerID back into a regular column for the merge.
    transaction_features = transaction_features.reset_index()

    customer_features = customers.merge(transaction_features, on='CustomerID', how='left')

    # std_* is NaN for single-transaction customers, and every feature is NaN
    # for customers with no transactions at all.
    # NOTE(review): filling recency with 0 makes customers without any
    # transaction look like the most recent buyers - confirm this is intended.
    customer_features = customer_features.fillna(0)

    return customer_features

# Build the per-customer feature table from the loaded frames, then print
# descriptive statistics for it under a heading.
customer_features = create_customer_features(
    customers=customers,
    transactions=transactions,
)

print("Feature Summary:", customer_features.describe(), sep="\n")

Feature Summary:
                SignupDate  transaction_count  total_quantity  avg_quantity  \
count                  200         200.000000      200.000000    200.000000   
mean   2023-07-19 08:31:12           5.000000       12.685000      2.518142   
min    2022-01-22 00:00:00           0.000000        0.000000      0.000000   
25%    2022-09-26 12:00:00           3.000000        8.000000      2.191667   
50%    2023-08-31 12:00:00           5.000000       12.000000      2.500000   
75%    2024-04-12 12:00:00           6.000000       17.000000      3.000000   
max    2024-12-28 00:00:00          11.000000       32.000000      4.000000   
std                    NaN           2.221412        6.201457      0.588127   

       std_quantity   total_value    avg_value    std_value  recency  \
count    200.000000    200.000000   200.000000   200.000000    200.0   
mean       0.998592   3449.977800   684.142281   438.331177      0.0   
min        0.000000      0.000000     0.000000     0.00

**Lookalike Model Implementation**

In [None]:
class LookalikeModel:
    """Find similar customers via cosine similarity on standardized features.

    Call ``fit`` with a customer feature table first, then query
    ``find_lookalikes`` with a customer ID present in that table.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # Populated by fit():
        self.feature_matrix = None  # scaled feature values, one row per customer
        self.customer_ids = None    # CustomerID values, same row order as feature_matrix
        self.features = None        # names of the columns used as features

    def fit(self, customer_features):
        """Standardize the feature columns and remember the customer IDs.

        Parameters
        ----------
        customer_features : pandas.DataFrame
            Must contain ``CustomerID`` plus every column listed in
            ``self.features`` below.
        """

        self.features = [
            'transaction_count', 'total_quantity', 'avg_quantity',
            'std_quantity', 'total_value', 'avg_value', 'std_value',
            'recency', 'avg_time_between'
        ]

        self.feature_matrix = self.scaler.fit_transform(
            customer_features[self.features]
        )
        self.customer_ids = customer_features['CustomerID'].values

    def find_lookalikes(self, customer_id, n_recommendations=3):
        """Return the customers most similar to ``customer_id``.

        Parameters
        ----------
        customer_id :
            ID of the query customer; must be one of the IDs seen by ``fit``.
        n_recommendations : int, default 3
            Maximum number of lookalikes to return (negative values are
            treated as 0).

        Returns
        -------
        list of dict
            Entries with keys ``'similar_customer'`` and
            ``'similarity_score'``, ordered from most to least similar. The
            query customer itself is never included.

        Raises
        ------
        IndexError
            If ``customer_id`` is not among the fitted customer IDs (this
            also covers calling the method before ``fit``).
        """
        matches = np.where(self.customer_ids == customer_id)[0]
        if matches.size == 0:
            raise IndexError(
                f"Customer ID {customer_id!r} not found in the fitted model"
            )
        customer_idx = matches[0]

        similarities = cosine_similarity(
            self.feature_matrix[customer_idx].reshape(1, -1),
            self.feature_matrix
        )[0]

        # Rank everyone from most to least similar, then drop the query
        # customer by index. Dropping "rank 0" instead is wrong whenever
        # another customer ties with the query at the top score, or the
        # query's scaled vector is all zeros (self-similarity 0): the query
        # would be returned as its own lookalike. A stable sort keeps the
        # order of tied scores deterministic.
        ranked = np.argsort(-similarities, kind='stable')
        similar_indices = ranked[ranked != customer_idx][:max(n_recommendations, 0)]

        return [
            {
                'similar_customer': self.customer_ids[idx],
                'similarity_score': similarities[idx]
            }
            for idx in similar_indices
        ]

# Create the lookalike model and fit it on the engineered feature table.
model = LookalikeModel()
model.fit(customer_features=customer_features)

**Generate Recommendations**

In [None]:

def _recommendation_row(customer_id):
    """Query the model for one customer and flatten the result into a record."""
    matches = model.find_lookalikes(customer_id)
    return {
        'customer_id': customer_id,
        'similar_customer_ids': ','.join(m['similar_customer'] for m in matches),
        'similarity_scores': ','.join(f"{m['similarity_score']:.4f}" for m in matches),
    }


# Lookalikes are generated for the first 20 rows of the customers table only.
first_20_customers = customers['CustomerID'][:20]
recommendations = [_recommendation_row(cid) for cid in first_20_customers]

recommendations_df = pd.DataFrame(recommendations)

print("\nLookalike Recommendations")
print(recommendations_df.head())


Lookalike Recommendations
  customer_id similar_customer_ids     similarity_scores
0       C0001    C0043,C0125,C0193  0.7667,0.7189,0.6850
1       C0002    C0031,C0121,C0094  0.9927,0.9311,0.9296
2       C0003    C0144,C0176,C0018  0.9257,0.8814,0.7561
3       C0004    C0109,C0017,C0075  0.9645,0.9629,0.9485
4       C0005    C0130,C0150,C0131  0.9774,0.9762,0.9732


**Save Results**

In [None]:

# Single definition of the output path so the file that is written and the
# path that is reported cannot drift apart.
OUTPUT_CSV = 'Revanth_Pasupuleti_Lookalike.csv'

recommendations_df.to_csv(OUTPUT_CSV, index=False)
print(f"Recommendations saved to {OUTPUT_CSV}")


# Echo the first few saved rows, splitting the comma-joined columns back into
# (ID, score) pairs to show how the file is meant to be read.
print("\nSample of saved format:")
for _, row in recommendations_df.head().iterrows():
    print(f"\nCustomer ID: {row['customer_id']}")
    similar_ids = row['similar_customer_ids'].split(',')
    scores = row['similarity_scores'].split(',')
    print("Similar Customers (ID, Score):")
    # Named `similar_id`, not `id`: a top-level loop variable called `id`
    # would shadow the builtin id() for every later cell in the kernel.
    for similar_id, score in zip(similar_ids, scores):
        print(f"  {similar_id}: {score}")

Recommendations saved to Revanth_Pasupuleti_Lookalike.csv

Sample of saved format:

Customer ID: C0001
Similar Customers (ID, Score):
  C0043: 0.7667
  C0125: 0.7189
  C0193: 0.6850

Customer ID: C0002
Similar Customers (ID, Score):
  C0031: 0.9927
  C0121: 0.9311
  C0094: 0.9296

Customer ID: C0003
Similar Customers (ID, Score):
  C0144: 0.9257
  C0176: 0.8814
  C0018: 0.7561

Customer ID: C0004
Similar Customers (ID, Score):
  C0109: 0.9645
  C0017: 0.9629
  C0075: 0.9485

Customer ID: C0005
Similar Customers (ID, Score):
  C0130: 0.9774
  C0150: 0.9762
  C0131: 0.9732
