# Step 1: Prepare the Data
# Step 1.1: Merge Customer and Transaction Data

In [4]:
import pandas as pd

# Read the three source tables from CSV files in the working directory
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
transactions_df = pd.read_csv("Transactions.csv")

# Join every transaction to its customer record on the shared CustomerID key.
# No `how` is passed, so pandas' default join type applies.
customer_transactions = customers_df.merge(transactions_df, on="CustomerID")

# Preview the first rows of the joined table
print(customer_transactions.head())

  CustomerID      CustomerName         Region  SignupDate TransactionID  \
0      C0001  Lawrence Carroll  South America  2022-07-10        T00015   
1      C0001  Lawrence Carroll  South America  2022-07-10        T00932   
2      C0001  Lawrence Carroll  South America  2022-07-10        T00085   
3      C0001  Lawrence Carroll  South America  2022-07-10        T00445   
4      C0001  Lawrence Carroll  South America  2022-07-10        T00436   

  ProductID      TransactionDate  Quantity  TotalValue   Price  
0      P054  2024-01-19 03:12:55         2      114.60   57.30  
1      P022  2024-09-17 09:01:18         3      412.62  137.54  
2      P096  2024-04-08 00:01:00         2      614.94  307.47  
3      P083  2024-05-07 03:11:44         2      911.44  455.72  
4      P029  2024-11-02 17:04:16         3     1300.92  433.64  


# Step 1.2: Aggregate Transaction Data 

In [5]:
# Collapse the transaction rows to one summary row per customer.
# Named aggregation assigns the final column names directly, so no
# separate rename step is required.
customer_aggregated = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        TotalTransactions=('TransactionID', 'count'),  # number of transactions
        TotalSpent=('TotalValue', 'sum'),              # summed transaction value
        TotalQuantity=('Quantity', 'sum'),             # summed quantity
        # First entry of the mode, i.e. a most frequently purchased product
        FavoriteProduct=('ProductID', lambda product_ids: product_ids.mode()[0]),
    )
    .reset_index()
)

# Preview the per-customer summary
print(customer_aggregated.head())

  CustomerID  TotalTransactions  TotalSpent  TotalQuantity FavoriteProduct
0      C0001                  5     3354.52             12            P022
1      C0002                  4     1862.74             10            P004
2      C0003                  4     2725.38             14            P002
3      C0004                  8     5354.88             23            P008
4      C0005                  3     2034.24              7            P012


# Step 1.3: Merge Aggregated Data with Customer Profiles

In [6]:
# Attach the per-customer aggregates to the customer table by CustomerID
customer_profiles = customers_df.merge(customer_aggregated, on="CustomerID")

# Preview the combined customer profiles
print(customer_profiles.head())

  CustomerID        CustomerName         Region  SignupDate  \
0      C0001    Lawrence Carroll  South America  2022-07-10   
1      C0002      Elizabeth Lutz           Asia  2022-02-13   
2      C0003      Michael Rivera  South America  2024-03-07   
3      C0004  Kathleen Rodriguez  South America  2022-10-09   
4      C0005         Laura Weber           Asia  2022-08-15   

   TotalTransactions  TotalSpent  TotalQuantity FavoriteProduct  
0                  5     3354.52             12            P022  
1                  4     1862.74             10            P004  
2                  4     2725.38             14            P002  
3                  8     5354.88             23            P008  
4                  3     2034.24              7            P012  


# Step 2: Feature Engineering
# Step 2.1: Encode Categorical Variables

In [7]:
# Expand each categorical column into one indicator column per category value
categorical_columns = ['Region', 'FavoriteProduct']
customer_profiles_encoded = pd.get_dummies(customer_profiles, columns=categorical_columns)

# Preview the encoded profiles
print(customer_profiles_encoded.head())

  CustomerID        CustomerName  SignupDate  TotalTransactions  TotalSpent  \
0      C0001    Lawrence Carroll  2022-07-10                  5     3354.52   
1      C0002      Elizabeth Lutz  2022-02-13                  4     1862.74   
2      C0003      Michael Rivera  2024-03-07                  4     2725.38   
3      C0004  Kathleen Rodriguez  2022-10-09                  8     5354.88   
4      C0005         Laura Weber  2022-08-15                  3     2034.24   

   TotalQuantity  Region_Asia  Region_Europe  Region_North America  \
0             12        False          False                 False   
1             10         True          False                 False   
2             14        False          False                 False   
3             23        False          False                 False   
4              7         True          False                 False   

   Region_South America  ...  FavoriteProduct_P076  FavoriteProduct_P078  \
0                  True  ...

# Step 2.2: Normalize Numerical Features

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Columns that hold numeric purchase totals and need rescaling
numerical_features = ['TotalSpent', 'TotalTransactions', 'TotalQuantity']

# Fit a MinMaxScaler on those columns and overwrite them with the scaled values
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_profiles_encoded[numerical_features])
customer_profiles_encoded[numerical_features] = scaled_values

# Preview the profiles after scaling
print(customer_profiles_encoded.head())

  CustomerID        CustomerName  SignupDate  TotalTransactions  TotalSpent  \
0      C0001    Lawrence Carroll  2022-07-10                0.4    0.308942   
1      C0002      Elizabeth Lutz  2022-02-13                0.3    0.168095   
2      C0003      Michael Rivera  2024-03-07                0.3    0.249541   
3      C0004  Kathleen Rodriguez  2022-10-09                0.7    0.497806   
4      C0005         Laura Weber  2022-08-15                0.2    0.184287   

   TotalQuantity  Region_Asia  Region_Europe  Region_North America  \
0       0.354839        False          False                 False   
1       0.290323         True          False                 False   
2       0.419355        False          False                 False   
3       0.709677        False          False                 False   
4       0.193548         True          False                 False   

   Region_South America  ...  FavoriteProduct_P076  FavoriteProduct_P078  \
0                  True  ...

# Step 3: Calculate Similarity Scores
# Step 3.1: Compute Cosine Similarity

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Keep only the feature columns: the identifier, name and signup date are excluded
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
customer_features = customer_profiles_encoded.drop(columns=non_feature_columns)

# Pairwise cosine similarity between all customer feature rows
similarity_matrix = cosine_similarity(customer_features)

# Label both axes with CustomerID so scores can be looked up by ID
customer_ids = customer_profiles_encoded['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Preview the similarity matrix
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.120054  0.571605  0.606703  0.091696  0.575453   
C0002       0.120054  1.000000  0.112033  0.187021  0.531958  0.113298   
C0003       0.571605  0.112033  1.000000  0.594021  0.084412  0.569068   
C0004       0.606703  0.187021  0.594021  1.000000  0.141085  0.597820   
C0005       0.091696  0.531958  0.084412  0.141085  1.000000  0.089783   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.103043  0.248014  0.057316  0.127132  ...  0.575940  0.552677   
C0002       0.535872  0.558183  0.043131  0.098587  ...  0.114497  0.085478   
C0003       0.095519  0.224321  0.048628  0.121449  ...  0.567259  0.547144   
C0004       0.158325  0.383382  0.086914  0.199921  ...  0.597665  0.551127   
C0005  

# Step 3.2: Find Top 3 Lookalikes

In [10]:
# Function to get the top N lookalikes for a given customer
def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label in ``similarity_df`` identifying the customer of interest.
    similarity_df : pandas.DataFrame
        Similarity scores; the column selected by ``customer_id`` is read and
        its index labels are treated as the candidate lookalike IDs.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    pandas.Series
        Similarity scores sorted in descending order, indexed by the
        lookalike's ID, with at most ``top_n`` entries. The queried customer
        is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    # Similarity of every customer to the requested one
    customer_similarity = similarity_df[customer_id]

    # Remove the customer by label rather than by position. Skipping the first
    # sorted row is only correct when the self-similarity is the strict
    # maximum; another customer with an identical score (e.g. identical
    # features) could otherwise sort first and push the customer themselves
    # into the results. errors="ignore" tolerates the ID being absent from
    # the index.
    other_customers = customer_similarity.drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied scores in their original order, making the
    # result deterministic across runs.
    ranked = other_customers.sort_values(ascending=False, kind="stable")

    return ranked.head(top_n)

# Example: look up the closest matches for customer C0001
customer_id = 'C0001'
top_lookalikes = get_top_lookalikes(customer_id, similarity_df=similarity_df)

# Print the heading and the resulting scores on consecutive lines
print(f"Top 3 lookalikes for {customer_id}:", top_lookalikes, sep="\n")

Top 3 lookalikes for C0001:
CustomerID
C0025    0.989184
C0104    0.607483
C0102    0.607464
Name: C0001, dtype: float64


# Step 4: Generate Lookalike Recommendations for the First 20 Customers

In [11]:
# Map each of the first 20 customers to a list of (lookalike ID, score) pairs
lookalike_recommendations = {}
for customer_id in customer_profiles_encoded['CustomerID'].head(20):
    top_lookalikes = get_top_lookalikes(customer_id, similarity_df)
    lookalike_recommendations[customer_id] = [
        (lookalike_id, score) for lookalike_id, score in top_lookalikes.items()
    ]

# One row per customer: the customer's ID and its list of lookalike pairs
lookalike_df = pd.DataFrame(
    list(lookalike_recommendations.items()),
    columns=['CustomerID', 'Lookalikes'],
)

# Write the recommendations to disk without the row index
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the recommendations table
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [(C0025, 0.9891841805968491), (C0104, 0.607483...
1       C0002  [(C0173, 0.9399021203436735), (C0090, 0.563642...
2       C0003  [(C0031, 0.9956637935839785), (C0181, 0.991895...
3       C0004  [(C0165, 0.7124725375508373), (C0175, 0.710029...
4       C0005  [(C0149, 0.537584045138385), (C0056, 0.5374299...
5       C0006  [(C0082, 0.6052527949753345), (C0087, 0.603576...
6       C0007  [(C0200, 0.5486261920266132), (C0138, 0.548261...
7       C0008  [(C0030, 0.9567774935305903), (C0109, 0.719438...
8       C0009  [(C0119, 0.5175888779684555), (C0103, 0.516952...
9       C0010  [(C0124, 0.5724172208295376), (C0017, 0.571854...
10      C0011  [(C0171, 0.996146048118673), (C0188, 0.6167672...
11      C0012  [(C0133, 0.9684906209735875), (C0165, 0.687492...
12      C0013  [(C0165, 0.6873913170683321), (C0188, 0.679909...
13      C0014  [(C0060, 0.500197803213111), (C0198, 0.4991787...
14      C0015  [(C0131, 0