# Installing dependencies
 Importing libraries and load Data

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Data Preprocessing
- Cleaning and merging customer and product information to create a feature set.
- Ensuring that the data has no missing values and it is in the correct format.

In [2]:
merged_df = pd.merge(transactions_df, customers_df, on="CustomerID")

In [14]:
transactions_df.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')

# Feature Engineering

#### We need to create a feature set that will help in calculating the similarity score between customers. The features can include:
 1. Demographic features from the customer dataset.
 2. Transaction history features, such as total spend, frequency, etc

In [5]:
merged_df['Total_Spend'] = merged_df['TotalValue'].groupby(merged_df['CustomerID']).transform('sum')
merged_df['Transaction_Frequency'] = merged_df['TotalValue'].groupby(merged_df['CustomerID']).transform('count')

# Standardizing Features
Normalizing the numerical features to ensure that all features contribute equally to the similarity calculation.

In [6]:
scaler = StandardScaler()
numerical_features = ['Total_Spend', 'Transaction_Frequency']
merged_df[numerical_features] = scaler.fit_transform(merged_df[numerical_features])

# Calculating Similarity Scores - Cosine Similarity
Used cosine similarity to calculate the similarity between the customers based on their transaction and demographic information.

In [7]:
customer_features = merged_df[['Total_Spend', 'Transaction_Frequency']].drop_duplicates()
similarity_matrix = cosine_similarity(customer_features)

# Creating Lookalike Recommendations
For each of the first 20 customers (C0001 to C0020), calculated the top 3 most similar customers and their similarity scores.

In [8]:
lookalike_recommendations = {}

for cust_id in range(1, 21):
    customer_index = cust_id - 1  
    similarity_scores = similarity_matrix[customer_index]
    similar_customers = np.argsort(similarity_scores)[-4:-1]  
    lookalike_recommendations[f"C{cust_id:04d}"] = [(f"C{similar_cust + 1:04d}", similarity_scores[similar_cust]) for similar_cust in similar_customers]


In [15]:
lookalike_recommendations["C0001"]

[('C0174', 0.9999420199247117),
 ('C0190', 0.9999816835278627),
 ('C0093', 0.9999982810471502)]

# Saved Recommendations to a CSV



In [13]:
customer_ids_1 = []
similarity_scores_1 = []
customer_ids_2 = []
similarity_scores_2 = []
customer_ids_3 = []
similarity_scores_3 = []

for cust_id, recommendations in lookalike_recommendations.items():
    customer_ids_1.append(recommendations[0][0])
    similarity_scores_1.append(recommendations[0][1])
    customer_ids_2.append(recommendations[1][0])
    similarity_scores_2.append(recommendations[1][1])
    customer_ids_3.append(recommendations[2][0])
    similarity_scores_3.append(recommendations[2][1])

lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_recommendations.keys()),
    'CustomerID_1': customer_ids_1,
    'Similarity_Score_1': similarity_scores_1,
    'CustomerID_2': customer_ids_2,
    'Similarity_Score_2': similarity_scores_2,
    'CustomerID_3': customer_ids_3,
    'Similarity_Score_3': similarity_scores_3
})

lookalike_df.to_csv('Sayantani_Chakraborty_Lookalike.csv', index=False)
