Importing Libraries

In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

Data Preprocessing

In [5]:
# Group transactions to calculate total, average spending and transaction count per customer
customer_aggregates = transactions.groupby('CustomerID')['TotalValue'].agg(
    TotalSpending='sum', AvgSpending='mean', TransactionCount='count'
).reset_index()

# Merge aggregated features with customer data
customer_features = customers.merge(customer_aggregates, on='CustomerID', how='left')

# Handle missing data
customer_features.fillna(0, inplace=True)

# Data Preprocessing: Drop non-numeric columns and ensure numeric data
feature_matrix = customer_features.set_index('CustomerID').drop(['CustomerName', 'SignupDate'], axis=1)
feature_matrix = feature_matrix.apply(pd.to_numeric, errors='coerce').fillna(0)


In [6]:
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=feature_matrix.index, columns=feature_matrix.index)

In [15]:
# Lookalike Recommendations for Customers C0001 - C0010
lookalike_recommendations = {}

for customer_id in feature_matrix.index[:10]:
    # Extract top 3 similarity scores for the customer, excluding the customer themselves
    similar_customers = similarity_df[customer_id].nlargest(4).iloc[1:]
    
    # Store the top 3 recommendations in the dictionary
    lookalike_recommendations[customer_id] = [(other_customer, score) 
                                              for other_customer, score in similar_customers.items()]


In [16]:
# save as CSV
lookalike_data = [
    {'CustomerID': customer_id, 'LookalikeCustomerID': other_customer, 'SimilarityScore': score}
    for customer_id, recommendations in lookalike_recommendations.items()
    for other_customer, score in recommendations
]

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Harish_Sudhakar_Lookalike.csv', index=False)


In [17]:
print(lookalike_df.head())

  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0137              1.0
1      C0001               C0152              1.0
2      C0001               C0056              1.0
3      C0002               C0029              1.0
4      C0002               C0199              1.0
