# **Task 2: Lookalike Model**


In [9]:
# Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Datasets
# Read the three source tables in one pass. The '/content/...' paths are
# absolute (presumably the Colab working directory - adjust when running
# elsewhere). `products` is loaded here but not used by the cells shown below.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    )
)

In [10]:
# Preparation of data for Lookalike Model
# Join every transaction to its customer record on CustomerID. The default
# merge is an inner join, so rows without a matching CustomerID on both
# sides are dropped.
transactions_customers = pd.merge(transactions, customers, on='CustomerID')

# Keep only the columns that feed the model.
customer_transaction_data = transactions_customers.loc[
    :, ['CustomerID', 'Region', 'TotalValue', 'Quantity']
]

# Collapse to one row per (CustomerID, Region), summing TotalValue and Quantity.
customer_transaction_summary = (
    customer_transaction_data
    .groupby(['CustomerID', 'Region'], as_index=False)
    .sum()
)

In [11]:
# Encode region data (Converts categorical data to numerical data)
# Each distinct Region value is mapped to an integer code, stored in a new
# 'RegionEncoded' column next to the original 'Region' column.
# NOTE(review): integer codes impose an ordering on what looks like a nominal
# category; one-hot encoding may be a better fit - confirm before changing,
# since the normalisation step reads 'RegionEncoded' by name.
le = LabelEncoder()
le.fit(customer_transaction_summary['Region'])
customer_transaction_summary['RegionEncoded'] = le.transform(
    customer_transaction_summary['Region']
)

In [12]:
# Normalizing the data (Helps in preventing bias from large values)
def min_max_scale(column):
    """Rescale a numeric column to the [0, 1] range with min-max scaling.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(column - min) / (max - min)``. If every value in the column is the
        same (``max == min``) the column carries no information, so zeros are
        returned instead of the NaNs that a 0/0 division would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        # Constant column: avoid 0/0 -> NaN, which would make the cosine
        # similarity step fail on NaN input.
        return column * 0.0
    return (column - lowest) / span


normalized_data = customer_transaction_summary[
    ['TotalValue', 'Quantity', 'RegionEncoded']
].apply(min_max_scale)

In [13]:
# Calculate cosine similarity (calculates how similar the customers are based on normalized transaction features)
# Row/column i of the similarity matrix corresponds to customer_ids[i]: both
# are derived from customer_transaction_summary in the same row order.
customer_ids = customer_transaction_summary['CustomerID'].values
similarity_matrix = cosine_similarity(X=normalized_data)

In [14]:
# Finding out top 3 similar customers for the first 20 customers
# Result shape: {customer_id: [(lookalike_id, similarity_score), ...]} with
# up to 3 entries per customer, best match first.
lookalike_results = {}
# Cap at the number of customers actually available so a dataset with fewer
# than 20 customers does not raise IndexError.
for idx in range(min(20, len(customer_ids))):
    # Exclude the customer by position instead of assuming they sort first:
    # another customer can tie at similarity 1.0, and an all-zero feature row
    # has self-similarity 0, so slicing off rank 0 can drop a real neighbour
    # or leave the customer in their own lookalike list.
    similarity_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Store scores as built-in floats so they serialise as plain numbers
    # rather than NumPy scalar reprs.
    top_3_similar = [
        (customer_ids[i], float(score)) for i, score in similarity_scores[:3]
    ]
    lookalike_results[customer_ids[idx]] = top_3_similar

In [15]:
# Conversion of the results to a CSV file
# One row per customer: 'cust_id' plus a 'lookalikes' cell holding the list of
# (lookalike_id, score) pairs. The list is written as its text representation,
# so scores are converted to built-in floats first; otherwise NumPy >= 2
# renders each one as "np.float64(...)" inside the CSV cell.
lookalike_df = pd.DataFrame(
    [
        {
            'cust_id': key,
            'lookalikes': [(match_id, float(score)) for match_id, score in value],
        }
        for key, value in lookalike_results.items()
    ]
)
lookalike_df.to_csv('Lookalike_Results.csv', index=False)