# Task 2: Lookalike Model

**Import necessary libraries**

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

**File paths**

In [2]:
# Directory that holds the three input CSVs. Change this single value to run
# the notebook against a copy of the data stored somewhere else.
DATA_DIR = '/kaggle/input/dataset3'

# Full paths to each input file, derived from DATA_DIR.
customer_file = f'{DATA_DIR}/Customers.csv'
product_file = f'{DATA_DIR}/Products.csv'
transaction_file = f'{DATA_DIR}/Transactions.csv'

**Load the datasets**

In [3]:
# Read the three CSVs in a fixed order (customers, products, transactions)
# and bind each resulting DataFrame to its own name.
customers, products, transactions = [
    pd.read_csv(csv_path)
    for csv_path in (customer_file, product_file, transaction_file)
]

**Merge datasets**

In [4]:
# Step 1: attach customer attributes to every transaction (join on CustomerID).
merged_data = pd.merge(transactions, customers, on='CustomerID')
# Step 2: attach product attributes to the result (join on ProductID).
merged_data = pd.merge(merged_data, products, on='ProductID')

**Inspect the dataset structure**

In [5]:
# Show the column index of each input frame and of the merged frame so the
# available feature columns can be checked by eye.
for label, frame in (
    ("Customers.csv Columns:", customers),
    ("Products.csv Columns:", products),
    ("Transactions.csv Columns:", transactions),
    ("Merged Data Columns:", merged_data),
):
    print(label, frame.columns)

Customers.csv Columns: Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Products.csv Columns: Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')
Transactions.csv Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Merged Data Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


**Adjust feature engineering based on available columns**

In [6]:
# Build the per-customer aggregation spec (column -> aggregator) from the
# columns that are actually present in merged_data.


def _most_common(values):
    """Return the most frequent non-null value in ``values``.

    Returns None when ``values`` holds no non-null entries, instead of
    raising IndexError on an empty ``value_counts()`` result.
    """
    counts = values.value_counts()
    return counts.index[0] if len(counts) else None


# Transactions.csv and Products.csv both carry a 'Price' column, so the merge
# renames them with pandas' default suffixes: 'Price_x' (left/transactions)
# and 'Price_y' (right/products). A bare 'Price' therefore does not exist in
# merged_data, and testing only for 'Price' silently drops the price feature.
# Accept the first candidate that is present; the resulting feature column
# keeps the name of whichever candidate was found.
PRICE_CANDIDATES = ('Price', 'Price_x', 'Price_y')
price_column = next(
    (col for col in PRICE_CANDIDATES if col in merged_data.columns),
    None,
)

agg_dict = {}
if price_column is not None:
    agg_dict[price_column] = 'mean'   # average unit price per customer
if 'Quantity' in merged_data.columns:
    agg_dict['Quantity'] = 'sum'      # total units bought
if 'TotalValue' in merged_data.columns:
    agg_dict['TotalValue'] = 'sum'    # total spend
if 'Category' in merged_data.columns:
    agg_dict['Category'] = _most_common  # Most common category

# Fail loudly rather than aggregating with an empty spec.
if not agg_dict:
    raise ValueError("No valid columns available for aggregation.")

**Aggregate transaction and product data for each customer**

In [7]:
# Collapse the merged rows to one row per customer using agg_dict, then turn
# the CustomerID group key back into an ordinary column.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(agg_dict)
    .reset_index()
)

**One-hot encode categorical features (e.g., Category)**

In [8]:
# Replace the single 'Category' column with one indicator column per category
# value (named 'Cat_<value>'). Skipped when no 'Category' column is present.
has_category = 'Category' in customer_features.columns
if has_category:
    customer_features = pd.get_dummies(
        customer_features,
        prefix='Cat',
        columns=['Category'],
    )

**Scale the features for similarity computation**

In [9]:
# Standardise every feature column; CustomerID is an identifier, not a
# feature, so it is removed before fitting the scaler.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_features.drop(columns=['CustomerID'])
)

**Compute similarity scores using cosine similarity**

In [10]:
# Pairwise cosine similarity between all rows of scaled_features: entry
# [i, j] compares row i with row j (row order follows customer_features).
similarity_matrix = cosine_similarity(X=scaled_features)

**Create the lookalike model output**

In [11]:
# Number of most-similar customers to keep for each customer.
TOP_N = 3

# Pull the IDs out once so positions in similarity_matrix can be mapped back
# to CustomerIDs without building a row Series for every lookup.
customer_ids = customer_features['CustomerID'].tolist()

# Maps each CustomerID to a list of (other CustomerID, similarity score)
# pairs, ordered from most to least similar.
lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    # Every other customer paired with its similarity to customer i; the
    # customer itself (idx == i) is excluded.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # Highest similarity first. list.sort is stable, so ties keep their
    # original positional order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Cast scores to built-in float: NumPy >= 2 renders np.float64 scalars as
    # "np.float64(...)" inside containers, which would otherwise leak into
    # any stringified form of these lists (e.g. when written to CSV).
    lookalike_map[customer_id] = [
        (customer_ids[idx], float(score))
        for idx, score in candidates[:TOP_N]
    ]

**Save the results to Lookalike.csv**

In [12]:
# One row per customer: the ID, and the string form of that customer's
# lookalike list.
output_file = 'Shruti_Narwat_Lookalike.csv'
lookalike_columns = {
    'CustomerID': list(lookalike_map),
    'Lookalikes': list(map(str, lookalike_map.values())),
}
lookalike_df = pd.DataFrame(lookalike_columns)

# Write without the DataFrame index, then confirm where the file went.
lookalike_df.to_csv(output_file, index=False)

print(f"Lookalike model output saved as '{output_file}'.")

Lookalike model output saved as 'Shruti_Narwat_Lookalike.csv'.
