<a href="https://colab.research.google.com/github/praveenravi01/data-science-assignment/blob/main/Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables from the Colab working directory.
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Normalise the join-key spellings; rename() is a no-op when the spaced
# variant of the column name is absent.
customers = customers.rename(columns={'Customer ID': 'CustomerID'})
transactions = transactions.rename(columns={'Product ID': 'ProductID'})

# Attach customer attributes, then product attributes, to every transaction.
# Columns present on both sides of a join receive pandas' default
# '_x' (left) / '_y' (right) suffixes.
merged_data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

# Debug aid: show which columns survived the joins.
print("Merged Data Columns:", merged_data.columns)

# Per-customer summary: total units bought, total spend, mean catalogue
# price ('Price_y' is the right-hand, i.e. Products.csv, price column after
# the merge) and the most frequent product category.
profile_aggregations = {
    'Quantity': 'sum',
    'TotalValue': 'sum',
    'Price_y': 'mean',
    # mode() returns the modal values in sorted order, so element 0 is the
    # first of them when several categories tie.
    'Category': lambda categories: categories.mode()[0],
}

customer_profiles = (
    merged_data.groupby('CustomerID')
    .agg(profile_aggregations)
    .reset_index()
    .rename(columns={'Price_y': 'Price'})  # drop the merge suffix
)

# Debug aid: inspect the aggregated profile table.
print("Customer Profiles Columns:", customer_profiles.columns)
print(customer_profiles.head())

# Expand 'Category' into indicator columns; drop_first=True omits the
# indicator for the first category level.
customer_profiles = pd.get_dummies(customer_profiles, columns=['Category'], drop_first=True)

# Standardise every remaining column. CustomerID is an identifier rather
# than a feature, so it is left out of the matrix.
scaler = StandardScaler()
feature_frame = customer_profiles.drop(columns=['CustomerID'])
scaled_features = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity between customers; row i of the result holds
# the similarity of customer i to every customer (itself included).
similarity_matrix = cosine_similarity(scaled_features)

# Build the lookalike recommendations: for each of the first
# NUM_TARGET_CUSTOMERS customers, keep the TOP_K most similar *other*
# customers together with their similarity scores.
NUM_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalike_results = {}
customer_ids = customer_profiles['CustomerID'].values

for idx, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Exclude the customer itself by position in the matrix rather than by
    # assuming it sorts first: a customer with an identical profile (or a
    # floating-point rounding difference) can tie with or outrank the
    # self-similarity, in which case slicing [1:4] would keep the customer
    # as its own lookalike and discard a genuine match.
    candidate_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # sorted() is stable, so equally similar customers keep their original
    # row order.
    similar_customers = sorted(
        candidate_scores, key=lambda pair: pair[1], reverse=True
    )[:TOP_K]
    # Cast to a built-in float: NumPy >= 2 renders its scalars as
    # 'np.float64(...)' inside containers, which would otherwise leak into
    # the stringified lookalike list that is written to the CSV.
    lookalike_results[customer_id] = [
        (customer_ids[sim_idx], round(float(score), 4))
        for sim_idx, score in similar_customers
    ]

# Flatten the mapping into one row per target customer; the list of
# (lookalike id, score) pairs is stored as its string representation.
lookalike_rows = [
    (target_id, str(matches)) for target_id, matches in lookalike_results.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

# Persist the recommendations without the DataFrame index column.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations for first 20 customers saved to 'Lookalike.csv'")


Merged Data Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
Customer Profiles Columns: Index(['CustomerID', 'Quantity', 'TotalValue', 'Price', 'Category'], dtype='object')
  CustomerID  Quantity  TotalValue       Price     Category
0      C0001        12     3354.52  278.334000  Electronics
1      C0002        10     1862.74  208.920000     Clothing
2      C0003        14     2725.38  195.707500   Home Decor
3      C0004        23     5354.88  240.636250        Books
4      C0005         7     2034.24  291.603333  Electronics
Lookalike recommendations for first 20 customers saved to 'Lookalike.csv'
