In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 3.4 MB/s eta 0:00:04
   --- ------------------------------------ 1.0/11.1 MB 3.1 MB/s eta 0:00:04
   ----- ---------------------------------- 1.6/11.1 MB 3.0 MB/s eta 0:00:04
   --------- ------------------------------ 2.6/11.1 MB 3.4 MB/s eta 0:00:03
   ------------ --------------------------- 3.4/11.1 MB 3.4 MB/s eta 0:00:03
   -------------- -------------------

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Lookalike model: build one feature vector per customer from their
# transaction history, compare customers with cosine similarity, and export
# the closest matches for the first few customers to 'Lookalike.csv'.

# Tunable parameters for the report.
N_TARGET_CUSTOMERS = 20  # recommendations are produced for the first N rows of customer_profiles
TOP_N_LOOKALIKES = 3     # number of lookalikes reported per customer

# Load datasets, normalising the key column names so the merges below line up
# (rename is a no-op when the column is already spelled without a space).
customers = pd.read_csv('Customers.csv').rename(columns={'Customer ID': 'CustomerID'})
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv').rename(columns={'Product ID': 'ProductID'})

# Merge datasets for analysis. Both Transactions and Products carry a 'Price'
# column, so pandas suffixes them: 'Price_x' (transactions side) and
# 'Price_y' (products side).
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Debug: Check merged data columns
print("Merged Data Columns:", merged_data.columns)

# Aggregate to one row per customer.
customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg({
        'Quantity': 'sum',
        'TotalValue': 'sum',
        'Price_y': 'mean',  # Using Price from the Products.csv
        # Most frequent category; mode() may return several values on a tie,
        # in which case the first one is taken.
        'Category': lambda x: x.mode().iloc[0],
    })
    .reset_index()
    .rename(columns={'Price_y': 'Price'})  # clearer name for the mean product price
)

# Debug: Check aggregated customer_profiles
print("Customer Profiles Columns:", customer_profiles.columns)
print(customer_profiles.head())

# One-hot encode 'Category'. Every category keeps its own indicator column:
# dropping the first level (a regression device against collinearity) would
# encode that category as all zeros and treat it differently from the others
# when measuring similarity.
customer_profiles = pd.get_dummies(
    customer_profiles, columns=['Category'], drop_first=False, dtype=float
)

# Standardise all features so no single column dominates the similarity.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profiles.drop(columns=['CustomerID']))

# Pairwise cosine similarity; row i / column j compares customer i with customer j.
similarity_matrix = cosine_similarity(scaled_features)

# Creating the lookalike recommendations
lookalike_results_list = []
customer_ids = customer_profiles['CustomerID'].values

for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Exclude the customer itself by index rather than by assuming it sorts
    # first: a customer with an identical profile ties at the top score, and
    # floating-point rounding can reorder the top entries.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # sorted() is stable, so equal scores keep their customer_profiles order.
    best_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:TOP_N_LOOKALIKES]
    for sim_idx, score in best_matches:
        lookalike_results_list.append({
            'CustomerID': customer_id,
            'LookalikeID': customer_ids[sim_idx],
            'SimilarityScore': round(float(score), 4)
        })

# Convert the results to a DataFrame; columns are fixed explicitly so the CSV
# keeps the same header even when there are no results.
lookalike_df = pd.DataFrame(
    lookalike_results_list,
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)

# Save to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations saved to 'Lookalike.csv'")


Merged Data Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
Customer Profiles Columns: Index(['CustomerID', 'Quantity', 'TotalValue', 'Price', 'Category'], dtype='object')
  CustomerID  Quantity  TotalValue       Price     Category
0      C0001        12     3354.52  278.334000  Electronics
1      C0002        10     1862.74  208.920000     Clothing
2      C0003        14     2725.38  195.707500   Home Decor
3      C0004        23     5354.88  240.636250        Books
4      C0005         7     2034.24  291.603333  Electronics
Lookalike recommendations saved to 'Lookalike.csv'
