In [2]:
pip install pandas scikit-learn numpy





[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
 

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

In [5]:
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID')


In [6]:
# Aggregate transaction data for each customer
customer_data = merged_df.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    num_transactions=('TransactionID', 'nunique')
).reset_index()


In [7]:
# 2. Product preferences calculation
# Merge with Products to get product categories for each transaction
merged_with_products = pd.merge(merged_df, products_df, on='ProductID')

# Aggregate product preferences by category for each customer
category_preferences = merged_with_products.groupby(['CustomerID', 'Category']).agg(
    category_spend=('TotalValue', 'sum')
).unstack(fill_value=0)
category_preferences.columns = category_preferences.columns.droplevel()

# Combine customer data and category preferences
final_customer_profiles = pd.concat([customer_data.set_index('CustomerID'), category_preferences], axis=1)

In [8]:
# 3. Normalization: Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(final_customer_profiles)

# 4. Similarity Calculation: Calculate cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

# 5. Find the top 3 most similar customers
lookalike_results = []
for idx, customer_id in enumerate(final_customer_profiles.index):
    # Get pairwise similarities for this customer
    similarities = similarity_matrix[idx]
    # Exclude self-similarity
    similar_indices = np.argsort(similarities)[::-1][1:4]
    
    similar_customers = [(final_customer_profiles.index[i], similarities[i]) for i in similar_indices]
    
    lookalike_results.append({
        'cust_id': customer_id,
        'similar_customers': similar_customers
    })



In [11]:
# 6. Save the results in a CSV file
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df['similar_customers'] = lookalike_df['similar_customers'].apply(lambda x: str(x))

lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

print("Lookalike model has been saved as 'ANDRAJU_SAI LIKHITHA_Lookalike.csv'.")

Lookalike model has been saved as 'ANDRAJU_SAI LIKHITHA_Lookalike.csv'.
