In [1]:
# Import required libraries
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# Suppress warnings to ensure a clean output
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(r"data\Weight_Loss_App_Data.csv")
# Display the first few rows of the dataset
df.head()

Unnamed: 0,ID,Age,Gender,Income,Education,Occupation,Marital Status,Number of Children,City,State,...,Exercise Frequency,Smoking Status,Alcohol Consumption,Blood Pressure,Cholesterol Level,Diabetes Status,Allergies,Mental Health,Engagement Score,Label
0,1001,28,Male,55000,Bachelor's,Engineer,Single,0,New York,NY,...,3 times a week,Non-smoker,Occasionally,120/80,Normal,No,,Good,61.0,Customer
1,1002,34,Female,72000,Master's,Teacher,Married,2,Los Angeles,CA,...,Daily,Non-smoker,Non-drinker,110/70,Normal,No,Peanuts,Excellent,81.0,Customer
2,1003,45,Male,82000,Doctorate,Scientist,Married,3,Chicago,IL,...,Weekly,Non-smoker,Occasionally,130/85,High,No,,Good,18.0,Customer
3,1004,29,Female,65000,Bachelor's,Accountant,Single,0,San Francisco,CA,...,2 times a week,Non-smoker,Non-drinker,115/75,Normal,No,Dust,Good,64.0,Customer
4,1005,52,Male,90000,High School,Manager,Married,4,Houston,TX,...,Weekly,Smoker,Frequently,140/90,High,Yes,Pollen,Fair,0.0,Customer


## Preprocessing

In [2]:
categorical_cols = ['Gender', 'Education', 'Occupation', 'Marital Status', 'City', 'State', 'Country', 
                       'Home Ownership', 'Car Ownership', 'Internet Access', 'Health Condition', 
                       'Exercise Frequency', 'Smoking Status', 'Alcohol Consumption', 'Cholesterol Level', 
                       'Diabetes Status', 'Allergies', 'Mental Health']
numeric_cols = ['Age', 'Income', 'Number of Children', 'Height', 'Weight', 'Systolic BP', 'Diastolic BP']

In [3]:
# Function to split blood pressure into systolic and diastolic
def split_blood_pressure(bp):
    systolic, diastolic = bp.split('/')
    return int(systolic), int(diastolic)

# Apply the function to create new columns
df[['Systolic BP', 'Diastolic BP']] = df['Blood Pressure'].apply(lambda x: split_blood_pressure(x)).apply(pd.Series)

# Drop the original 'Blood Pressure' column
df = df.drop(columns=['Blood Pressure'])

In [4]:
# Separate Customers from Prospects
customers_df= df[(df['Label'] == 'Customer')].reset_index(drop=True)
prospects_df = df[(df['Label'] == 'Prospect')].reset_index(drop=True)

In [5]:
# Initialize the OneHotEncoder
# Important: Fit the encoder only on Customers to prevent Prospects from introducing unseen categories
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')  # handle_unknown='ignore' handles unseen categories gracefully

# Fit the encoder on Customers' categorical data
encoder.fit(customers_df[categorical_cols])

# Transform both Customers and Prospects
encoded_customers = encoder.transform(customers_df[categorical_cols])

for col in categorical_cols:
    prospects_df[col] = prospects_df[col].apply(lambda x: x if x in encoder.categories_[categorical_cols.index(col)] else 'Unknown')
encoded_prospects = encoder.transform(prospects_df[categorical_cols])

# Convert encoded features to DataFrame with appropriate column names
encoded_feature_names = encoder.get_feature_names(categorical_cols)
encoded_customers_df = pd.DataFrame(encoded_customers, columns=encoded_feature_names)
encoded_prospects_df = pd.DataFrame(encoded_prospects, columns=encoded_feature_names)

# Ensure integer type
encoded_customers_df = encoded_customers_df.astype(int)
encoded_prospects_df = encoded_prospects_df.astype(int)

In [6]:
# Standardize all numerical columns
scaler = StandardScaler()

scaled_customers = scaler.fit_transform(customers_df[numeric_cols])
scaled_prospects = scaler.transform(prospects_df[numeric_cols])

scaled_customers_df = pd.DataFrame(scaled_customers, columns=numeric_cols, index=customers_df.index)
scaled_prospects_df = pd.DataFrame(scaled_prospects, columns=numeric_cols, index=prospects_df.index)

In [7]:
# Combine All Features for Customers and Prospects
# For Customers
combined_features_customers = pd.concat([
    scaled_customers_df.reset_index(drop=True),
    encoded_customers_df.reset_index(drop=True)
], axis=1)

# For Prospects
combined_features_prospects = pd.concat([
    scaled_prospects_df.reset_index(drop=True),
    encoded_prospects_df.reset_index(drop=True)
], axis=1)

## Cosine Similarity

In [8]:
# First, extract the metadata for customers and prospects from the original df.
customer_info = df[df['Label'] == 'Customer'][['ID', 'Engagement Score']].reset_index(drop=True)
prospect_info = df[df['Label'] == 'Prospect'][['ID']].reset_index(drop=True)

# Compute the cosine similarity between each prospect and all customers
similarity_matrix = cosine_similarity(combined_features_prospects, combined_features_customers)

In [9]:
results = []

# For each prospect, find the top 5 most similar customers.
for i, prospect_id in tqdm(enumerate(prospect_info['ID']), total=len(prospect_info['ID'])):
    # Similarity scores for the current prospect against all customers.
    similarities = similarity_matrix[i]
    
    # Get the indices of the top 5 customers (highest similarity scores).
    top_5_indices = np.argsort(similarities)[-5:][::-1]
    
    # Retrieve the corresponding customer IDs and engagement scores.
    cs_similar_customer_ids = customer_info.iloc[top_5_indices]['ID'].values
    cs_similar_engagement_scores = customer_info.iloc[top_5_indices]['Engagement Score'].values
    
    # Calculate the expected engagement score as the average of the 5 customers' scores.
    expected_engagement = np.mean(cs_similar_engagement_scores)
    
    # Append the result in a dictionary.
    results.append({
        "Prospect_ID": prospect_id,
        "Similar_Customer_ID_1": cs_similar_customer_ids[0],
        "Similar_Customer_ID_2": cs_similar_customer_ids[1],
        "Similar_Customer_ID_3": cs_similar_customer_ids[2],
        "Similar_Customer_ID_4": cs_similar_customer_ids[3],
        "Similar_Customer_ID_5": cs_similar_customer_ids[4],
        "Expected_Engagement_Score": expected_engagement
    })

# Convert the list of dictionaries into a DataFrame.
output_cs_df = pd.DataFrame(results)
output_cs_df.head()

100%|████████████████████████████████████████████████████████████████████████████| 1892/1892 [00:01<00:00, 1506.90it/s]


Unnamed: 0,Prospect_ID,Similar_Customer_ID_1,Similar_Customer_ID_2,Similar_Customer_ID_3,Similar_Customer_ID_4,Similar_Customer_ID_5,Expected_Engagement_Score
0,9001,1559,1663,8351,6771,6164,30.6
1,9002,4192,7812,4502,1170,2775,13.6
2,9003,3435,1107,7703,6154,8412,26.0
3,9004,8078,3175,8721,7069,4318,36.2
4,9005,5500,5805,3486,4356,5644,25.6


## k-Nearest Neighbors (k-NN)

In [10]:
# Fit KNN on Customers Only
knn = NearestNeighbors(n_neighbors=5, metric='euclidean', n_jobs=14)
knn.fit(combined_features_customers)

NearestNeighbors(metric='euclidean', n_jobs=14)

In [11]:
# Find Nearest Neighbors for Each Prospect
distances, knn_indices = knn.kneighbors(combined_features_prospects)

In [12]:
results_knn = []
epsilon = 1e-9  # A small value to avoid division by zero.

# Loop over each prospect (using tqdm for progress display)
for i, prospect_id in tqdm(enumerate(prospect_info['ID']), total=len(prospect_info['ID'])):
    # Get indices and distances for this prospect's nearest neighbors.
    current_indices = knn_indices[i]
    current_dists = distances[i]
    
    # Retrieve the corresponding customer IDs and their Engagement Scores.
    knn_similar_customer_ids = customer_info.iloc[current_indices]['ID'].values
    knn_similar_engagement_scores = customer_info.iloc[current_indices]['Engagement Score'].values
    
    # Retrieve the distances for these customers.
    current_dists = distances[i]
    
    # Calculate weights as the inverse of distances (adding epsilon to avoid division by zero).
    weights = 1.0 / (current_dists + epsilon)
    
    # Compute the weighted average of Engagement Score.
    weighted_engagement = np.sum(weights * knn_similar_engagement_scores) / np.sum(weights)
    
    # Round the weighted engagement to one decimal point.
    weighted_engagement = round(weighted_engagement, 1)
    
    results_knn.append({
        "Prospect_ID": prospect_id,
        "Similar_Customer_ID_1": knn_similar_customer_ids[0],
        "Similar_Customer_ID_2": knn_similar_customer_ids[1],
        "Similar_Customer_ID_3": knn_similar_customer_ids[2],
        "Similar_Customer_ID_4": knn_similar_customer_ids[3],
        "Similar_Customer_ID_5": knn_similar_customer_ids[4],
        "Expected_Engagement_Score": weighted_engagement
    })

# Convert the list of dictionaries to a DataFrame.
output_knn_df = pd.DataFrame(results_knn)
output_knn_df.head()

100%|████████████████████████████████████████████████████████████████████████████| 1892/1892 [00:00<00:00, 3561.92it/s]


Unnamed: 0,Prospect_ID,Similar_Customer_ID_1,Similar_Customer_ID_2,Similar_Customer_ID_3,Similar_Customer_ID_4,Similar_Customer_ID_5,Expected_Engagement_Score
0,9001,1663,1559,6338,5772,2611,27.8
1,9002,4192,7812,4502,1170,2775,13.7
2,9003,3435,1107,7703,1261,6154,34.3
3,9004,3175,8078,8721,4318,7069,36.2
4,9005,5500,5805,5295,7296,3682,31.6
