In [1]:
# Import required libraries
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# Suppress warnings to ensure a clean output
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(r"data\Weight_Loss_App_Data.csv")

# Display the first few rows of the dataset
df.head()

Unnamed: 0,ID,Age,Gender,Income,Education,Occupation,Marital Status,Number of Children,City,State,...,Exercise Frequency,Smoking Status,Alcohol Consumption,Blood Pressure,Cholesterol Level,Diabetes Status,Allergies,Mental Health,Engagement Score,Label
0,1001,28,Male,55000,Bachelor's,Engineer,Single,0,New York,NY,...,3 times a week,Non-smoker,Occasionally,120/80,Normal,No,,Good,61.0,Customer
1,1002,34,Female,72000,Master's,Teacher,Married,2,Los Angeles,CA,...,Daily,Non-smoker,Non-drinker,110/70,Normal,No,Peanuts,Excellent,81.0,Customer
2,1003,45,Male,82000,Doctorate,Scientist,Married,3,Chicago,IL,...,Weekly,Non-smoker,Occasionally,130/85,High,No,,Good,18.0,Customer
3,1004,29,Female,65000,Bachelor's,Accountant,Single,0,San Francisco,CA,...,2 times a week,Non-smoker,Non-drinker,115/75,Normal,No,Dust,Good,64.0,Customer
4,1005,52,Male,90000,High School,Manager,Married,4,Houston,TX,...,Weekly,Smoker,Frequently,140/90,High,Yes,Pollen,Fair,0.0,Customer


## Preprocessing

In [2]:
# Function to split blood pressure into systolic and diastolic
def split_blood_pressure(bp):
    systolic, diastolic = bp.split('/')
    return int(systolic), int(diastolic)

# Apply the function to create new columns
df[['Systolic BP', 'Diastolic BP']] = df['Blood Pressure'].apply(lambda x: split_blood_pressure(x)).apply(pd.Series)

# Drop the original 'Blood Pressure' column
df = df.drop(columns=['Blood Pressure'])

In [3]:
# Drop the 'Engagement Score' and 'Label' columns for similarity calculation
features_df = df.drop(columns=['Engagement Score', 'Label'])

# One-hot encode categorical features
categorical_columns = ['Gender', 'Education', 'Occupation', 'Marital Status', 'City', 'State', 'Country', 
                       'Home Ownership', 'Car Ownership', 'Internet Access', 'Health Condition', 
                       'Exercise Frequency', 'Smoking Status', 'Alcohol Consumption', 'Cholesterol Level', 
                       'Diabetes Status', 'Allergies', 'Mental Health']

encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(features_df[categorical_columns])

# This code uses get_feature_names instead of get_feature_names_out to ensure compatibility with older versions of scikit-learn.
encoded_features_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names(categorical_columns))

# Standardize numerical features
numeric_columns = ['Age', 'Income', 'Number of Children', 'Height', 'Weight', 'Systolic BP', 'Diastolic BP']
scaler = StandardScaler()
scaled_numeric_features = scaler.fit_transform(features_df[numeric_columns])

scaled_numeric_features_df = pd.DataFrame(scaled_numeric_features, columns=numeric_columns)

# Combine the encoded categorical features and scaled numerical features
combined_features_df = pd.concat([scaled_numeric_features_df.reset_index(drop=True), encoded_features_df.reset_index(drop=True)], axis=1)

## Cosine Similarity

In [4]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(combined_features_df)

# Convert the similarity matrix to a DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=df['ID'], columns=df['ID'])

prospect_ids = df[df['Label'] == 'Prospect']['ID']
customer_ids = df[df['Label'] == 'Customer']['ID']
engagement_scores = df.set_index('ID')['Engagement Score']

# Create a DataFrame for average engagement scores
prospect_similar_CS_df = pd.DataFrame()
average_engagement_scores = []

# Calculate average engagement scores for each prospect based on their top 10 similar customers
for prospect_id in tqdm(prospect_ids):
    similar_customers = similarity_df.loc[prospect_id, customer_ids].nlargest(10).index
    average_score = engagement_scores.loc[similar_customers].mean()
    average_engagement_scores.append(average_score)
    
# Create the DataFrame with prospect IDs and their average similar engagement scores
prospect_similar_CS_df['ID'] = prospect_ids
prospect_similar_CS_df['Expected_Engagement_Score'] = average_engagement_scores

# Sort based on expected engagement score and reset the index for better readability
prospect_similar_CS_df = prospect_similar_CS_df.sort_values(by='Expected_Engagement_Score',ascending=False).reset_index(drop=True)
prospect_similar_CS_df.head()

100%|████████████████████████████████████████████████████████████████████████████| 1892/1892 [00:01<00:00, 1091.21it/s]


Unnamed: 0,ID,Expected_Engagement_Score
0,10465,65.3
1,10705,63.7
2,10415,61.7
3,9397,61.3
4,9876,61.0


## k-Nearest Neighbors (k-NN)

In [10]:
# Fit the KNN model with Euclidean distance
knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
knn.fit(combined_features_df)

# Find the 10 nearest neighbors for each prospect
distances, indices = knn.kneighbors(combined_features_df.loc[df['Label'] == 'Prospect'])

# Calculate the average engagement score for the 10 most similar customers for each prospect
prospect_ids = df[df['Label'] == 'Prospect']['ID']
engagement_scores = df.set_index('ID')['Engagement Score']

average_engagement_scores = []

for i, prospect_id in tqdm(enumerate(prospect_ids)):
    similar_customers = indices[i]
    similar_customers_ids = combined_features_df.iloc[similar_customers].index
    similar_customer_engagement_scores = engagement_scores.reindex(similar_customers_ids).dropna()
    average_score = similar_customer_engagement_scores.mean()
    average_engagement_scores.append({
        'ID': prospect_id,
        'Expected_Engagement_Score': average_score,
        **{f'Similar Customer ID {j+1}': similar_customers_ids[j] for j in range(len(similar_customers_ids))}
    })

# Create the DataFrame with prospect IDs and their average similar engagement scores
prospect_similar_KNN_df = pd.DataFrame(average_engagement_scores).sort_values(by='Expected_Engagement_Score',ascending=False).reset_index(drop=True)
prospect_similar_KNN_df.head()

1892it [00:00, 2249.17it/s]


Unnamed: 0,ID,Expected_Engagement_Score,Similar Customer ID 1,Similar Customer ID 2,Similar Customer ID 3,Similar Customer ID 4,Similar Customer ID 5,Similar Customer ID 6,Similar Customer ID 7,Similar Customer ID 8,Similar Customer ID 9,Similar Customer ID 10
0,10603,53.0,9602,283,4577,2219,382,460,8938,1135,1079,1507
1,9747,51.444444,8746,411,8171,4735,2491,3074,1011,4950,3622,2603
2,10199,51.285714,9198,1109,4791,5018,7883,6669,8158,9016,9263,2360
3,10677,51.285714,9676,5102,9238,5252,8106,9804,3593,6634,1888,2125
4,10126,51.0,9125,1115,7925,377,3386,1097,901,5134,2097,8836
