## Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score
import matplotlib.pyplot as plt

In [15]:
# Load the lead-scoring dataset into `df`.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('course_lead_scoring.csv')

In [19]:
# Preview the first 10 rows of the dataset.
# NOTE(review): the saved output below has no `converted` column, so this cell was
# last executed (count 19) after the target had been dropped from `df` (count 16).
# Restart the kernel and run all cells to refresh the preview.
df.head(10)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94
1,social_media,retail,1,46992.0,employed,south_america,1,0.8
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69
3,paid_ads,retail,2,83843.0,,australia,1,0.87
4,referral,education,3,85012.0,self_employed,europe,3,0.62
5,events,manufacturing,1,59904.0,,africa,6,0.83
6,social_media,technology,0,51283.0,,middle_east,2,0.57
7,social_media,,5,62975.0,student,europe,4,0.62
8,referral,healthcare,4,38648.0,unemployed,south_america,2,0.86
9,paid_ads,other,3,59866.0,student,australia,3,0.43


In [16]:
# Split the target off the feature frame.
# The guard makes the cell safe to re-run: once `converted` has been removed from
# `df`, executing it again keeps the existing `y` instead of raising KeyError.
if 'converted' in df.columns:
    y = df['converted']
    df = df.drop(columns='converted')
elif 'y' not in globals():
    # Fresh kernel and no target column: fail loudly, as the unguarded code did.
    raise KeyError("'converted' column not found in df")

## Data Preparation
### 1. Check for missing values

In [5]:
# Number of missing entries in each column
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
dtype: int64

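We observe missing values in five columns: `lead_source`, `industry`, `annual_income`, `employment_status`, and `location`. The remaining columns are complete.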

### 2. Impute missing values

In [7]:
# Partition the columns by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical = list(df.select_dtypes(include=['object']).columns)
numerical = list(df.select_dtypes(exclude=['object']).columns)

# Build one column -> fill-value mapping ('NA' for categorical, 0.0 for numerical)
# and apply it in a single fillna call.
fill_values = {
    **dict.fromkeys(categorical, 'NA'),
    **dict.fromkeys(numerical, 0.0),
}
df = df.fillna(value=fill_values)

### 3. Train/Validation/Test Split

In [8]:
# Stage 1: hold out 40% of the rows, stratifying on the target.
df_train, df_temp, y_train, y_temp = train_test_split(
    df, y, stratify=y, test_size=0.4, random_state=1
)
# Stage 2: cut the held-out part in half -> validation and test (20% of the data each).
df_val, df_test, y_val, y_test = train_test_split(
    df_temp, y_temp, stratify=y_temp, test_size=0.5, random_state=1
)

## Question 1: ROC AUC Feature Importance

In [9]:
numerical_features = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

# Score every numerical feature on the training split by using its raw values
# as if they were the model's predictions.
auc_scores = {}
for feature in numerical_features:
    feature_values = df_train[feature]
    feature_auc = roc_auc_score(y_train, feature_values)
    # An AUC below 0.5 means the feature is inversely related to the target;
    # negate the values and score it again.
    auc_scores[feature] = (
        roc_auc_score(y_train, -feature_values) if feature_auc < 0.5 else feature_auc
    )

# Strongest single feature together with its AUC
best_feature = max(auc_scores, key=auc_scores.get)
best_feature, auc_scores[best_feature]

('number_of_courses_viewed', 0.7648046448539385)

## Question 2: Train Logistic Regression and Compute AUC on Validation

In [10]:
# Turn both splits into lists of per-row dicts for the vectorizer
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for validation
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Fit the logistic regression (fit returns the estimator itself)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000).fit(X_train, y_train)

# Probability of the positive class for every validation row
y_pred_val = model.predict_proba(X_val)[:, 1]

# Validation AUC, shown to 3 decimals
auc_val = roc_auc_score(y_val, y_pred_val)
round(auc_val, 3)

0.866

## Question 3: Precision and Recall Curves

In [11]:
precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_val)

# Position where precision and recall are closest to each other
closest = np.argmin(np.abs(precisions - recalls))

# `thresholds` can be shorter than `precisions`/`recalls`; if the closest point
# falls past its end, use 1.0 as the threshold.
if closest < len(thresholds):
    intersection_threshold = thresholds[closest]
else:
    intersection_threshold = 1.0

# Shown to 3 decimals
round(intersection_threshold, 3)

np.float64(0.616)

## Question 4: F1 Score Maximization

In [12]:
# Candidate decision thresholds from 0.00 up to 1.00 in steps of 0.01
thresholds_f1 = np.arange(0.0, 1.01, 0.01)

# F1 on the validation split when predicting positive for probabilities >= cutoff
f1_scores = [
    f1_score(y_val, (y_pred_val >= cutoff).astype(int))
    for cutoff in thresholds_f1
]

# First threshold that attains the highest F1, shown to 2 decimals
best_idx = np.argmax(f1_scores)
best_threshold_f1 = thresholds_f1[best_idx]
round(best_threshold_f1, 2)

np.float64(0.58)

## Question 5: 5-Fold Cross-Validation (AUC Std Dev)

In [13]:
# Pool the train and validation splits for cross-validation; df_test is not used here.
df_full_train = pd.concat([df_train, df_val])
y_full_train = pd.concat([y_train, y_val])

kf = KFold(n_splits=5, shuffle=True, random_state=1)

auc_scores_cv = []

for train_idx, val_idx in kf.split(df_full_train):
    df_tr, y_tr = df_full_train.iloc[train_idx], y_full_train.iloc[train_idx]
    df_vl, y_vl = df_full_train.iloc[val_idx], y_full_train.iloc[val_idx]

    # Fold-local vectorizer and model. They are deliberately NOT named `dv` /
    # `model`, so the objects fitted on the training split in Question 2 are not
    # silently replaced by the last fold's objects.
    fold_dv = DictVectorizer(sparse=False)
    X_tr = fold_dv.fit_transform(df_tr.to_dict(orient='records'))
    X_vl = fold_dv.transform(df_vl.to_dict(orient='records'))

    fold_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    fold_model.fit(X_tr, y_tr)

    # AUC of this fold's held-out part
    fold_pred = fold_model.predict_proba(X_vl)[:, 1]
    auc_scores_cv.append(roc_auc_score(y_vl, fold_pred))

# Spread of the AUC across the 5 folds
np.std(auc_scores_cv).round(4)

np.float64(0.0306)

## Question 6: Hyperparameter Tuning (C values)

In [14]:
C_values = [0.000001, 0.001, 1]

# The fold assignment (kf uses a fixed integer random_state) and the DictVectorizer
# encoding do not depend on C, so build each fold's matrices once instead of
# re-vectorizing the same rows for every C value.
cv_folds = []
for train_idx, val_idx in kf.split(df_full_train):
    fold_dv = DictVectorizer(sparse=False)
    X_tr = fold_dv.fit_transform(df_full_train.iloc[train_idx].to_dict(orient='records'))
    X_vl = fold_dv.transform(df_full_train.iloc[val_idx].to_dict(orient='records'))
    cv_folds.append((X_tr, y_full_train.iloc[train_idx], X_vl, y_full_train.iloc[val_idx]))

# C -> (mean AUC, std of AUC) over the folds, both rounded to 3 decimals
results = {}
for C in C_values:
    # Named `fold_aucs` (not `auc_scores`) so the per-feature AUC dict built in
    # Question 1 is not overwritten with a list; `fold_model` likewise leaves the
    # Question 2 `model` untouched.
    fold_aucs = []
    for X_tr, y_tr, X_vl, y_vl in cv_folds:
        fold_model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        fold_model.fit(X_tr, y_tr)
        fold_aucs.append(roc_auc_score(y_vl, fold_model.predict_proba(X_vl)[:, 1]))

    # Plain floats keep the displayed dict free of np.float64(...) wrappers
    results[C] = (
        round(float(np.mean(fold_aucs)), 3),
        round(float(np.std(fold_aucs)), 3),
    )

results

{1e-06: (np.float64(0.553), np.float64(0.014)),
 0.001: (np.float64(0.867), np.float64(0.019)),
 1: (np.float64(0.826), np.float64(0.031))}