In [None]:
import warnings

# NOTE: this silences every warning category for the rest of the session,
# including deprecation / future warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# data handling libraries
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# model and evaluation libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
    make_scorer,
    average_precision_score,
)

# pandas display options: never truncate columns, print at most 200 rows,
# and render floats with five decimal places
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)
pd.set_option("display.float_format", "{:.5f}".format)

## Environment Setup

In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read
from google.colab import drive

drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

In [None]:
# Raw lead data, read from the mounted Google Drive
csv_path = "/content/drive/MyDrive/Python/ExtraaLearn.csv"
learn = pd.read_csv(csv_path)

In [None]:
# Work on a deep copy so `learn` keeps the data exactly as it was loaded
data = learn.copy(deep=True)

## Basic Overview

In [None]:
# Size of the dataset and the mean of `status`
# (the mean equals the share of converted leads, assuming `status` is coded 0/1)
print("Data shape:", data.shape)
print("Conversion rate:", data["status"].mean())

The dataset contains rows of leads and columns of features. The conversion rate is the proportion of leads with a positive outcome (status=1). This provides context on class imbalance for modelling.

In [None]:
# helper functions for visualization

def histogram_boxplot(df, feature, figsize=(12, 7), kde=False, bins="auto"):
    """Show a boxplot stacked on top of a density histogram for one column.

    The two panels share the x-axis. On the histogram, a dashed green line
    marks the column mean and a solid black line marks the median.

    Args:
        df: DataFrame holding the column to plot.
        feature: Name of the column in ``df``.
        figsize: Size of the whole figure, forwarded to ``plt.subplots``.
        kde: Forwarded to ``sns.histplot`` (overlay a density estimate).
        bins: Forwarded to ``sns.histplot``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    column = df[feature]

    # (position, colour, line style, legend label) for each reference line
    reference_lines = (
        (column.mean(), "green", "--", 'Mean'),
        (column.median(), "black", "-", 'Median'),
    )

    # two stacked panels: a short one for the boxplot, a tall one for the histogram
    fig, axes = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    top_ax, bottom_ax = axes

    # upper panel: boxplot with the mean marker shown
    sns.boxplot(x=column, ax=top_ax, showmeans=True, color="violet")
    top_ax.set_title(f"Distribution of {feature}")
    top_ax.set(xlabel='')  # the shared x-axis is labelled on the lower panel only

    # lower panel: density-scaled histogram plus the reference lines
    sns.histplot(column, kde=kde, ax=bottom_ax, bins=bins, stat="density")
    for position, colour, style, name in reference_lines:
        bottom_ax.axvline(position, color=colour, linestyle=style, label=name)
    bottom_ax.legend()

    plt.tight_layout()
    plt.show()


def labeled_barplot(df, feature, perc=False, n=None):
    """Plot a labelled bar chart of the value counts of one column.

    Args:
        df: DataFrame holding the column to plot.
        feature: Name of the column in ``df``.
        perc: If True, bar heights are percentages of ``len(df)`` (all rows
            of the frame); otherwise they are raw counts.
        n: If truthy, keep only the ``n`` most frequent categories.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(12, 6))

    # category frequencies, most frequent first
    data_series = df[feature].value_counts()

    if n:
        data_series = data_series.head(n)

    if perc:
        # convert counts to percentage of all rows in the frame
        data_series = (data_series / len(df)) * 100
        y_label = "Percentage"
        fmt_str = '%.1f%%'
    else:
        y_label = "Count"
        fmt_str = '%.0f'

    ax = sns.barplot(x=data_series.index, y=data_series.values, palette="Paired")

    # Label every container rather than only the first one: depending on the
    # seaborn version, the bars may be spread over several containers, and
    # labelling containers[0] alone would then annotate a single bar.
    for container in ax.containers:
        ax.bar_label(container, fmt=fmt_str, padding=3)

    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.title(f"{feature} - {y_label}")
    plt.tight_layout()
    plt.show()

## Exploratory Analysis

In [None]:
# Preserve lead IDs separately for the ranking output later. They are read from
# the untouched `learn` frame (same rows and index as `data`) so this cell can
# be re-run after `data` has already lost the column.
lead_ids = learn['ID']

# Drop the ID column from the working frame; errors='ignore' keeps the cell
# idempotent instead of raising KeyError on a second run.
data = data.drop(columns=['ID'], errors='ignore')

# Numeric features of interest
numeric_features = ["age", "website_visits", "time_spent_on_website", "page_views_per_visit"]

for feature in numeric_features:
    histogram_boxplot(data, feature)

### Observations from Numeric Features
- Longer time spent on the website tends to associate with higher conversion.
- Other numerical features show less pronounced separation between converters and non-converters but are included for completeness.

In [None]:
# Categorical columns to summarise as percentage bar charts
categorical_features = [
    "current_occupation",
    "first_interaction",
    "profile_completed",
    "referral",
]

for feature in categorical_features:
    labeled_barplot(df=data, feature=feature, perc=True)

### Observations from Categorical Features




## Data Preparation

In [None]:
# Target vector, and predictors with categorical columns one-hot encoded
# (drop_first=True drops one dummy level per categorical column)
y = data["status"]
X = pd.get_dummies(data.drop(columns="status"), drop_first=True)

first_columns = list(X.columns[:5])
print(f"Encoded features: {first_columns}...")

# 70/30 split, stratified on the target so both parts keep its class balance
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

train_balance = y_train.value_counts(normalize=True)
print("\nTarget balance (normalized):")
print(train_balance)

In [None]:
# Baseline decision tree with default size limits; class weights are balanced
dt = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=7,
)
dt.fit(X_train, y_train)

In [None]:
# Predict both splits first, then print the two reports in the same order
train_pred_dt = dt.predict(X_train)
test_pred_dt = dt.predict(X_test)

print("Training Performance: ")
print(classification_report(y_train, train_pred_dt))

print("Test Performance: ")
print(classification_report(y_test, test_pred_dt))


In [None]:
# Test-set confusion matrix of the baseline tree, drawn as an annotated heatmap
cm = confusion_matrix(y_test, test_pred_dt)

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Decision Tree: Test Confusion Matrix')
plt.show()

In [None]:
# Tune the decision tree: 5-fold grid search scored on recall
dt_param_grid = dict(
    max_depth=[3, 4, 5, 7, 10],
    criterion=['gini', 'entropy'],
    min_samples_leaf=[5, 10, 20],
)
dt_tuned = DecisionTreeClassifier(class_weight='balanced', random_state=7)

dt_grid = GridSearchCV(
    estimator=dt_tuned,
    param_grid=dt_param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

# best_estimator_ is the winning configuration as exposed by the fitted search
dt_best = dt_grid.best_estimator_

print("Best Parameters found:", dt_grid.best_params_)
print(f"Best Recall Score (CV): {dt_grid.best_score_:.3f}")

# held-out performance of the tuned tree
test_pred_dt_best = dt_best.predict(X_test)
print("\n--- Tuned Decision Tree Performance (Test) ---")
print(classification_report(y_test, test_pred_dt_best))

In [None]:
# Baseline random forest: 100 trees, balanced class weights
rf = RandomForestClassifier(n_estimators=100, random_state=7, class_weight='balanced')

# fit the model
rf.fit(X_train, y_train)

# training-set performance
print("--- Random Forest: Training Set ---")
train_pred_rf = rf.predict(X_train)

print(classification_report(y_train, train_pred_rf))

# check test performance
print("\n--- Random Forest: Test Set ---")
test_pred_rf = rf.predict(X_test)
print(classification_report(y_test, test_pred_rf))

# Importances are indexed by the columns of the frame the model was fitted on.
# (pandas is already imported in the first cell, so no re-import is needed here.)
feat_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print("\nTop 5 Important Features:")
print(feat_importances.nlargest(5))

In [None]:
# Tune the random forest: 5-fold grid search scored on recall.
# The class-weighting scheme is searched as part of the grid.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, 15],
    min_samples_leaf=[10, 20, 50],
    class_weight=['balanced', 'balanced_subsample'],
)
rf_tuned = RandomForestClassifier(random_state=7)

rf_grid = GridSearchCV(
    estimator=rf_tuned,
    param_grid=rf_param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# best_estimator_ is the winning configuration as exposed by the fitted search
rf_best = rf_grid.best_estimator_

print("Best Parameters Found:", rf_grid.best_params_)
print(f"Best Recall Score: {rf_grid.best_score_:.3f}")

# held-out performance of the tuned forest
test_pred_rf_best = rf_best.predict(X_test)
print("\n--- Tuned Random Forest Performance (Test) ---")
print(classification_report(y_test, test_pred_rf_best))

In [None]:
# Ranking evaluation and feature importance for random forest

# predicted probability of the positive class (second predict_proba column)
y_train_probs = rf_best.predict_proba(X_train)[:, 1]
y_test_probs = rf_best.predict_proba(X_test)[:, 1]

print("--- Ranking Metrics (Random Forest) ---")

print(f"ROC AUC (Train): {roc_auc_score(y_train, y_train_probs):.3f}")
print(f"ROC AUC (Test):  {roc_auc_score(y_test, y_test_probs):.3f}")
print(f"Average Precision (Test): {average_precision_score(y_test, y_test_probs):.3f}")

top_frac = 0.2
# Keep at least one lead: with n_top == 0 the slice [-0:] below would select
# every lead instead of none.
n_top = max(1, int(len(y_test) * top_frac))

# positions of the highest scoring leads (argsort is ascending, so take the tail)
top_indices = np.argsort(y_test_probs)[-n_top:]

# conversion rate for just these leads (positional lookup, hence .iloc)
top_conversion = y_test.iloc[top_indices].mean()

print(f"\nBusiness Check: Conversion Rate in Top {int(top_frac*100)}% of Leads: {top_conversion:.1%}")

# feature importance, one row per encoded training column
feat_imp_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_best.feature_importances_
})

# most important first
feat_imp_df = feat_imp_df.sort_values('importance', ascending=False)

print("\nTop 10 Drivers:")
print(feat_imp_df.head(10))

# horizontal bar chart of the 15 most important features
plt.figure(figsize=(10, 6))

sns.barplot(y='feature', x='importance', data=feat_imp_df.head(15), color='steelblue')
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

In [None]:

# Score every lead in the encoded feature matrix with the tuned forest
# (this covers rows used for training as well as the held-out rows).
full_scores = rf_best.predict_proba(X)[:, 1]

# Pair each score with its lead ID (`lead_ids` was set aside when the ID column
# was dropped) and order high to low so the sales team sees the best leads first.
ranked_leads = pd.DataFrame(
    {'lead_id': lead_ids, 'conversion_score': full_scores}
).sort_values('conversion_score', ascending=False)

# save to csv
ranked_leads.to_csv('ranked_leads.csv', index=False)

print("Success! Ranked leads saved to 'ranked_leads.csv'. Preview:")
print(ranked_leads.head(10))