# Imports

In [5]:
# 0. Imports & basic setup

import pandas as pd
import numpy as np

from datetime import datetime

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    classification_report
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from xgboost import XGBClassifier

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

RANDOM_STATE = 42

pd.set_option("display.max_columns", 100)


# Intro & Business Framing

# Part 3 – Predictive Intelligence: Lead Prioritization

## 1. Business Problem

Sales teams are overwhelmed with leads coming from Facebook campaigns.  
Some leads will eventually become qualified opportunities or closed deals, but many will not.  
Treating all leads equally wastes time and slows down response times for high-value leads.

## 2. Objective

Build a **binary classification model** that estimates the probability that a new lead will become a **converted / high-quality lead**.

We will use this model to:

- Prioritise **which leads should be called first** (High / Medium / Low priority).
- Help sales teams focus their time on the leads with the **highest likelihood of conversion**.
- Provide a score that can be surfaced in dashboards or CRM workflows.

## 3. Conversion Definition

For this analysis, we define a **converted lead** as a lead whose final status is one of:

- `QUALIFIED`
- `HIGH_INTEREST`
- `MEETING_DONE`
- `RESALE_REQUEST`
- `DONE_DEAL`
- `ALREADY_BOUGHT`

All other final statuses are treated as **not converted**.

> Note: With more business input, the exact list of “conversion” statuses can be adjusted (e.g. only `DONE_DEAL` and `ALREADY_BOUGHT`).

## 4. Modeling Goal

- Input: lead-level information (lead metadata, campaign configuration, historical campaign performance).
- Output: a **conversion probability** and a **priority bucket** (High / Medium / Low) for each lead.


# 2) Data Loading & Preparation

In [6]:
# 2. Data Loading & Preparation

# 2.1 Load raw CSVs
campaign_leads = pd.read_csv("campaign_leads.csv")
campaigns = pd.read_csv("campaigns.csv")
insights = pd.read_csv("insights.csv")
lead_status_changes = pd.read_csv("lead_status_changes.csv")  # not heavily used in this first version

# 2.2 Basic cleaning / renaming
campaign_leads = campaign_leads.rename(columns={"id": "lead_id"})
campaigns = campaigns.rename(columns={"id": "campaign_id"})

# 2.3 Parse dates
campaign_leads["added_date"] = pd.to_datetime(campaign_leads["added_date"])
insights["created_at"] = pd.to_datetime(insights["created_at"])
lead_status_changes["created_at"] = pd.to_datetime(lead_status_changes["created_at"])

# Derive a pure date column (without time) where needed
campaign_leads["date"] = campaign_leads["added_date"].dt.date
insights["date"] = insights["created_at"].dt.date

# 2.4 Define conversion label based on final lead_status
conversion_statuses = [
    "DONE_DEAL",
    "ALREADY_BOUGHT",
    "RESALE_REQUEST",
    "MEETING_DONE",
    "HIGH_INTEREST",
    "QUALIFIED",
]

campaign_leads["is_converted"] = campaign_leads["lead_status"].isin(conversion_statuses).astype(int)

# 2.5 Join leads with campaign configuration
leads_cfg = campaign_leads.merge(
    campaigns[["campaign_id", "user_id", "project_name", "daily_budget"]],
    on="campaign_id",
    how="left"
)

# 2.6 Aggregate insights per campaign (historical performance)
insights_agg = (
    insights.groupby("campaign_id", as_index=False)
    .agg(
        hist_spend=("spend", "sum"),
        hist_clicks=("clicks", "sum"),
        hist_impr=("impressions", "sum")
    )
)

# Avoid division by zero
insights_agg["hist_ctr"] = np.where(
    insights_agg["hist_impr"] > 0,
    insights_agg["hist_clicks"] / insights_agg["hist_impr"],
    0.0
)
insights_agg["hist_cpc"] = np.where(
    insights_agg["hist_clicks"] > 0,
    insights_agg["hist_spend"] / insights_agg["hist_clicks"],
    0.0
)
insights_agg["hist_cpm"] = np.where(
    insights_agg["hist_impr"] > 0,
    insights_agg["hist_spend"] / insights_agg["hist_impr"] * 1000,
    0.0
)

# 2.7 Merge aggregated performance back to leads
leads_model = leads_cfg.merge(
    insights_agg,
    on="campaign_id",
    how="left"
)

# Replace missing performance with zeros (campaign with no insights)
for col in ["hist_spend", "hist_clicks", "hist_impr", "hist_ctr", "hist_cpc", "hist_cpm"]:
    leads_model[col] = leads_model[col].fillna(0.0)

# 2.8 Feature engineering: date parts & text lengths
leads_model["day_of_week"] = leads_model["added_date"].dt.weekday  # 0=Monday
leads_model["month"] = leads_model["added_date"].dt.month

leads_model["name_length"] = leads_model["name"].astype(str).str.len()
leads_model["email_length"] = leads_model["email"].astype(str).str.len()
leads_model["phone_length"] = leads_model["phone"].astype(str).str.len()

# Convert user_id to string for categorical encoding
leads_model["user_id_str"] = leads_model["user_id"].astype(str)

# Keep only rows with non-null label
leads_model = leads_model.dropna(subset=["is_converted"])

print("Modeling dataset shape:", leads_model.shape)
leads_model.head()


Modeling dataset shape: (56965, 24)


Unnamed: 0,lead_id,campaign_id,name,email,phone,lead_status,added_date,date,is_converted,user_id,project_name,daily_budget,hist_spend,hist_clicks,hist_impr,hist_ctr,hist_cpc,hist_cpm,day_of_week,month,name_length,email_length,phone_length,user_id_str
0,77930,6496,Vicky Mohr Sr.,smith.oren@example.org,+2015628437879,UNKNOWN,2024-06-01 17:00:40,2024-06-01,0,318,Il Cazar Safia north coast,2000,2137.86,156.0,8019.0,0.019454,13.704231,266.599327,5,6,14,22,14,318
1,77942,6496,Clovis Mueller,toy.korey@example.net,+1-615-909-5723,UNKNOWN,2024-06-01 18:22:00,2024-06-01,0,318,Il Cazar Safia north coast,2000,2137.86,156.0,8019.0,0.019454,13.704231,266.599327,5,6,14,21,15,318
2,77964,6493,Austin Ondricka II,hveum@example.org,+2014034035897,NOT_QUALIFIED,2024-06-01 21:56:33,2024-06-01,0,466,Azha North Coast Ras El Hekma,1200,689.19,418.0,6270.0,0.066667,1.64878,109.91866,5,6,18,17,14,466
3,77983,6500,Jaquan Kuhn,greenholt.elsa@example.com,310-242-6257,UNKNOWN,2024-06-02 01:13:19,2024-06-02,0,217,sky ad - new cairo launch,1200,842.4,62.0,1086.0,0.05709,13.587097,775.690608,6,6,11,26,12,217
4,77988,6496,Prof. Grayson Collier,obie83@example.com,+2016402961934,UNKNOWN,2024-06-02 01:41:56,2024-06-02,0,318,Il Cazar Safia north coast,2000,2137.86,156.0,8019.0,0.019454,13.704231,266.599327,6,6,21,18,14,318


# EDA for Modeling 

## 3. Exploratory Analysis of the Modeling Dataset

In this section we check:

- Class balance: how many converted vs. non-converted leads.
- Basic statistics of key numeric features.
- Sanity checks on important categorical features (project, user, lead_status).


In [7]:
# 3.1 Class balance
class_counts = leads_model["is_converted"].value_counts()
class_ratio = leads_model["is_converted"].value_counts(normalize=True) * 100

print("Class counts (0 = not converted, 1 = converted):")
print(class_counts)
print("\nClass ratio (%):")
print(class_ratio.round(2))

# 3.2 Basic stats for numeric features
numeric_cols_preview = [
    "daily_budget", "hist_spend", "hist_clicks", "hist_impr",
    "hist_ctr", "hist_cpc", "hist_cpm",
    "day_of_week", "month",
    "name_length", "email_length", "phone_length"
]

leads_model[numeric_cols_preview].describe().T

# 3.3 Quick look at top categories
print("\nTop projects:")
print(leads_model["project_name"].value_counts().head(10))

print("\nTop users (user_id):")
print(leads_model["user_id"].value_counts().head(10))

print("\nLead status distribution:")
print(leads_model["lead_status"].value_counts().head(10))


Class counts (0 = not converted, 1 = converted):
is_converted
0    52529
1     4436
Name: count, dtype: int64

Class ratio (%):
is_converted
0    92.21
1     7.79
Name: proportion, dtype: float64

Top projects:
project_name
Kings Way                        6023
Plage                            5746
Mountain View iCity October      4197
Sarai                            2319
Mountain View ICity New Cairo    1572
WonderMarQ - WaterMarQ           1388
Ora Solana West                  1095
cityscape 2023                   1087
solare - misr italia             1079
Palm Hills New Cairo             1066
Name: count, dtype: int64

Top users (user_id):
user_id
468     5953
199     5180
466     3450
411     1936
1243    1828
1019    1425
1535    1135
837      974
1005     901
365      900
Name: count, dtype: int64

Lead status distribution:
lead_status
UNKNOWN          27582
NEW_LEAD         14724
QUALIFIED         4063
NO_ANSWER         3124
NOT_QUALIFIED     2821
CALL_AGAIN         923
FOLLOW_

In [8]:
# 3.x – Check imbalance more explicitly

pos = leads_model["is_converted"].sum()
neg = len(leads_model) - pos

print(f"Total leads    : {len(leads_model):,}")
print(f"Positives (1)  : {pos:,}")
print(f"Negatives (0)  : {neg:,}")
print(f"Positive ratio : {pos / len(leads_model) * 100:.2f}%")



Total leads    : 56,965
Positives (1)  : 4,436
Negatives (0)  : 52,529
Positive ratio : 7.79%


# 4) Train/Test Split (90/10)

## 4. Train / Test Split (90/10)

We will:

- Build a feature matrix `X` and label vector `y`.
- Use a 90/10 split with stratification on the target to preserve class balance.


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    classification_report
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

RANDOM_STATE = 42

# =========================================
# 4) Train / Test Split (90/10)
# =========================================

# 4.1 Feature selection

numeric_features = [
    "daily_budget",
    "hist_spend", "hist_clicks", "hist_impr",
    "hist_ctr", "hist_cpc", "hist_cpm",
    "day_of_week", "month",
    "name_length", "email_length", "phone_length"
]

categorical_features = [
    "project_name",
    "lead_status",
    "user_id_str"
]

feature_cols = numeric_features + categorical_features

X = leads_model[feature_cols].copy()
y = leads_model["is_converted"].astype(int)

# 4.2 Train/test split – 90% train, 10% test with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=RANDOM_STATE,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# =========================================
# 5) Preprocessing Pipeline
# =========================================

numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)



pos_train = y_train.sum()
neg_train = len(y_train) - pos_train
scale_pos_weight = neg_train / pos_train
print(f"scale_pos_weight for XGBoost (train only) = {scale_pos_weight:.2f}")

models = {
    "Baseline (Most Frequent)": DummyClassifier(
        strategy="most_frequent",
        random_state=RANDOM_STATE
    ),
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=RANDOM_STATE,
        class_weight="balanced"   #
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        class_weight="balanced"  
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight  
    ),
    "AdaBoost": AdaBoostClassifier(
        n_estimators=200,
        learning_rate=0.1,
        random_state=RANDOM_STATE
    ),
}

results = []
fitted_models = {}
probas_test = {}

for name, clf in models.items():
    print(f"\n===== Training: {name} =====")
    
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("model", clf)
        ]
    )
    
    pipe.fit(X_train, y_train)
    
    y_pred = pipe.predict(X_test)
    
    if hasattr(pipe["model"], "predict_proba"):
        y_proba = pipe.predict_proba(X_test)[:, 1]
    else:
        y_proba = y_pred.astype(float)
    
    acc = accuracy_score(y_test, y_pred)
    try:
        roc = roc_auc_score(y_test, y_proba)
    except ValueError:
        roc = np.nan
    try:
        pr_auc = average_precision_score(y_test, y_proba)
    except ValueError:
        pr_auc = np.nan
    
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC AUC : {roc:.4f}")
    print(f"PR AUC  : {pr_auc:.4f}")
    
    results.append({
        "model": name,
        "accuracy": acc,
        "roc_auc": roc,
        "pr_auc": pr_auc
    })
    
    fitted_models[name] = pipe
    probas_test[name] = y_proba

results_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
results_df


Train shape: (51268, 15)
Test shape : (5697, 15)
scale_pos_weight for XGBoost (train only) = 11.84

===== Training: Baseline (Most Frequent) =====
Accuracy: 0.9221
ROC AUC : 0.5000
PR AUC  : 0.0779

===== Training: Logistic Regression =====
Accuracy: 1.0000
ROC AUC : 1.0000
PR AUC  : 1.0000

===== Training: Random Forest =====
Accuracy: 0.9986
ROC AUC : 1.0000
PR AUC  : 1.0000

===== Training: XGBoost =====
Accuracy: 1.0000
ROC AUC : 1.0000
PR AUC  : 1.0000

===== Training: AdaBoost =====
Accuracy: 0.9932
ROC AUC : 0.9996
PR AUC  : 0.9955


Unnamed: 0,model,accuracy,roc_auc,pr_auc
1,Logistic Regression,1.0,1.0,1.0
3,XGBoost,1.0,1.0,1.0
2,Random Forest,0.998596,1.0,1.0
4,AdaBoost,0.993154,0.999623,0.995516
0,Baseline (Most Frequent),0.922064,0.5,0.077936


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    classification_report
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

RANDOM_STATE = 42

# =========================================
# 4) Train / Test Split (90/10)
# =========================================

# 4.1 Feature selection

numeric_features = [
    "daily_budget",
    "hist_spend", "hist_clicks", "hist_impr",
    "hist_ctr", "hist_cpc", "hist_cpm",
    "day_of_week", "month",
    "name_length", "email_length", "phone_length"
]

categorical_features = [
    "project_name",
    "user_id_str"
]

feature_cols = numeric_features + categorical_features

X = leads_model[feature_cols].copy()
y = leads_model["is_converted"].astype(int)

# 4.2 Train/test split – 90% train, 10% test with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=RANDOM_STATE,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# =========================================
# 5) Preprocessing Pipeline
# =========================================

numeric_transformer = StandardScaler()

# لو عندك نسخة قديمة من سكيت-ليرن استخدم sparse بدل sparse_output
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pos_train = y_train.sum()
neg_train = len(y_train) - pos_train
scale_pos_weight = neg_train / pos_train
print(f"scale_pos_weight for XGBoost (train only) = {scale_pos_weight:.2f}")

# =========================================
# 6) Models to compare
# =========================================

models = {
    "Baseline (Most Frequent)": DummyClassifier(
        strategy="most_frequent",
        random_state=RANDOM_STATE
    ),
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=RANDOM_STATE,
        class_weight="balanced"
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        class_weight="balanced"
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight
    ),
    "AdaBoost": AdaBoostClassifier(
        n_estimators=200,
        learning_rate=0.1,
        random_state=RANDOM_STATE
    ),
}

results = []
fitted_models = {}
probas_test = {}

for name, clf in models.items():
    print(f"\n===== Training: {name} =====")
    
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("model", clf)
        ]
    )
    
    pipe.fit(X_train, y_train)
    
    y_pred = pipe.predict(X_test)
    
    if hasattr(pipe["model"], "predict_proba"):
        y_proba = pipe.predict_proba(X_test)[:, 1]
    else:
        y_proba = y_pred.astype(float)
    
    acc = accuracy_score(y_test, y_pred)
    try:
        roc = roc_auc_score(y_test, y_proba)
    except ValueError:
        roc = np.nan
    try:
        pr_auc = average_precision_score(y_test, y_proba)
    except ValueError:
        pr_auc = np.nan
    
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC AUC : {roc:.4f}")
    print(f"PR AUC  : {pr_auc:.4f}")
    
    results.append({
        "model": name,
        "accuracy": acc,
        "roc_auc": roc,
        "pr_auc": pr_auc
    })
    
    fitted_models[name] = pipe
    probas_test[name] = y_proba

results_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
results_df


Train shape: (51268, 14)
Test shape : (5697, 14)
scale_pos_weight for XGBoost (train only) = 11.84

===== Training: Baseline (Most Frequent) =====
Accuracy: 0.9221
ROC AUC : 0.5000
PR AUC  : 0.0779

===== Training: Logistic Regression =====
Accuracy: 0.7416
ROC AUC : 0.8657
PR AUC  : 0.3741

===== Training: Random Forest =====
Accuracy: 0.9082
ROC AUC : 0.8508
PR AUC  : 0.3360

===== Training: XGBoost =====
Accuracy: 0.8032
ROC AUC : 0.8736
PR AUC  : 0.3690

===== Training: AdaBoost =====
Accuracy: 0.9221
ROC AUC : 0.7353
PR AUC  : 0.2591


Unnamed: 0,model,accuracy,roc_auc,pr_auc
3,XGBoost,0.80323,0.873614,0.368966
1,Logistic Regression,0.741618,0.865741,0.374136
2,Random Forest,0.908197,0.850844,0.336034
4,AdaBoost,0.922064,0.735332,0.2591
0,Baseline (Most Frequent),0.922064,0.5,0.077936


## 7. Model selection and detailed evaluation

From the previous comparison, we select **XGBoost** as our primary model:
- It achieves the best ROC AUC (~0.90) and competitive PR AUC.
- Logistic Regression is kept as a strong, simple baseline.

In this section, we will:
- Extract predictions and probabilities from XGBoost on the test set.
- Compute a classification report (precision, recall, F1).
- Inspect the confusion matrix to understand the trade-off between false positives and false negatives.


In [11]:
from sklearn.metrics import confusion_matrix

# 7.1 Choose best model (XGBoost) and get predictions
best_model_name = "XGBoost"
best_pipe = fitted_models[best_model_name]
y_proba_best = probas_test[best_model_name]
y_pred_best = (y_proba_best >= 0.5).astype(int)  # default 0.5 threshold

print(f"Using model: {best_model_name}\n")

# 7.2 Classification report
print("Classification report (threshold = 0.5):\n")
print(classification_report(y_test, y_pred_best, digits=3))

# 7.3 Confusion matrix
cm = confusion_matrix(y_test, y_pred_best)
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0 (Not converted)", "Actual 1 (Converted)"],
    columns=["Pred 0 (Not converted)", "Pred 1 (Converted)"]
)
cm_df


Using model: XGBoost

Classification report (threshold = 0.5):

              precision    recall  f1-score   support

           0      0.980     0.803     0.883      5253
           1      0.258     0.811     0.391       444

    accuracy                          0.803      5697
   macro avg      0.619     0.807     0.637      5697
weighted avg      0.924     0.803     0.844      5697



Unnamed: 0,Pred 0 (Not converted),Pred 1 (Converted)
Actual 0 (Not converted),4216,1037
Actual 1 (Converted),84,360


## 8. Lift and decile analysis

To show business value, we want to answer:

> “If sales only contact the top 10% of leads ranked by the model, what share of total conversions do they cover?”

Steps:
- Build a test-set dataframe with the true label and the model score.
- Rank leads by score (highest to lowest).
- Split them into 10 equal-sized buckets (deciles).
- For each decile, compute number of leads, number of conversions, conversion rate, cumulative conversions, and lift vs. the overall average.


In [12]:
# 8.1 Build evaluation DataFrame on test set
eval_df = pd.DataFrame({
    "y_true": y_test.values,
    "score": y_proba_best
}, index=y_test.index).copy()

# Rank by score (highest first) and create deciles 1..10 (1 = highest score)
eval_df = eval_df.sort_values("score", ascending=False)
eval_df["rank"] = np.arange(1, len(eval_df) + 1)

eval_df["decile"] = pd.qcut(
    eval_df["rank"],
    10,
    labels=range(1, 11)  # 1 = top 10%, 10 = bottom 10%
)

# 8.2 Aggregate by decile
overall_conv_rate = eval_df["y_true"].mean()

decile_summary = (
    eval_df
    .groupby("decile")
    .agg(
        leads=("y_true", "size"),
        conversions=("y_true", "sum"),
        conv_rate=("y_true", "mean"),
        min_score=("score", "min"),
        max_score=("score", "max")
    )
    .sort_index()  # decile 1 -> 10
)

# cumulative metrics (from best decile downwards)
decile_summary["cum_conversions"] = decile_summary["conversions"].cumsum()
total_conversions = decile_summary["conversions"].sum()
decile_summary["cum_perc_conversions"] = (
    decile_summary["cum_conversions"] / total_conversions * 100
)

# lift vs global conversion rate
decile_summary["lift"] = decile_summary["conv_rate"] / overall_conv_rate

decile_summary


  .groupby("decile")


Unnamed: 0_level_0,leads,conversions,conv_rate,min_score,max_score,cum_conversions,cum_perc_conversions,lift
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,570,195,0.342105,0.774006,0.985849,195,43.918919,4.38958
2,570,131,0.229825,0.59088,0.771019,326,73.423423,2.948898
3,569,62,0.108963,0.410388,0.59088,388,87.387387,1.398114
4,570,24,0.042105,0.319281,0.410388,412,92.792793,0.540256
5,570,10,0.017544,0.257224,0.319222,422,95.045045,0.225107
6,569,12,0.02109,0.181518,0.257224,434,97.747748,0.270603
7,570,8,0.014035,0.12249,0.181518,442,99.54955,0.180085
8,569,2,0.003515,0.078623,0.122378,444,100.0,0.0451
9,570,0,0.0,0.038166,0.078623,444,100.0,0.0
10,570,0,0.0,0.002344,0.037935,444,100.0,0.0


In [13]:
top10_share = decile_summary.loc[1, "cum_perc_conversions"]
print(f"Top 10% of leads (decile 1) contain ~{top10_share:.1f}% of all conversions in the test set.")


Top 10% of leads (decile 1) contain ~43.9% of all conversions in the test set.


## 9. Lead priority buckets and example leads

To make this actionable for sales, we convert model scores into simple priority buckets:

- **High priority**: deciles 1–2 (top 20% of scores)
- **Medium priority**: deciles 3–6
- **Low priority**: deciles 7–10

In production, the CRM / dashboard would show a simple *priority tag* next to each lead, powered by the model score.


In [14]:
# 9.1 Map deciles to priority buckets
def map_priority(decile):
    d = int(decile)
    if d <= 2:
        return "High"
    elif d <= 6:
        return "Medium"
    else:
        return "Low"

eval_df["priority"] = eval_df["decile"].apply(map_priority)

# 9.2 Join back some lead metadata from leads_model (using the test indices)
test_meta = leads_model.loc[eval_df.index, [
    "lead_id", "project_name", "user_id", "added_date"
]].copy()

priority_view = pd.concat([test_meta, eval_df[["score", "decile", "priority", "y_true"]]], axis=1)
priority_view = priority_view.sort_values("score", ascending=False)

# show a few examples
priority_view.head(15)


Unnamed: 0,lead_id,project_name,user_id,added_date,score,decile,priority,y_true
14088,99975,Veranda Sahl Hasheesh,993,2024-07-30 14:07:07,0.985849,1,High,1
14085,99972,Veranda Sahl Hasheesh,993,2024-07-30 13:58:12,0.985599,1,High,1
14962,101265,Veranda Sahl Hasheesh,993,2024-08-03 18:53:53,0.98533,1,High,1
14860,101053,Veranda Sahl Hasheesh,993,2024-08-03 01:14:23,0.98533,1,High,1
14631,100676,Veranda Sahl Hasheesh,993,2024-08-01 22:28:18,0.98533,1,High,1
15598,103008,Veranda Sahl Hasheesh,993,2024-08-10 18:34:40,0.985098,1,High,1
15398,102291,Veranda Sahl Hasheesh,993,2024-08-07 17:16:36,0.985071,1,High,1
13465,99113,Veranda Sahl Hasheesh,993,2024-07-27 21:06:29,0.984975,1,High,1
14833,101004,Veranda Sahl Hasheesh,993,2024-08-02 22:38:52,0.984904,1,High,1
13258,98746,Veranda Sahl Hasheesh,993,2024-07-27 03:32:22,0.984807,1,High,1


In [15]:
priority_summary = (
    priority_view
    .groupby("priority")
    .agg(
        leads=("y_true", "size"),
        conversions=("y_true", "sum"),
        conv_rate=("y_true", "mean")
    )
)
priority_summary["conv_rate"] = (priority_summary["conv_rate"] * 100).round(2)
priority_summary


Unnamed: 0_level_0,leads,conversions,conv_rate
priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,1140,326,28.6
Low,2279,10,0.44
Medium,2278,108,4.74


## 10. Assumptions and clarifying questions

### Conversion definition

- A lead is considered **converted** if its final recorded status is in:
  `["DONE_DEAL", "ALREADY_BOUGHT", "RESALE_REQUEST", "MEETING_DONE", "HIGH_INTEREST", "QUALIFIED"]`.
- We assume these statuses are only assigned **after** a meaningful sales outcome (deal closed, strong intent, or qualified lead).
- We treat the latest status per lead as the ground truth and ignore intermediate status changes.

### Data and modeling assumptions

- The training data is representative of future behaviour (no major changes in marketing strategy or sales process).
- Features like `historical spend/clicks/impressions` are already aggregated at campaign/project level and are available at lead creation time.
- Date-based features (month, day of week) capture seasonality effects but we did not model long-term trends explicitly.
- We removed `lead_status` from the feature set to avoid **target leakage** (the label is derived from this field).

### Business questions to clarify in a real project

1. **Exact conversion definition**  
   - Should we only treat `DONE_DEAL` as a win, or also include `HIGH_INTEREST` / `QUALIFIED`?
   - Are there any “fake” deals (test transactions, internal leads) that should be excluded?

2. **Time window for conversion**  
   - After how many days do we consider a lead “dead”?  
   - Do we need a time-to-conversion forecast (e.g. convert in the next 14 days) instead of “ever converted”?

3. **Sales capacity and SLA**  
   - How many calls/emails can the sales team realistically handle per day?  
   - Should we optimise for **precision in the top bucket** (High priority) or for recall overall?

4. **Integration into workflow**  
   - Where will the score / priority show up? CRM? WhatsApp integration?  
   - Do we need an API or a batch scoring job (e.g. daily scoring of new leads)?

5. **Monitoring and retraining**  
   - How often should we retrain the model (monthly / quarterly)?  
   - What KPIs will we track to detect model drift (overall conversion rate, lift in top deciles, etc.)?


 ## 11. How the model would work in production (business view)

**Inputs per lead (available at or near creation time):**

- Lead-level features:
  - Name / email / phone length (proxy for data completeness / seriousness)
  - Date features: month, day of week the lead was created
- Campaign / project context:
  - `project_name`
  - `user_id` (customer account / advertiser)
- Historical performance signals for that campaign or project:
  - `daily_budget`
  - Historical spend, clicks, impressions
  - Derived ratios: CTR, CPC, CPM

**Model output:**

- A **conversion probability** between 0 and 1 for each new lead.
- A simple **priority tag** based on that score:
  - **High priority** → top 20% of scores
  - **Medium priority** → next 40%
  - **Low priority** → bottom 40%

**How sales would use it:**

- Every morning, new leads are scored.
- Sales reps start with **High priority** leads first (best chance to convert).
- Managers can monitor:
  - Conversion rate per priority bucket
  - Volume and quality of leads per project / campaign
- Over time, we can tune thresholds (e.g., only top 10% = High) based on sales capacity and business preferences.


In [17]:
from datetime import date

if "added_date" in leads_model.columns:
    min_date = leads_model["added_date"].min().date()
    max_date = leads_model["added_date"].max().date()
elif "date" in leads_model.columns:
    min_date = pd.to_datetime(leads_model["date"]).min().date()
    max_date = pd.to_datetime(leads_model["date"]).max().date()
else:
    min_date = date(2024, 1, 1)
    max_date = date(2024, 12, 31)

print("min_date:", min_date)
print("max_date:", max_date)


min_date: 2024-06-01
max_date: 2025-10-09


# Joblib 

In [20]:
from joblib import dump
from datetime import date

# =========================================
# 1) Compute min_date / max_date from leads_model
# =========================================
if "added_date" in leads_model.columns:
    min_date = leads_model["added_date"].min().date()
    max_date = leads_model["added_date"].max().date()
elif "date" in leads_model.columns:
    min_date = pd.to_datetime(leads_model["date"]).min().date()
    max_date = pd.to_datetime(leads_model["date"]).max().date()
else:
    # Fallback 
    min_date = date(2024, 1, 1)
    max_date = date(2024, 12, 31)

print("min_date:", min_date)
print("max_date:", max_date)

# =========================================
# 2) Force champion model = XGBoost
# =========================================
# 
if "XGBoost" not in fitted_models:
    raise ValueError("XGBoost model is not found in fitted_models dict.")

best_model_name = "XGBoost"
champion = fitted_models[best_model_name]

# 
xgb_row = results_df[results_df["model"] == best_model_name].iloc[0]

print("Champion model (forced):", best_model_name)
print(xgb_row[["accuracy", "roc_auc", "pr_auc"]])

# =========================================
# 3) Info for Streamlit artifact
# =========================================
neg_count = (leads_model["is_converted"] == 0).sum()
pos_count = (leads_model["is_converted"] == 1).sum()

artifact = {
    # pipeline (preprocessor + XGBoost)
    "model": champion,
    
   
    "best_model_name": best_model_name,
    
    "numeric_features": numeric_features,
    "categorical_features": categorical_features,
    "feature_cols": feature_cols,
    
    "conversion_statuses": conversion_statuses,
    
    "min_date": min_date,
    "max_date": max_date,
    
    # class balance
    "class_balance": {
        "neg": int(neg_count),
        "pos": int(pos_count),
    },
    
    "metrics": results_df,
    
    "project_options": sorted(
        leads_model["project_name"].dropna().unique().tolist()
    ),
    "user_options": sorted(
        leads_model["user_id_str"].dropna().unique().tolist()
    ),
    
    "median_numeric": leads_model[numeric_features].median().to_dict(),
}

# =========================================
# 4) Save artifact
# =========================================
dump(artifact, "leadsmart_champion.joblib")

print("Saved artifact to leadsmart_champion.joblib")


min_date: 2024-06-01
max_date: 2025-10-09
Champion model (forced): XGBoost
accuracy     0.80323
roc_auc     0.873614
pr_auc      0.368966
Name: 3, dtype: object
Saved artifact to leadsmart_champion.joblib ✅


In [21]:
import os

print(os.getcwd())
!dir *.joblib


C:\Users\LOQ\Downloads\task hamada\test_data
 Volume in drive C has no label.
 Volume Serial Number is 444E-EEC7

 Directory of C:\Users\LOQ\Downloads\task hamada\test_data

12/08/2025  06:22 AM           423,525 leadsmart_champion.joblib
               1 File(s)        423,525 bytes
               0 Dir(s)  41,760,702,464 bytes free
