In [3]:
import pandas as pd
import numpy as np  # make sure this is imported

# Load the raw deals and derive the target plus basic numeric features in one
# chain, so re-running this cell always starts again from the CSV.
df = pd.read_csv("../data/skygeni_sales_data.csv").assign(
    # Target: 1 when the outcome is exactly "Won", otherwise 0
    won_flag=lambda raw: (raw["outcome"] == "Won").astype(int),
    # log(1 + deal_amount) via the dedicated NumPy function
    log_dealamount=lambda raw: np.log1p(raw["deal_amount"]),
    # Quartile index of the sales cycle length (labels=False -> integers 0–3)
    salescycle_bin=lambda raw: pd.qcut(raw["sales_cycle_days"], q=4, labels=False),
)

# Categorical features we'll use
cat_cols = ["industry", "region", "lead_source", "deal_stage"]

# Quick check of column names and the two raw numeric inputs
print(df.columns)
print(df[["deal_amount", "sales_cycle_days"]].head())


Index(['deal_id', 'created_date', 'closed_date', 'sales_rep_id', 'industry',
       'region', 'product_type', 'lead_source', 'deal_stage', 'deal_amount',
       'sales_cycle_days', 'outcome', 'won_flag', 'log_dealamount',
       'salescycle_bin'],
      dtype='str')
   deal_amount  sales_cycle_days
0         4253                21
1         3905                10
2        10615                42
3         4817                19
4        45203                87


In [5]:
# Numeric feature columns
feature_cols_num = [
    "log_dealamount",
    "sales_cycle_days",
]

# Categorical feature columns
feature_cols_cat = [
    "industry",
    "region",
    "lead_source",
    "deal_stage",
]

In [7]:
def win_rate_by(frame, column):
    """Return the mean of ``won_flag`` per value of ``column``, highest first.

    Parameters
    ----------
    frame : pandas.DataFrame
        Deals table containing ``won_flag`` and ``column``.
    column : str
        Name of the column to group by.

    Returns
    -------
    pandas.Series
        Win rate indexed by the values of ``column``, sorted descending.
    """
    return frame.groupby(column)["won_flag"].mean().sort_values(ascending=False)


# Win rate by industry
wr_industry = win_rate_by(df, "industry")
# Win rate by region
wr_region = win_rate_by(df, "region")
# Win rate by lead source
wr_lead = win_rate_by(df, "lead_source")
# Win rate by deal stage at close
wr_stage = win_rate_by(df, "deal_stage")

# The first heading is printed as-is; later headings are preceded by a blank line.
for position, (heading, rates) in enumerate([
    ("Win rate by industry:", wr_industry),
    ("Win rate by region:", wr_region),
    ("Win rate by lead source:", wr_lead),
    ("Win rate by deal stage:", wr_stage),
]):
    print(heading if position == 0 else "\n" + heading)
    print(rates)


Win rate by industry:
industry
FinTech       0.477054
SaaS          0.451548
Ecommerce     0.449057
HealthTech    0.445545
EdTech        0.441532
Name: won_flag, dtype: float64

Win rate by region:
region
India            0.457232
Europe           0.455799
APAC             0.449275
North America    0.447942
Name: won_flag, dtype: float64

Win rate by lead source:
lead_source
Inbound     0.460380
Referral    0.455272
Outbound    0.455056
Partner     0.439516
Name: won_flag, dtype: float64

Win rate by deal stage:
deal_stage
Closed         0.467402
Negotiation    0.466332
Demo           0.458293
Proposal       0.446977
Qualified      0.422594
Name: won_flag, dtype: float64


In [10]:
# One row per (industry, region, lead_source) segment with its deal count,
# win rate, mean deal amount, and driver_score = win rate × mean deal amount.
segment_summary = (
    df.groupby(["industry", "region", "lead_source"])
    .agg(
        deals=pd.NamedAgg(column="won_flag", aggfunc="size"),
        win_rate=pd.NamedAgg(column="won_flag", aggfunc="mean"),
        avg_amount=pd.NamedAgg(column="deal_amount", aggfunc="mean"),
    )
    .reset_index()
    .assign(driver_score=lambda seg: seg["win_rate"] * seg["avg_amount"])
)

# Display the 20 segments with the highest driver_score
segment_summary.sort_values(by="driver_score", ascending=False).iloc[:20]


Unnamed: 0,industry,region,lead_source,deals,win_rate,avg_amount,driver_score
44,FinTech,North America,Inbound,59,0.610169,30933.474576,18874.662453
21,EdTech,Europe,Outbound,65,0.569231,31552.738462,17960.789586
37,FinTech,Europe,Outbound,44,0.568182,29474.954545,16747.133264
75,SaaS,India,Referral,51,0.509804,31112.745098,15861.399462
65,SaaS,APAC,Outbound,56,0.464286,34042.625,15805.504464
34,FinTech,APAC,Partner,63,0.52381,29630.746032,15520.866969
0,Ecommerce,APAC,Inbound,65,0.492308,31295.6,15407.064615
31,EdTech,North America,Referral,69,0.536232,28040.15942,15036.027515
27,EdTech,India,Referral,56,0.535714,28031.410714,15016.827168
23,EdTech,Europe,Referral,67,0.447761,33512.955224,15005.800847


In [12]:
# -----------------------------
# 1. Segment-level win-rate "model" (rule-based)
# -----------------------------
# Driver dimensions that together define a segment
segment_cols = ["industry", "region", "lead_source", "deal_stage"]

# Overall win rate across all deals, used as the reference point
baseline_wr = df["won_flag"].mean()

# Historical deal count and win rate for each segment
seg_rules = (
    df.groupby(segment_cols)["won_flag"]
    .agg(deals="size", win_rate="mean")
    .reset_index()
)

print(f"Overall baseline win rate: {baseline_wr:.3f}")
seg_rules.head()


Overall baseline win rate: 0.453


Unnamed: 0,industry,region,lead_source,deal_stage,deals,win_rate
0,Ecommerce,APAC,Inbound,Closed,13,0.461538
1,Ecommerce,APAC,Inbound,Demo,13,0.538462
2,Ecommerce,APAC,Inbound,Negotiation,9,0.666667
3,Ecommerce,APAC,Inbound,Proposal,16,0.6875
4,Ecommerce,APAC,Inbound,Qualified,14,0.142857


In [13]:
# -----------------------------
# 2. Apply rules to score each deal
# -----------------------------
# Left-join the per-segment stats onto every deal, then derive the deal's
# segment win probability; deals with no matching segment row fall back to
# the overall baseline win rate.
df_scored = pd.merge(
    df,
    seg_rules,
    how="left",
    on=segment_cols,
    suffixes=("", "_seg"),
).assign(
    seg_win_prob=lambda scored: scored["win_rate"].fillna(baseline_wr)
)

# Simple risk bands based on segment win probability
def risk_band(p, baseline=None, margin=0.05):
    """Classify a win probability into a risk band relative to a baseline.

    Parameters
    ----------
    p : float
        Segment win probability for one deal.
    baseline : float, optional
        Reference win rate. When omitted, the notebook-level ``baseline_wr``
        is looked up at call time, matching the one-argument usage.
    margin : float, default 0.05
        Distance from ``baseline`` at which a deal leaves the medium band.

    Returns
    -------
    str
        ``"low_risk_high_fit"`` when ``p >= baseline + margin``,
        ``"high_risk_low_fit"`` when ``p <= baseline - margin``,
        otherwise ``"medium_risk"``. A NaN ``p`` fails both comparisons and
        is therefore reported as ``"medium_risk"``.
    """
    if baseline is None:
        baseline = baseline_wr
    if p >= baseline + margin:
        return "low_risk_high_fit"
    if p <= baseline - margin:
        return "high_risk_low_fit"
    return "medium_risk"

# Label every deal with the band of its segment win probability
df_scored["risk_band"] = df_scored["seg_win_prob"].map(risk_band)

# Preview the segment keys next to the resulting score and band
df_scored.loc[
    :,
    [
        "deal_id", "industry", "region", "lead_source", "deal_stage",
        "seg_win_prob", "risk_band",
    ],
].head()


Unnamed: 0,deal_id,industry,region,lead_source,deal_stage,seg_win_prob,risk_band
0,D00001,SaaS,North America,Referral,Qualified,0.5,medium_risk
1,D00002,SaaS,India,Referral,Closed,0.888889,low_risk_high_fit
2,D00003,HealthTech,APAC,Inbound,Proposal,0.5,medium_risk
3,D00004,FinTech,India,Partner,Negotiation,0.416667,medium_risk
4,D00005,HealthTech,APAC,Outbound,Qualified,0.416667,medium_risk


In [14]:
# -----------------------------
# 3. Segment-level driver table (for write-up)
# -----------------------------
segment_cols_driver = ["industry", "region", "lead_source"]

# Per-segment deal count, win rate and mean deal amount, extended with
#   lift_vs_baseline = win rate minus the overall baseline win rate
#   driver_score     = win rate × mean deal amount
driver = (
    df.groupby(segment_cols_driver)
    .agg(
        deals=("won_flag", "size"),
        win_rate=("won_flag", "mean"),
        avg_amount=("deal_amount", "mean"),
    )
    .reset_index()
    .assign(
        lift_vs_baseline=lambda seg: seg["win_rate"] - baseline_wr,
        driver_score=lambda seg: seg["win_rate"] * seg["avg_amount"],
    )
)

# Highest driver_score first
driver_sorted = driver.sort_values(by="driver_score", ascending=False)

print("Top 20 segments by driver_score (win rate × avg deal size):")
driver_sorted.head(20)


Top 20 segments by driver_score (win rate × avg deal size):


Unnamed: 0,industry,region,lead_source,deals,win_rate,avg_amount,lift_vs_baseline,driver_score
44,FinTech,North America,Inbound,59,0.610169,30933.474576,0.157569,18874.662453
21,EdTech,Europe,Outbound,65,0.569231,31552.738462,0.116631,17960.789586
37,FinTech,Europe,Outbound,44,0.568182,29474.954545,0.115582,16747.133264
75,SaaS,India,Referral,51,0.509804,31112.745098,0.057204,15861.399462
65,SaaS,APAC,Outbound,56,0.464286,34042.625,0.011686,15805.504464
34,FinTech,APAC,Partner,63,0.52381,29630.746032,0.07121,15520.866969
0,Ecommerce,APAC,Inbound,65,0.492308,31295.6,0.039708,15407.064615
31,EdTech,North America,Referral,69,0.536232,28040.15942,0.083632,15036.027515
27,EdTech,India,Referral,56,0.535714,28031.410714,0.083114,15016.827168
23,EdTech,Europe,Referral,67,0.447761,33512.955224,-0.004839,15005.800847


In [15]:
# -----------------------------
# 4. Human-readable insight cards
# -----------------------------
def _format_card(tag, row):
    """Render one driver-table row as a single-line insight card.

    Parameters
    ----------
    tag : str
        Label placed in square brackets at the start of the card.
    row : mapping
        Row exposing ``industry``, ``region``, ``lead_source``, ``win_rate``,
        ``deals``, ``avg_amount`` and ``lift_vs_baseline``.

    Returns
    -------
    str
        The formatted card text. The baseline shown is the notebook-level
        ``baseline_wr``.
    """
    return (
        f"[{tag}] {row['industry']} – {row['region']} via {row['lead_source']}: "
        f"win rate {row['win_rate']:.1%} on {int(row['deals'])} deals, "
        f"avg deal {row['avg_amount']:.0f}. "
        f"Lift vs baseline {baseline_wr:.1%} = {row['lift_vs_baseline']:.1%}."
    )


# Five highest and five lowest segments by driver_score
top_segments = driver_sorted.head(5).copy()
bottom_segments = driver_sorted.sort_values("driver_score").head(5).copy()

# UPLIFT cards first, then RISK cards, each in the order of its table
cards = [
    _format_card(tag, row)
    for tag, segments in (("UPLIFT", top_segments), ("RISK", bottom_segments))
    for _, row in segments.iterrows()
]

for c in cards:
    print(c)


[UPLIFT] FinTech – North America via Inbound: win rate 61.0% on 59 deals, avg deal 30933. Lift vs baseline 45.3% = 15.8%.
[UPLIFT] EdTech – Europe via Outbound: win rate 56.9% on 65 deals, avg deal 31553. Lift vs baseline 45.3% = 11.7%.
[UPLIFT] FinTech – Europe via Outbound: win rate 56.8% on 44 deals, avg deal 29475. Lift vs baseline 45.3% = 11.6%.
[UPLIFT] SaaS – India via Referral: win rate 51.0% on 51 deals, avg deal 31113. Lift vs baseline 45.3% = 5.7%.
[UPLIFT] SaaS – APAC via Outbound: win rate 46.4% on 56 deals, avg deal 34043. Lift vs baseline 45.3% = 1.2%.
[RISK] SaaS – Europe via Referral: win rate 37.7% on 61 deals, avg deal 16474. Lift vs baseline 45.3% = -7.6%.
[RISK] EdTech – North America via Partner: win rate 32.3% on 62 deals, avg deal 21569. Lift vs baseline 45.3% = -13.0%.
[RISK] FinTech – Europe via Partner: win rate 33.3% on 54 deals, avg deal 21036. Lift vs baseline 45.3% = -11.9%.
[RISK] EdTech – Europe via Inbound: win rate 27.4% on 73 deals, avg deal 28688. L

In [16]:
# -----------------------------
# Insight cards from driver table
# -----------------------------
# NOTE(review): this cell rebuilds the same top/bottom insight cards as the
# "4. Human-readable insight cards" cell; consider keeping only one of them.

# Five highest and five lowest segments by driver_score
top_segments = driver_sorted.head(5).copy()
bottom_segments = driver_sorted.sort_values("driver_score").head(5).copy()

cards = []
for label, segments in [("[UPLIFT]", top_segments), ("[RISK]", bottom_segments)]:
    for seg in segments.to_dict("records"):
        headline = (
            f"{label} {seg['industry']} – {seg['region']} via {seg['lead_source']}: "
        )
        stats = (
            f"win rate {seg['win_rate']:.1%} on {int(seg['deals'])} deals, "
            f"avg deal {seg['avg_amount']:.0f}. "
        )
        lift = f"Lift vs baseline {baseline_wr:.1%} = {seg['lift_vs_baseline']:.1%}."
        cards.append(headline + stats + lift)

for card in cards:
    print(card)


[UPLIFT] FinTech – North America via Inbound: win rate 61.0% on 59 deals, avg deal 30933. Lift vs baseline 45.3% = 15.8%.
[UPLIFT] EdTech – Europe via Outbound: win rate 56.9% on 65 deals, avg deal 31553. Lift vs baseline 45.3% = 11.7%.
[UPLIFT] FinTech – Europe via Outbound: win rate 56.8% on 44 deals, avg deal 29475. Lift vs baseline 45.3% = 11.6%.
[UPLIFT] SaaS – India via Referral: win rate 51.0% on 51 deals, avg deal 31113. Lift vs baseline 45.3% = 5.7%.
[UPLIFT] SaaS – APAC via Outbound: win rate 46.4% on 56 deals, avg deal 34043. Lift vs baseline 45.3% = 1.2%.
[RISK] SaaS – Europe via Referral: win rate 37.7% on 61 deals, avg deal 16474. Lift vs baseline 45.3% = -7.6%.
[RISK] EdTech – North America via Partner: win rate 32.3% on 62 deals, avg deal 21569. Lift vs baseline 45.3% = -13.0%.
[RISK] FinTech – Europe via Partner: win rate 33.3% on 54 deals, avg deal 21036. Lift vs baseline 45.3% = -11.9%.
[RISK] EdTech – Europe via Inbound: win rate 27.4% on 73 deals, avg deal 28688. L