In [1]:
import pandas as pd

In [2]:
# Load the raw deals export; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/skygeni_sales_data.csv")

In [14]:
# List the column labels of the loaded frame.
df.columns

Index(['deal_id', 'created_date', 'closed_date', 'sales_rep_id', 'industry',
       'region', 'product_type', 'lead_source', 'deal_stage', 'deal_amount',
       'sales_cycle_days', 'outcome'],
      dtype='str')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5000, 12)

In [5]:
# Preview the first rows to eyeball the raw values in each column.
df.head()

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome
0,D00001,2023-11-24,2023-12-15,rep_22,SaaS,North America,Enterprise,Referral,Qualified,4253,21,Won
1,D00002,2023-01-17,2023-01-27,rep_7,SaaS,India,Core,Referral,Closed,3905,10,Won
2,D00003,2023-10-29,2023-12-10,rep_5,HealthTech,APAC,Core,Inbound,Proposal,10615,42,Lost
3,D00004,2023-07-14,2023-08-02,rep_18,FinTech,India,Core,Partner,Negotiation,4817,19,Won
4,D00005,2024-02-29,2024-05-26,rep_2,HealthTech,APAC,Core,Outbound,Qualified,45203,87,Lost


In [6]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deal_id           5000 non-null   str  
 1   created_date      5000 non-null   str  
 2   closed_date       5000 non-null   str  
 3   sales_rep_id      5000 non-null   str  
 4   industry          5000 non-null   str  
 5   region            5000 non-null   str  
 6   product_type      5000 non-null   str  
 7   lead_source       5000 non-null   str  
 8   deal_stage        5000 non-null   str  
 9   deal_amount       5000 non-null   int64
 10  sales_cycle_days  5000 non-null   int64
 11  outcome           5000 non-null   str  
dtypes: int64(2), str(10)
memory usage: 468.9 KB


In [7]:
# Summary statistics for every column: include="all" reports count/unique/top/freq
# for the text columns alongside mean/std/quantiles for the numeric ones.
df.describe(include="all")

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome
count,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000.0,5000.0,5000
unique,5000,451,547,25,5,4,3,4,5,,,2
top,D00001,2023-03-08,2023-09-20,rep_20,Ecommerce,India,Core,Inbound,Demo,,,Lost
freq,1,22,22,230,1060,1286,1694,1262,1043,,,2737
mean,,,,,,,,,,26286.4928,63.7518,
std,,,,,,,,,,27689.230136,32.731405,
min,,,,,,,,,,2002.0,7.0,
25%,,,,,,,,,,6611.0,35.75,
50%,,,,,,,,,,14171.5,64.0,
75%,,,,,,,,,,39062.25,92.0,


In [16]:
# 1. Convert dates to datetime
# Parse the deal-creation date strings into pandas datetimes.
df["created_date"] = df["created_date"].pipe(pd.to_datetime)

In [18]:
# Parse the deal-close date strings into pandas datetimes.
df["closed_date"] = df["closed_date"].pipe(pd.to_datetime)

In [19]:
# 2. Make outcome an ordered categorical (Lost < Won).
# A bare astype("category") yields an *unordered* categorical whose category
# order is merely alphabetical, so the intended order is declared explicitly.
outcome_dtype = pd.CategoricalDtype(categories=["Lost", "Won"], ordered=True)
df["outcome"] = df["outcome"].astype(outcome_dtype)

# Labels outside the declared categories are coerced to NaN on conversion;
# fail loudly rather than let them silently drop out of the win-rate math.
assert df["outcome"].notna().all(), "unexpected outcome label (expected only 'Lost' or 'Won')"

In [20]:
# 3. Sanity-check ranges for numeric columns
# Restrict the summary to the two numeric fields so min/max are easy to scan.
df.loc[:, ["deal_amount", "sales_cycle_days"]].describe()

Unnamed: 0,deal_amount,sales_cycle_days
count,5000.0,5000.0
mean,26286.4928,63.7518
std,27689.230136,32.731405
min,2002.0,7.0
25%,6611.0,35.75
50%,14171.5,64.0
75%,39062.25,92.0
max,100000.0,120.0


**Overall win rate**


In [21]:
# Share of deals whose outcome is "Won" (mean of a boolean mask).
win_rate = df["outcome"].eq("Won").mean()

**Count of Won/Lost**

In [24]:
# Tally deals per outcome label (most frequent first), then show the
# overall win rate next to the tallies.
outcome_counts = df["outcome"].value_counts(sort=True, ascending=False)
(win_rate, outcome_counts)

(np.float64(0.4526),
 outcome
 Lost    2737
 Won     2263
 Name: count, dtype: int64)

### Overall win rate

Out of 5,000 opportunities, the team won 2,263 and lost 2,737, resulting in an overall win rate of approximately 45%.


In [26]:
# Win rate by industry
# Build the boolean "Won" mask once, average it within each industry,
# and rank industries from highest to lowest win rate.
win_rate_by_industry = (
    df["outcome"]
    .eq("Won")
    .groupby(df["industry"])
    .mean()
    .sort_values(ascending=False)
)

In [27]:
# Win rate by region
# Build the boolean "Won" mask once, average it within each region,
# and rank regions from highest to lowest win rate.
win_rate_by_region = (
    df["outcome"]
    .eq("Won")
    .groupby(df["region"])
    .mean()
    .sort_values(ascending=False)
)

In [28]:
# Show both ranked win-rate tables together.
win_rate_by_industry, win_rate_by_region

(industry
 FinTech       0.477054
 SaaS          0.451548
 Ecommerce     0.449057
 HealthTech    0.445545
 EdTech        0.441532
 Name: outcome, dtype: float64,
 region
 India            0.457232
 Europe           0.455799
 APAC             0.449275
 North America    0.447942
 Name: outcome, dtype: float64)

Looking at performance by industry, win rates range from about 44% to 48%, with FinTech leading and EdTech at the lower end. By region, win rates are tightly clustered between roughly 45% and 46%, with India and Europe slightly ahead of APAC and North America.

### Win rate by industry

| Industry   | Win rate |
|-----------|----------|
| FinTech   | 47.7%    |
| SaaS      | 45.2%    |
| Ecommerce | 44.9%    |
| HealthTech| 44.6%    |
| EdTech    | 44.2%    |

### Win rate by region

| Region        | Win rate |
|--------------|----------|
| India        | 45.7%    |
| Europe       | 45.6%    |
| APAC         | 44.9%    |
| North America| 44.8%    |



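FinTech has the highest win rate at about 48%, while EdTech is lower at around 44%. This may indicate stronger positioning in financial services than in education, but the gap is only about 3.5 percentage points and has not been tested for statistical significance.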
Across regions, win rates are fairly similar (about 45–46%), with India and Europe slightly ahead, suggesting no single region is dramatically underperforming.