In [114]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [115]:
# Load the cleaned dataset (path is relative to the notebook's working directory).
DATA_PATH = 'final_cleaned_data.csv'
df = pd.read_csv(DATA_PATH)

In [116]:
# Preview the first five rows (head() defaults to n=5).
# NOTE(review): the preview shows 'First Name' and 'Date of Birth'; consider
# hiding these columns or clearing the output before sharing the notebook.
df.head()

Unnamed: 0,Learner SignUp DateTime,Opportunity Name,Opportunity Category,Opportunity End Date,First Name,Date of Birth,Gender,Country,Institution Name,Current/Intended Major,Entry created at,Status Description,Apply Date,Opportunity Start Date,Age_at_Application_Int,Institution_Participation_Count,continent,days_to_apply,invalid_days_to_apply
0,2023-06-14,career essentials: getting started with your p...,course,2024-06-29,faria,2001-12-01,female,pakistan,nwihs,radiology,2024-11-03,started,2023-06-14,2022-03-11,21,1,Asia,0.0,0
1,2023-01-05,career essentials: getting started with your p...,course,2024-06-29,poojitha,2000-08-16,female,india,saint louis university,information systems,2024-11-03,started,2023-01-05,2022-03-11,22,4423,Asia,0.0,0
2,2023-09-04,career essentials: getting started with your p...,course,2024-06-29,emmanuel,2002-01-27,male,united states,illinois institute of technology,computer science,2024-11-03,started,2023-11-05,2022-03-11,21,160,North America,62.0,0
3,2023-08-29,career essentials: getting started with your p...,course,2024-06-29,amrutha varshini,1999-01-11,female,united states,saint louis university,information systems,2024-11-03,team allocated,2023-09-10,2022-03-11,24,4423,North America,12.0,0
4,2023-06-01,career essentials: getting started with your p...,course,2024-06-29,vinay varshith,2000-04-19,male,united states,saint louis university,computer science,2024-11-03,started,2023-06-01,2022-03-11,23,4423,North America,0.0,0


In [117]:
# List the distinct status values, in order of first appearance.
pd.unique(df['Status Description'])

array(['started', 'team allocated', 'waitlisted', 'withdraw',
       'rewards award', 'dropped out', 'rejected', 'applied'],
      dtype=object)

In [118]:
def decide_churn(status):
    """Map a status description to a binary churn label.

    Returns 1 for statuses treated as churn, 0 for statuses treated as
    not churn, and None for every other value (outcome unknown).
    """
    churned = ('dropped out', 'withdraw', 'rejected')
    retained = ('team allocated', 'started', 'rewards award')

    if status in churned:
        return 1  # churn
    if status in retained:
        return 0  # not churn
    return None  # unknown outcome (waitlisted, applied)

# Label every row; statuses without a known outcome come back as None (NaN).
df['churn'] = df['Status Description'].apply(decide_churn)

# Remove rows with unknown outcome. Take an explicit copy so that the in-place
# drops and column assignments in later cells operate on an independent frame
# instead of a slice of the original (avoids SettingWithCopyWarning).
df = df[df['churn'].notna()].copy()


1️⃣ Non-churn (0) → student is successful / active

- team allocated → student is accepted and working

- started → student is active

- rewards award → student completed successfully

2️⃣ Churn (1) → student dropped / failed to progress

- dropped out → left after starting

- withdraw → voluntarily left

- rejected → failed to enter

3️⃣ Unknown outcome → remove from training

- waitlisted → outcome not decided

- applied → too early to judge

In [119]:
# Class balance of the target after the unknown-outcome rows were removed.
df['churn'].value_counts()


Unnamed: 0_level_0,count
churn,Unnamed: 1_level_1
1.0,4262
0.0,4065


In [120]:
# Drop the column the churn label was derived from, together with the other
# fields that are not kept as model inputs.
dropped_cols = [
    'Status Description',
    'days_to_apply',
    'Date of Birth',
    'First Name',
    'invalid_days_to_apply',
]
df = df.drop(columns=dropped_cols)

In [121]:
# The opportunity end date is not used as a feature.
df = df.drop(columns='Opportunity End Date')

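✅ Recommended features

Demographic

- Age_at_Application_Int

- Gender

- Country

- continent

Academic / engagement

- Opportunity Category

- Institution_Participation_Count

- days_to_apply

- invalid_days_to_apply

Timing behavior

- Learner SignUp DateTime

- Apply Date

- Opportunity Start Date

Note: in this notebook `days_to_apply` and `invalid_days_to_apply` are dropped in the cell above (In [120]) and `Opportunity Start Date` is dropped later, so they are not part of the final feature set. `Learner SignUp DateTime` and `Apply Date` are used only through the month/weekday/delay features derived from them.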

In [122]:
# Initial hand-picked list of categorical columns.
# NOTE(review): `cat_cols` is reassigned from `select_dtypes` in a later cell
# before it is first used (in `pd.get_dummies`), so this list has no effect.
cat_cols = [
    'Gender', 'Country', 'continent',
    'Opportunity Category', 'Institution Name'
]


In [123]:
# Check the remaining columns, dtypes and non-null counts after the drops above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8327 entries, 0 to 8540
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Learner SignUp DateTime          8327 non-null   object 
 1   Opportunity Name                 8327 non-null   object 
 2   Opportunity Category             8327 non-null   object 
 3   Gender                           8327 non-null   object 
 4   Country                          8327 non-null   object 
 5   Institution Name                 8327 non-null   object 
 6   Current/Intended Major           8327 non-null   object 
 7   Entry created at                 8327 non-null   object 
 8   Apply Date                       8327 non-null   object 
 9   Opportunity Start Date           4755 non-null   object 
 10  Age_at_Application_Int           8327 non-null   int64  
 11  Institution_Participation_Count  8327 non-null   int64  
 12  continent                

In [124]:
# Parse the signup timestamp once, derive calendar features from it, then
# discard the raw column.
signup_ts = pd.to_datetime(df['Learner SignUp DateTime'])
df['signup_month'] = signup_ts.dt.month
df['signup_weekday'] = signup_ts.dt.weekday  # Monday=0 ... Sunday=6
df = df.drop(columns=['Learner SignUp DateTime'])



# The opportunity start date is not used as a feature.
df = df.drop(columns='Opportunity Start Date')



Converting datetime columns into behavioral features

In [128]:
# Inspect dtypes before building the date-based features.
# NOTE(review): the saved output of this cell (In [128]) already shows
# 'Entry created at' and 'Apply Date' as datetime64, i.e. it was executed after
# the conversion cell that appears below it (In [127]). On a fresh
# Restart & Run All these two columns would still be `object` at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8327 entries, 0 to 8540
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   Opportunity Name                 8327 non-null   object        
 1   Opportunity Category             8327 non-null   object        
 2   Gender                           8327 non-null   object        
 3   Country                          8327 non-null   object        
 4   Institution Name                 8327 non-null   object        
 5   Current/Intended Major           8327 non-null   object        
 6   Entry created at                 8327 non-null   datetime64[ns]
 7   Apply Date                       8327 non-null   datetime64[ns]
 8   Age_at_Application_Int           8327 non-null   int64         
 9   Institution_Participation_Count  8327 non-null   int64         
 10  continent                        8327 non-null   object        
 

In [127]:
# Convert the remaining date columns from strings to datetime64.
for date_col in ('Entry created at', 'Apply Date'):
    df[date_col] = pd.to_datetime(df[date_col])


In [129]:
apply_ts = df['Apply Date']

# Whole days from 'Entry created at' to 'Apply Date' (negative when the entry
# was created after the application).
# NOTE(review): this is not the signup-to-apply gap; the signup column is
# 'Learner SignUp DateTime'. In the head() preview 'Entry created at' is
# 2024-11-03 on every displayed row, so it may be a record/export timestamp -
# confirm that this feature carries the intended signal.
df['apply_delay_days'] = (apply_ts - df['Entry created at']).dt.days

# Calendar features of the application date.
df['apply_month'] = apply_ts.dt.month
df['apply_weekday'] = apply_ts.dt.weekday  # Monday=0 ... Sunday=6


In [130]:
# The raw date columns are no longer needed once the derived features exist.
raw_date_cols = ['Entry created at', 'Apply Date']
df = df.drop(columns=raw_date_cols)


In [131]:
# Handle CATEGORICAL columns
cat_cols = df.select_dtypes(include='object').columns
cat_cols


Index(['Opportunity Name', 'Opportunity Category', 'Gender', 'Country',
       'Institution Name', 'Current/Intended Major', 'continent'],
      dtype='object')

In [132]:
# One-hot encode the categorical columns; drop_first=True removes the first
# level of each column so k categories become k-1 indicator columns.
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)


In [133]:
# Inspect the frame's size and dtypes after one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8327 entries, 0 to 8540
Columns: 2179 entries, Age_at_Application_Int to continent_South America
dtypes: bool(2171), float64(1), int32(4), int64(3)
memory usage: 17.7 MB


In [135]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Numeric feature columns; the target is excluded so it is never scaled.
num_cols = df.select_dtypes(include=['int64','int32','float64']).columns
num_cols = num_cols.drop('churn')

# Fit the scaler on the training rows only, so the test set's mean/variance do
# not leak into preprocessing. The row positions are obtained with the same
# test_size / random_state / stratify settings as the train/test split cell
# further down, which therefore selects exactly these rows for training.
# Keep the two sets of parameters in sync.
train_pos, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df['churn']
)

scaler = StandardScaler()
scaler.fit(df[num_cols].iloc[train_pos])

# Apply the training-set statistics to every row (train and test alike).
df[num_cols] = scaler.transform(df[num_cols])


In [136]:
# Confirm the column dtypes after scaling the numeric features.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 8327 entries, 0 to 8540
Columns: 2179 entries, Age_at_Application_Int to continent_South America
dtypes: bool(2171), float64(8)
memory usage: 17.8 MB


In [137]:
from sklearn.model_selection import train_test_split

# Separate the target from the feature matrix.
y = df['churn']
X = df.drop(columns=['churn'])

# 80/20 hold-out split, stratified on the target so both parts keep the same
# churn ratio; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Train BASELINE model (Logistic Regression)

In [138]:
from sklearn.linear_model import LogisticRegression

# Baseline linear model; class_weight='balanced' reweights classes inversely
# to their frequency, max_iter is raised above the default of 100.
lr_settings = {'max_iter': 1000, 'class_weight': 'balanced'}
lr = LogisticRegression(**lr_settings)
lr.fit(X_train, y_train)


In [139]:
from sklearn.metrics import classification_report, roc_auc_score

# Probability of class 1 (churn) for ROC-AUC, hard labels for the report.
y_prob = lr.predict_proba(X_test)[:, 1]
y_pred = lr.predict(X_test)

lr_report = classification_report(y_test, y_pred)
lr_auc = roc_auc_score(y_test, y_prob)

print(lr_report)
print("ROC-AUC:", lr_auc)


              precision    recall  f1-score   support

         0.0       0.94      0.83      0.88       813
         1.0       0.86      0.95      0.90       853

    accuracy                           0.89      1666
   macro avg       0.90      0.89      0.89      1666
weighted avg       0.90      0.89      0.89      1666

ROC-AUC: 0.9401526195801231


Train STRONG model (Random Forest)

In [140]:
from sklearn.ensemble import RandomForestClassifier

# 300 depth-limited trees with balanced class weights; seeded for
# reproducibility.
rf_settings = dict(
    n_estimators=300,
    max_depth=12,
    class_weight='balanced',
    random_state=42,
)
rf = RandomForestClassifier(**rf_settings)

rf.fit(X_train, y_train)


In [141]:
# Probability of class 1 (churn) for ROC-AUC, hard labels for the report.
prob_rf = rf.predict_proba(X_test)[:, 1]
pred_rf = rf.predict(X_test)

rf_report = classification_report(y_test, pred_rf)
rf_auc = roc_auc_score(y_test, prob_rf)

print(rf_report)
print("ROC-AUC:", rf_auc)


              precision    recall  f1-score   support

         0.0       0.99      0.75      0.85       813
         1.0       0.81      0.99      0.89       853

    accuracy                           0.88      1666
   macro avg       0.90      0.87      0.87      1666
weighted avg       0.90      0.88      0.87      1666

ROC-AUC: 0.9444497317188881


## Gradient Boosting

In [142]:
from sklearn.ensemble import GradientBoostingClassifier

# 300 shallow trees with a small learning rate; seeded for reproducibility.
gb_settings = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
gb = GradientBoostingClassifier(**gb_settings)

gb.fit(X_train, y_train)


In [143]:
# Probability of class 1 (churn) for ROC-AUC, hard labels for the report.
gb_prob = gb.predict_proba(X_test)[:, 1]
gb_pred = gb.predict(X_test)

gb_report = classification_report(y_test, gb_pred)
gb_auc = roc_auc_score(y_test, gb_prob)

print(gb_report)
print("GB ROC-AUC:", gb_auc)


              precision    recall  f1-score   support

         0.0       0.97      0.84      0.90       813
         1.0       0.86      0.97      0.92       853

    accuracy                           0.91      1666
   macro avg       0.92      0.91      0.91      1666
weighted avg       0.91      0.91      0.91      1666

GB ROC-AUC: 0.9661631258751041


In [145]:
# Pair each model's display name with its predicted churn probabilities, then
# build a one-row-per-model comparison table of test-set ROC-AUC.
model_probs = [
    ('Logistic Regression', y_prob),
    ('Random Forest', prob_rf),
    ('Gradient Boosting', gb_prob),
]

results = pd.DataFrame({
    'Model': [label for label, _ in model_probs],
    'ROC-AUC': [roc_auc_score(y_test, probs) for _, probs in model_probs],
})

results


Unnamed: 0,Model,ROC-AUC
0,Logistic Regression,0.940153
1,Random Forest,0.94445
2,Gradient Boosting,0.966163


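Three models were trained: Logistic Regression, Random Forest, and Gradient Boosting. Gradient Boosting achieved the highest ROC-AUC (0.966), accuracy (0.91), and churn F1-score (0.92), and was selected as the final model. Random Forest had the highest churn recall (0.99, versus 0.97 for Gradient Boosting), but with lower churn precision (0.81 versus 0.86).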