In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import joblib

In [3]:
# Seed NumPy's global generator so every run draws the same synthetic crew data.
np.random.seed(42)
n_samples = 3000

print("üîß Generating improved training data...")

# Build the raw columns one at a time. All draws consume the same seeded global
# random stream, so the order of the statements below determines the data and
# must not be rearranged.
# NOTE(review): season and month are sampled independently of each other, so a
# row can pair e.g. "Summer" with month 2.
data = {"crew_id": ["C" + str(crew_number) for crew_number in range(1000, 1000 + n_samples)]}
data["season"] = np.random.choice(["Winter", "Spring", "Summer", "Fall"], size=n_samples)
data["month"] = np.random.randint(1, 13, size=n_samples)  # upper bound exclusive -> 1..12
data["days_since_last_sick_leave"] = np.random.randint(0, 180, size=n_samples)
data["workload_last_7_days"] = np.random.randint(10, 70, size=n_samples)
data["consecutive_duty_days"] = np.random.randint(1, 14, size=n_samples)
data["avg_flight_duration_last_week"] = np.random.uniform(1.0, 10.0, size=n_samples).round(1)
data["historical_sick_days_count"] = np.random.randint(0, 15, size=n_samples)
data["age_group"] = np.random.choice(["20-30", "31-40", "41-50", "50+"], size=n_samples)
data["flight_type_ratio"] = np.random.uniform(0.0, 1.0, size=n_samples).round(2)

# One row per synthetic crew member; column order follows insertion order above.
df = pd.DataFrame(data)
print(f"‚úÖ Generated {len(df)} samples")
df.head()

üîß Generating improved training data...
‚úÖ Generated 3000 samples


Unnamed: 0,crew_id,season,month,days_since_last_sick_leave,workload_last_7_days,consecutive_duty_days,avg_flight_duration_last_week,historical_sick_days_count,age_group,flight_type_ratio
0,C1000,Summer,2,74,57,12,3.5,2,50+,0.43
1,C1001,Fall,6,107,18,8,9.4,1,20-30,0.41
2,C1002,Winter,12,167,24,13,4.3,7,50+,0.43
3,C1003,Summer,5,42,39,9,8.7,5,20-30,0.97
4,C1004,Summer,10,149,35,10,3.4,6,31-40,0.68


In [4]:
# Composite stress indicator: a fixed weighted sum of workload, consecutive
# duty days and average flight duration. It is stored on the frame and is
# later used as a model feature.
df["stress_score"] = (
    df["workload_last_7_days"].mul(0.4)
    .add(df["consecutive_duty_days"].mul(0.3))
    .add(df["avg_flight_duration_last_week"].mul(0.3))
)

# Each driver is rescaled by a fixed divisor and paired with its weight in the
# risk score. The order of this list is the order in which the terms are
# summed, which keeps the floating-point result reproducible.
weighted_terms = [
    (0.25, df["workload_last_7_days"] / 70),        # workload
    (0.25, df["consecutive_duty_days"] / 14),       # consecutive duty days
    (0.15, df["historical_sick_days_count"] / 15),  # sick-leave history
    # Recency of the last sick leave: 1 right after a leave, falling towards 0
    # as the number of days since then approaches 180.
    (0.15, (1 - (df["days_since_last_sick_leave"] / 180)).clip(lower=0, upper=1)),
    (0.10, df["stress_score"] / 50),                # overall stress
]

# Additive adjustment per season (largest for Winter, none for Summer).
season_risk = df["season"].map({
    "Winter": 0.3,
    "Spring": 0.1,
    "Summer": 0.0,
    "Fall": 0.2,
})

# Additive adjustment per age band (grows with age).
age_risk = df["age_group"].map({
    "20-30": 0.0,
    "31-40": 0.05,
    "41-50": 0.1,
    "50+": 0.15,
})

# Accumulate the weighted terms left to right, then add the two adjustments.
first_weight, first_term = weighted_terms[0]
risk_score = first_weight * first_term
for weight, term in weighted_terms[1:]:
    risk_score = risk_score + weight * term
risk_score = risk_score + season_risk + age_risk

# Gaussian jitter (sd 0.05) so the label is not a perfectly deterministic
# function of the features; afterwards the score is clamped to [0, 1].
risk_score = risk_score + np.random.normal(0, 0.05, n_samples)
risk_score = risk_score.clip(lower=0, upper=1)

# Logistic squashing centred on 0.5 with steepness 8.
exponent = -8 * (risk_score - 0.5)
probability = 1 / (1 + np.exp(exponent))

# Binarise the probability. Despite its name, the "sickness_probability"
# column holds a 0/1 label, not a probability.
# NOTE(review): the recorded run shows ~86% positives, so this cut-off does
# not produce balanced classes.
threshold = 0.45
df["sickness_probability"] = probability.gt(threshold).astype(int)

print(f"\nüìä Class Distribution:")
print(df["sickness_probability"].value_counts())
print(f"\nPositive class rate: {df['sickness_probability'].mean():.2%}")
print(f"\nSample predictions range: {probability.min():.2f} to {probability.max():.2f}")

df[["stress_score", "sickness_probability"]].head(10)


üìä Class Distribution:
1    2586
0     414
Name: sickness_probability, dtype: int64

Positive class rate: 86.20%

Sample predictions range: 0.05 to 0.98


Unnamed: 0,stress_score,sickness_probability
0,27.45,1
1,12.42,1
2,14.79,1
3,20.91,1
4,18.02,1
5,10.48,1
6,16.1,1
7,29.67,1
8,13.15,1
9,23.55,1


In [5]:
# Target is the binary label; the feature matrix is every other column except
# the crew identifier.
y = df["sickness_probability"]
X = df.drop(columns=["crew_id", "sickness_probability"])

# Split the feature columns into the two groups the preprocessor handles.
# Everything that is not explicitly categorical is treated as numerical
# (sorted by name).
categorical_cols = ["season", "age_group"]
numerical_cols = sorted(col for col in X.columns if col not in categorical_cols)

print(f"Numerical features: {numerical_cols}")
print(f"Categorical features: {categorical_cols}")

# Standardise the numerical columns and one-hot encode the categorical ones.
# handle_unknown="ignore" makes categories unseen during fit encode as all
# zeros instead of raising at prediction time.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Random forest settings, kept together so they are easy to review and tune.
forest_params = {
    "n_estimators": 200,
    "max_depth": 12,             # cap on tree depth
    "min_samples_split": 10,     # a node needs at least 10 rows to be split
    "min_samples_leaf": 5,       # every leaf keeps at least 5 rows
    "random_state": 42,          # reproducible forest
    "class_weight": "balanced",  # reweight classes inversely to their frequency
}
model = RandomForestClassifier(**forest_params)

# Bundle preprocessing and classifier so they are fitted, applied and saved
# as a single object.
pipeline = Pipeline([("preprocessing", preprocessor), ("model", model)])

print("‚úÖ Pipeline created successfully")

Numerical features: ['avg_flight_duration_last_week', 'consecutive_duty_days', 'days_since_last_sick_leave', 'flight_type_ratio', 'historical_sick_days_count', 'month', 'stress_score', 'workload_last_7_days']
Categorical features: ['season', 'age_group']
‚úÖ Pipeline created successfully


In [6]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in the training and test splits; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Fit preprocessing and classifier together on the training split only.
print("\nüöÄ Training model...")
pipeline.fit(X_train, y_train)
print("‚úÖ Training complete!")

# Column 1 of predict_proba is the probability of the positive class (label 1);
# predict() returns the hard class labels.
y_prob = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

# Evaluate performance on the held-out split.
print("\n" + "="*60)
print("üìà MODEL PERFORMANCE")
print("="*60)
print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_prob):.4f}")

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Sickness', 'Sickness']))

print("\nüìä Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Histogram of predicted probabilities in 20-point buckets.
# include_lowest=True closes the first bucket on the left ([0, 0.2]). Without
# it the buckets are (0, 0.2], (0.2, 0.4], ..., so a predicted probability of
# exactly 0.0 falls outside every bucket, becomes NaN and silently disappears
# from the counts.
print("\nüìä Prediction Probability Distribution on Test Set:")
prob_bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
prob_labels = ['0-20%', '20-40%', '40-60%', '60-80%', '80-100%']
prob_dist = pd.cut(y_prob, bins=prob_bins, labels=prob_labels, include_lowest=True)
print(prob_dist.value_counts().sort_index())

Training set: 2400 samples
Test set: 600 samples

üöÄ Training model...
‚úÖ Training complete!

üìà MODEL PERFORMANCE

ROC-AUC Score: 0.9695

üìä Classification Report:
              precision    recall  f1-score   support

 No Sickness       0.67      0.86      0.75        83
    Sickness       0.98      0.93      0.95       517

    accuracy                           0.92       600
   macro avg       0.82      0.89      0.85       600
weighted avg       0.93      0.92      0.93       600


üìä Confusion Matrix:
[[ 71  12]
 [ 35 482]]

üìä Prediction Probability Distribution on Test Set:
0-20%       38
20-40%      44
40-60%      49
60-80%      61
80-100%    408
dtype: int64


In [7]:
# Sanity-check the fitted pipeline on a few hand-written crew profiles.
print("\nüß™ TESTING WITH SAMPLE SCENARIOS")
print("="*60)


def _scenario(name, **features):
    """Return a named scenario whose feature dict also carries stress_score.

    stress_score is derived from the scenario's own workload, consecutive duty
    days and average flight duration with the same 0.4 / 0.3 / 0.3 weights used
    for the training data, and is appended as the last feature.
    """
    stress_score = (
        0.4 * features["workload_last_7_days"]
        + 0.3 * features["consecutive_duty_days"]
        + 0.3 * features["avg_flight_duration_last_week"]
    )
    return {"name": name, "data": {**features, "stress_score": stress_score}}


test_cases = [
    _scenario(
        "‚úÖ Low Risk Crew (Young, Well-Rested)",
        season='Summer', month=7, days_since_last_sick_leave=150,
        workload_last_7_days=20, consecutive_duty_days=3,
        avg_flight_duration_last_week=3.0, historical_sick_days_count=1,
        age_group='20-30', flight_type_ratio=0.5,
    ),
    _scenario(
        "‚ö†Ô∏è Medium Risk Crew (Moderate Workload)",
        season='Fall', month=10, days_since_last_sick_leave=60,
        workload_last_7_days=45, consecutive_duty_days=7,
        avg_flight_duration_last_week=6.0, historical_sick_days_count=5,
        age_group='31-40', flight_type_ratio=0.7,
    ),
    _scenario(
        "üî¥ High Risk Crew (Overworked, Winter)",
        season='Winter', month=1, days_since_last_sick_leave=10,
        workload_last_7_days=65, consecutive_duty_days=12,
        avg_flight_duration_last_week=9.0, historical_sick_days_count=12,
        age_group='50+', flight_type_ratio=0.9,
    ),
    _scenario(
        "üü° Edge Case (High Hours, Good Rest)",
        season='Spring', month=4, days_since_last_sick_leave=120,
        workload_last_7_days=55, consecutive_duty_days=5,
        avg_flight_duration_last_week=4.5, historical_sick_days_count=3,
        age_group='31-40', flight_type_ratio=0.6,
    ),
]

# Score each profile as a one-row frame and report the positive-class
# probability as a percentage, labelled "At Risk" above 50%.
for scenario in test_cases:
    scenario_df = pd.DataFrame([scenario["data"]])
    risk_pct = pipeline.predict_proba(scenario_df)[0][1] * 100
    if risk_pct > 50:
        verdict = "At Risk"
    else:
        verdict = "Normal"
    print(f"\n{scenario['name']}")
    print(f"  Probability: {risk_pct:.1f}% ({verdict})")


üß™ TESTING WITH SAMPLE SCENARIOS

‚úÖ Low Risk Crew (Young, Well-Rested)
  Probability: 0.8% (Normal)

‚ö†Ô∏è Medium Risk Crew (Moderate Workload)
  Probability: 98.9% (At Risk)

üî¥ High Risk Crew (Overworked, Winter)
  Probability: 100.0% (At Risk)

üü° Edge Case (High Hours, Good Rest)
  Probability: 91.6% (At Risk)


In [8]:
# Persist the whole fitted pipeline (preprocessing + classifier) to one file.
# The path is relative, so the file lands in the current working directory.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
model_path = "crew_sickness_model.pkl"
joblib.dump(pipeline, model_path)

# Confirmation banner followed by hand-off instructions.
banner = "=" * 60
for line in (
    "\n" + banner,
    "‚úÖ MODEL SAVED SUCCESSFULLY!",
    banner,
    f"\nFile: {model_path}",
    "\nüìù Next Steps:",
    "1. Download this .pkl file",
    "2. Replace the old model file in your Streamlit app directory",
    "3. Run your Streamlit app",
    "4. Enjoy varied predictions! üéâ",
):
    print(line)


‚úÖ MODEL SAVED SUCCESSFULLY!

File: crew_sickness_model.pkl

üìù Next Steps:
1. Download this .pkl file
2. Replace the old model file in your Streamlit app directory
3. Run your Streamlit app
4. Enjoy varied predictions! üéâ
