In [1]:
# Install required packages
!pip install lightgbm xgboost catboost lazypredict scikit-learn scipy

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Setup complete!")


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.5.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.5.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.5.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow>=2.0.0->lazypredict)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Download

In [3]:
# Load TRAIN datasets
print("üì• Loading train datasets...")
train_patient = pd.read_csv('patient_train.csv')  # Use your actual train filenames
train_risk = pd.read_csv('risk_train.csv')
train_diagnosis = pd.read_csv('diagnosis_train.csv')
train_visit = pd.read_csv('visit_train.csv')
train_care = pd.read_csv('care_train.csv')

# Load TEST datasets
print("üì• Loading test datasets...")
test_patient = pd.read_csv('patient.csv')
test_diagnosis = pd.read_csv('diagnosis.csv')
test_visit = pd.read_csv('visit.csv')
test_care = pd.read_csv('care.csv')

print(f"Train: {len(train_patient)} patients, {len(train_risk)} risk scores")
print(f"Test: {len(test_patient)} patients")
print("‚úÖ Data loaded!")


üì• Loading train datasets...
üì• Loading test datasets...
Train: 8000 patients, 8000 risk scores
Test: 2001 patients
‚úÖ Data loaded!


In [4]:
def fast_feature_engineering(patient_df, diagnosis_df, visit_df, care_df):
    print("üõ†Ô∏è Engineering features...")
    features_df = patient_df.copy()

    # Convert boolean flags
    features_df['hot_spotter_readmission_flag'] = (features_df['hot_spotter_readmission_flag'] == 't').astype(int)
    features_df['hot_spotter_chronic_flag'] = (features_df['hot_spotter_chronic_flag'] == 't').astype(int)

    # Age groups
    features_df['age_group'] = pd.cut(features_df['age'], bins=[0, 30, 50, 70, 100], labels=[1, 2, 3, 4]).astype(int)

    # Diagnosis aggregations
    if len(diagnosis_df) > 0:
        diag_agg = diagnosis_df.groupby('patient_id').agg({
            'condition_name': 'count',
            'is_chronic': lambda x: (x == 't').sum(),
        }).rename(columns={'condition_name': 'total_conditions', 'is_chronic': 'chronic_conditions'})

        # Condition types
        condition_types = diagnosis_df.groupby('patient_id')['condition_name'].apply(
            lambda x: len(set(x))
        ).rename('unique_conditions')
        diag_agg = diag_agg.join(condition_types)
    else:
        diag_agg = pd.DataFrame(index=patient_df['patient_id'])
        diag_agg['total_conditions'] = 0
        diag_agg['chronic_conditions'] = 0
        diag_agg['unique_conditions'] = 0

    # Visit aggregations
    if len(visit_df) > 0:
        visit_agg = visit_df.groupby('patient_id').agg({
            'visit_type': 'count',
            'readmsn_ind': lambda x: (x == 't').sum(),
        }).rename(columns={'visit_type': 'total_visits', 'readmsn_ind': 'readmissions'})

        # Emergency visits
        emergency_visits = visit_df[visit_df['visit_type'].isin(['ER', 'URGENT CARE'])].groupby('patient_id').size().fillna(0)
        visit_agg['emergency_visits'] = emergency_visits
        visit_agg['emergency_ratio'] = visit_agg['emergency_visits'] / (visit_agg['total_visits'] + 1)

        # Visit types
        for vtype in ['ER', 'URGENT CARE', 'INPATIENT']:
            visit_agg[f'{vtype.lower()}_visits'] = visit_df[visit_df['visit_type'] == vtype].groupby('patient_id').size().fillna(0)
    else:
        visit_agg = pd.DataFrame(index=patient_df['patient_id'])
        visit_agg['total_visits'] = 0
        visit_agg['readmissions'] = 0
        visit_agg['emergency_visits'] = 0
        visit_agg['emergency_ratio'] = 0
        visit_agg['er_visits'] = 0
        visit_agg['urgent care_visits'] = 0
        visit_agg['inpatient_visits'] = 0

    # Care aggregations
    if len(care_df) > 0:
        care_agg = care_df.groupby('patient_id').agg({
            'care_gap_ind': lambda x: (x == 't').sum(),
            'msrmnt_value': ['mean', 'max'],
            'msrmnt_type': 'count'
        })
        care_agg.columns = ['care_gaps', 'avg_measurement', 'max_measurement', 'total_care_events']

        # Care gap ratio
        care_agg['care_gap_ratio'] = care_agg['care_gaps'] / (care_agg['total_care_events'] + 1)
    else:
        care_agg = pd.DataFrame(index=patient_df['patient_id'])
        care_agg['care_gaps'] = 0
        care_agg['avg_measurement'] = 0
        care_agg['max_measurement'] = 0
        care_agg['total_care_events'] = 0
        care_agg['care_gap_ratio'] = 0

    # Merge all features
    features_df = features_df.set_index('patient_id')
    features_df = features_df.join(diag_agg, how='left')
    features_df = features_df.join(visit_agg, how='left')
    features_df = features_df.join(care_agg, how='left')

    # Fill missing values
    features_df = features_df.fillna(0)

    # Create interaction features
    features_df['age_chronic_score'] = features_df['age'] * features_df['chronic_conditions']
    features_df['visit_care_ratio'] = features_df['total_visits'] / (features_df['total_care_events'] + 1)
    features_df['readmission_rate'] = features_df['readmissions'] / (features_df['total_visits'] + 1)
    features_df['chronic_burden'] = features_df['chronic_conditions'] / (features_df['total_conditions'] + 1)

    features_df = features_df.reset_index()
    print(f"‚úÖ Created {len(features_df.columns)-1} features")
    return features_df

# Generate features for train and test
X_train_feat = fast_feature_engineering(train_patient, train_diagnosis, train_visit, train_care)
X_test_feat = fast_feature_engineering(test_patient, test_diagnosis, test_visit, test_care)


üõ†Ô∏è Engineering features...
‚úÖ Created 24 features
üõ†Ô∏è Engineering features...
‚úÖ Created 24 features


In [5]:
def create_text_features(train_diagnosis, test_diagnosis, train_visit, test_visit):
    print("üìù Processing text features...")

    # Combine diagnosis text per patient
    def get_patient_text(diagnosis_df, visit_df, patient_ids):
        patient_texts = []
        for pid in patient_ids:
            texts = []

            # Diagnosis descriptions
            if len(diagnosis_df) > 0:
                patient_diag = diagnosis_df[diagnosis_df['patient_id'] == pid]
                if not patient_diag.empty:
                    diag_text = ' '.join(patient_diag['condition_description'].fillna('').astype(str))
                    texts.append(diag_text)

            # Visit diagnoses
            if len(visit_df) > 0:
                patient_visits = visit_df[visit_df['patient_id'] == pid]
                if not patient_visits.empty:
                    visit_text = ' '.join(patient_visits['prncpl_diag_nm'].fillna('').astype(str))
                    texts.append(visit_text)

            combined = ' '.join(texts) if texts else ''
            patient_texts.append(combined)

        return patient_texts

    # Get text for all patients
    train_texts = get_patient_text(train_diagnosis, train_visit, train_patient['patient_id'])
    test_texts = get_patient_text(test_diagnosis, test_visit, test_patient['patient_id'])

    # TF-IDF vectorization
    tfidf = TfidfVectorizer(
        max_features=1500,
        ngram_range=(1, 2),
        stop_words='english',
        min_df=2,
        max_df=0.95
    )

    X_text_train = tfidf.fit_transform(train_texts)
    X_text_test = tfidf.transform(test_texts)

    print(f"‚úÖ Created {X_text_train.shape[1]} text features")
    return X_text_train, X_text_test

# Create text features
X_text_train, X_text_test = create_text_features(train_diagnosis, test_diagnosis, train_visit, test_visit)


üìù Processing text features...
‚úÖ Created 1500 text features


In [8]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

# Prepare numerical features - CONVERT OBJECT TYPES TO NUMERIC
feature_cols = [col for col in X_train_feat.columns if col not in ['patient_id']]
print("üîß Fixing data types...")

# Check and fix data types
X_train_df = X_train_feat[feature_cols].copy()
X_test_df = X_test_feat[feature_cols].copy()

# Convert object columns to numeric
label_encoders = {}
for col in X_train_df.columns:
    if X_train_df[col].dtype == 'object':
        print(f"Converting {col} from object to numeric")
        le = LabelEncoder()
        # Combine train and test for consistent encoding
        combined_values = pd.concat([X_train_df[col], X_test_df[col]]).astype(str)
        le.fit(combined_values)

        X_train_df[col] = le.transform(X_train_df[col].astype(str))
        X_test_df[col] = le.transform(X_test_df[col].astype(str))
        label_encoders[col] = le

# Convert to float64 (required for scipy.sparse)
X_train_df = X_train_df.astype('float64')
X_test_df = X_test_df.astype('float64')

X_train_num = X_train_df.values
X_test_num = X_test_df.values

print(f"Numerical features: {X_train_num.shape[1]}")
print(f"Text features: {X_text_train.shape[1]}")

# Convert numerical arrays to sparse format for consistency
X_train_num_sparse = csr_matrix(X_train_num)
X_test_num_sparse = csr_matrix(X_test_num)

# Combine numerical and text features
X_train_combined = hstack([X_train_num_sparse, X_text_train])
X_test_combined = hstack([X_test_num_sparse, X_text_test])

# Prepare target variable
y_train = train_risk.set_index('patient_id').loc[X_train_feat['patient_id'], 'risk_score'].values

print(f"Final feature matrix: {X_train_combined.shape}")
print(f"Target statistics: Mean={y_train.mean():.2f}, Std={y_train.std():.2f}")
print("‚úÖ Data types fixed and features combined!")


üîß Fixing data types...
Converting hot_spotter_identified_at from object to numeric
Numerical features: 24
Text features: 1500
Final feature matrix: (8000, 1524)
Target statistics: Mean=1.68, Std=2.52
‚úÖ Data types fixed and features combined!


In [10]:
# Quick model comparison with LazyPredict
print("üöÄ Running LazyPredict baseline...")

# Get number of samples (fix for sparse array)
n_samples = min(2000, X_train_combined.shape[0])
indices = np.random.choice(X_train_combined.shape[0], n_samples, replace=False)

# Convert to dense if sparse
if hasattr(X_train_combined, 'toarray'):
    X_sample = X_train_combined[indices].toarray()
else:
    X_sample = X_train_combined[indices]

y_sample = y_train[indices]

X_tr, X_val, y_tr, y_val = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

# LazyPredict
try:
    lazy_reg = LazyRegressor(verbose=0, ignore_warnings=True, random_state=42)
    models_comparison, predictions = lazy_reg.fit(X_tr, X_val, y_tr, y_val)
    print("üìä LazyPredict Results:")
    print(models_comparison.head(10))
except Exception as e:
    print(f"‚ö†Ô∏è LazyPredict failed: {e}")
    print("‚è© Skipping to main models...")


üöÄ Running LazyPredict baseline...


  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2030
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 95
[LightGBM] [Info] Start training from score 1.740950
üìä LazyPredict Results:
                                         Adjusted R-Squared  \
Model                                                         
Lars                       16609349595769328934250872832.00   
SGDRegressor                     52724867542591345786880.00   
GaussianProcessRegressor                             817.00   
RANSACRegressor                                        1.71   
AdaBoostRegressor                                      1.66   
OrthogonalMatchingPursuit                              1.64   
LinearSVR                                              1.52   
KernelRidge                                            1.49   
Lin

In [11]:
def train_gpu_models(X_train, y_train, X_test):
    print("üî• Training GPU-accelerated models...")

    # Convert sparse to dense if needed
    if hasattr(X_train, 'toarray'):
        X_train_dense = X_train.toarray()
        X_test_dense = X_test.toarray()
        print("Converted sparse to dense arrays")
    else:
        X_train_dense = X_train
        X_test_dense = X_test

    models = {}
    predictions = {}

    # 1. LightGBM (handles sparse and dense)
    print("Training LightGBM...")
    try:
        lgb_model = LGBMRegressor(
            n_estimators=800,
            learning_rate=0.08,
            num_leaves=31,
            max_depth=8,
            min_child_samples=20,
            subsample=0.8,
            colsample_bytree=0.8,
            device='gpu' if hasattr(X_train, 'toarray') else 'cpu',  # Auto-detect
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )
        lgb_model.fit(X_train, y_train)
        models['lgb'] = lgb_model
        predictions['lgb'] = lgb_model.predict(X_test)
        print("‚úÖ LightGBM trained successfully")
    except Exception as e:
        print(f"‚ùå LightGBM failed: {e}")
        # Fallback to CPU
        lgb_model = LGBMRegressor(
            n_estimators=500,
            learning_rate=0.1,
            num_leaves=31,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )
        lgb_model.fit(X_train_dense, y_train)
        models['lgb'] = lgb_model
        predictions['lgb'] = lgb_model.predict(X_test_dense)
        print("‚úÖ LightGBM trained on CPU")

    # 2. XGBoost (GPU)
    print("Training XGBoost...")
    try:
        xgb_model = XGBRegressor(
            n_estimators=800,
            learning_rate=0.08,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method='gpu_hist',
            gpu_id=0,
            random_state=42,
            n_jobs=-1
        )
        xgb_model.fit(X_train_dense, y_train)
        models['xgb'] = xgb_model
        predictions['xgb'] = xgb_model.predict(X_test_dense)
        print("‚úÖ XGBoost trained on GPU")
    except Exception as e:
        print(f"‚ùå XGBoost GPU failed: {e}")
        # Fallback to CPU
        xgb_model = XGBRegressor(
            n_estimators=500,
            learning_rate=0.1,
            max_depth=6,
            random_state=42,
            n_jobs=-1
        )
        xgb_model.fit(X_train_dense, y_train)
        models['xgb'] = xgb_model
        predictions['xgb'] = xgb_model.predict(X_test_dense)
        print("‚úÖ XGBoost trained on CPU")

    # 3. CatBoost (GPU)
    print("Training CatBoost...")
    try:
        cat_model = CatBoostRegressor(
            iterations=800,
            learning_rate=0.08,
            depth=6,
            l2_leaf_reg=3,
            task_type='GPU',
            devices='0',
            random_seed=42,
            verbose=False
        )
        cat_model.fit(X_train_dense, y_train)
        models['cat'] = cat_model
        predictions['cat'] = cat_model.predict(X_test_dense)
        print("‚úÖ CatBoost trained on GPU")
    except Exception as e:
        print(f"‚ùå CatBoost GPU failed: {e}")
        # Fallback to CPU
        cat_model = CatBoostRegressor(
            iterations=500,
            learning_rate=0.1,
            depth=6,
            random_seed=42,
            verbose=False
        )
        cat_model.fit(X_train_dense, y_train)
        models['cat'] = cat_model
        predictions['cat'] = cat_model.predict(X_test_dense)
        print("‚úÖ CatBoost trained on CPU")

    return models, predictions

# Train models
models, predictions = train_gpu_models(X_train_combined, y_train, X_test_combined)


üî• Training GPU-accelerated models...
Converted sparse to dense arrays
Training LightGBM...
‚ùå LightGBM failed: No OpenCL device found
‚úÖ LightGBM trained on CPU
Training XGBoost...
‚ùå XGBoost GPU failed: [16:40:54] /workspace/src/context.cc:242: `gpu_id` has been removed since 3.1. Use `device` instead.
Stack trace:
  [bt] (0) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x2bdf8c) [0x7f82e5cbdf8c]
  [bt] (1) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x4ac700) [0x7f82e5eac700]
  [bt] (2) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x6ee464) [0x7f82e60ee464]
  [bt] (3) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x6f4777) [0x7f82e60f4777]
  [bt] (4) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x77) [0x7f82e5bcaa67]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7f83407abe2e]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7f834

In [12]:
# Create ensemble predictions
print("üéØ Creating ensemble...")

# Weights based on typical performance
weights = {'lgb': 0.4, 'xgb': 0.35, 'cat': 0.25}

ensemble_pred = sum(weights[name] * pred for name, pred in predictions.items())

# Ensure positive predictions
ensemble_pred = np.maximum(ensemble_pred, 0.1)

# Validation split for performance check
X_tr, X_val, y_tr, y_val = train_test_split(X_train_combined, y_train, test_size=0.2, random_state=42)

# Convert to dense for validation
if hasattr(X_tr, 'toarray'):
    X_tr_dense = X_tr.toarray()
    X_val_dense = X_val.toarray()
else:
    X_tr_dense = X_tr
    X_val_dense = X_val

# Quick validation models
print("üîç Validating models...")
val_predictions = {}

for name, model in models.items():
    # Create new instance with same parameters
    if name == 'lgb':
        val_model = LGBMRegressor(n_estimators=300, learning_rate=0.1, random_state=42, verbose=-1)
    elif name == 'xgb':
        val_model = XGBRegressor(n_estimators=300, learning_rate=0.1, random_state=42)
    else:  # catboost
        val_model = CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=42, verbose=False)

    val_model.fit(X_tr_dense, y_tr)
    val_pred = val_model.predict(X_val_dense)
    val_predictions[name] = val_pred

val_ensemble = sum(weights[name] * pred for name, pred in val_predictions.items())
val_mae = mean_absolute_error(y_val, val_ensemble)
val_r2 = r2_score(y_val, val_ensemble)

print(f"üéØ Validation Results:")
print(f"Ensemble - MAE: {val_mae:.4f}, R¬≤: {val_r2:.4f}")

# Individual model performance
for name, pred in val_predictions.items():
    mae = mean_absolute_error(y_val, pred)
    r2 = r2_score(y_val, pred)
    print(f"{name.upper()} - MAE: {mae:.4f}, R¬≤: {r2:.4f}")


üéØ Creating ensemble...
üîç Validating models...
üéØ Validation Results:
Ensemble - MAE: 0.8292, R¬≤: 0.4583
LGB - MAE: 0.8658, R¬≤: 0.4334
XGB - MAE: 0.8470, R¬≤: 0.3934
CAT - MAE: 0.8389, R¬≤: 0.4303


In [13]:
# Create submission file
print("üíæ Creating submission...")

submission = pd.DataFrame({
    'patient_id': test_patient['patient_id'],
    'predicted_risk_score': ensemble_pred
})

# Quality checks
print(f"Predictions summary:")
print(f"Min: {ensemble_pred.min():.4f}")
print(f"Max: {ensemble_pred.max():.4f}")
print(f"Mean: {ensemble_pred.mean():.4f}")
print(f"Std: {ensemble_pred.std():.4f}")

# Save submission
submission.to_csv('Prediction.csv', index=False)

print("‚úÖ Prediction.csv saved successfully!")
print(f"üìä Submission shape: {submission.shape}")
print(submission.head())


üíæ Creating submission...
Predictions summary:
Min: 0.1000
Max: 20.3307
Mean: 1.6945
Std: 1.8947
‚úÖ Prediction.csv saved successfully!
üìä Submission shape: (2001, 2)
   patient_id  predicted_risk_score
0         276                  0.58
1         309                  0.56
2         327                  0.75
3         333                  2.97
4         344                  1.41


In [14]:
# Create submission file
print("üíæ Creating submission...")

submission = pd.DataFrame({
    'patient_id': test_patient['patient_id'],
    'predicted_risk_score': ensemble_pred
})

# Quality checks
print(f"Predictions summary:")
print(f"Min: {ensemble_pred.min():.4f}")
print(f"Max: {ensemble_pred.max():.4f}")
print(f"Mean: {ensemble_pred.mean():.4f}")
print(f"Std: {ensemble_pred.std():.4f}")

# Save submission
submission.to_csv('Prediction.csv', index=False)

print("‚úÖ Prediction.csv saved successfully!")
print(f"üìä Submission shape: {submission.shape}")
print(submission.head())


üíæ Creating submission...
Predictions summary:
Min: 0.1000
Max: 20.3307
Mean: 1.6945
Std: 1.8947
‚úÖ Prediction.csv saved successfully!
üìä Submission shape: (2001, 2)
   patient_id  predicted_risk_score
0         276                  0.58
1         309                  0.56
2         327                  0.75
3         333                  2.97
4         344                  1.41
