In [None]:
# ====================================================================
# ML MODEL TRAINING
# DNA Gene Mapping Project
# Author: Sharique Mohammad
# Date: 12 January 2026
# ====================================================================

In [None]:
# Import Libraries

import os
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from dotenv import load_dotenv
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from xgboost import XGBClassifier

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the ML libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load variables from a .env file into the process environment; the POSTGRES_*
# settings are read later through os.getenv().
load_dotenv()

print("Libraries imported successfully")

In [None]:
# Setup Paths

# Resolve the project root once: when the kernel was started from inside the
# 'jupyter_notebooks' folder the root is one level up, otherwise it is the
# current working directory itself.
cwd = Path.cwd()
PROJECT_ROOT = cwd.parent if cwd.name == 'jupyter_notebooks' else cwd

# Output locations used by the rest of the notebook.
ML_DIR = PROJECT_ROOT / "data" / "ml"
MODELS_DIR = PROJECT_ROOT / "models"
VIZ_DIR = PROJECT_ROOT / "data" / "analytical" / "visualizations"

# Create each directory (and any missing parents); existing ones are left alone.
for output_dir in (ML_DIR, MODELS_DIR, VIZ_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"ML directory: {ML_DIR}")
print(f"Models directory: {MODELS_DIR}")

In [None]:
# Database Connection & Load Data

# Connection settings come from the environment, with local-development
# defaults for everything except the password (which has no default).
DB_CONFIG = {
    'host': os.getenv('POSTGRES_HOST', 'localhost'),
    'port': int(os.getenv('POSTGRES_PORT', 5432)),
    'database': os.getenv('POSTGRES_DATABASE', 'genome_db'),
    'user': os.getenv('POSTGRES_USER', 'postgres'),
    'password': os.getenv('POSTGRES_PASSWORD')
}

# Build the URL with URL.create() instead of string interpolation:
#   * credentials containing '@', ':', '/' or '#' are escaped correctly, and
#   * an unset POSTGRES_PASSWORD stays "no password" rather than being
#     embedded in the URL as the literal text "None".
db_url = URL.create(
    drivername="postgresql",
    username=DB_CONFIG['user'],
    password=DB_CONFIG['password'],
    host=DB_CONFIG['host'],
    port=DB_CONFIG['port'],
    database=DB_CONFIG['database'],
)
engine = create_engine(db_url)

# Pull the full feature table into memory; every later cell works from df_ml.
print("Loading ML features from PostgreSQL...")
df_ml = pd.read_sql("SELECT * FROM gold.ml_features", engine)
print(f"Loaded {len(df_ml)} genes with ML features")

In [None]:
# Data Exploration

# First look at the raw feature table: sample rows, dimensions, schema and
# the class balance of the target column.
section_rule = "=" * 40
print("\n" + section_rule)
print("ML FEATURES OVERVIEW")
print(section_rule)

print(df_ml.head())
print("\nShape:", df_ml.shape)
print("\nColumns:", list(df_ml.columns))

# Header and table on separate lines, exactly as two consecutive prints would.
print("\nData types:", df_ml.dtypes, sep="\n")
print("\nTarget variable distribution:", df_ml['risk_level'].value_counts(), sep="\n")

In [None]:
# Feature Engineering & Preparation

print("\n" + "="*40)
print("FEATURE ENGINEERING")
print("="*40)

# Remove genes with NULL chromosome (they cannot be one-hot encoded below).
df_ml_clean = df_ml[df_ml['chromosome'].notna()].copy()
print(f"Removed {len(df_ml) - len(df_ml_clean)} genes with NULL chromosome")

# Create additional features
# benign_ratio: mask zero denominators so genes with no recorded mutations get
# NaN (undefined ratio) instead of +/-inf. NaN is handled by the median
# imputation in the next cell, whereas inf would pass through fillna() and
# make StandardScaler fail.
mutation_count = df_ml_clean['mutation_count']
df_ml_clean['benign_ratio'] = (
    df_ml_clean['total_benign'] / mutation_count.where(mutation_count != 0)
)
# The +1 keeps the denominator non-zero when a gene has no benign variants.
df_ml_clean['pathogenic_to_benign'] = df_ml_clean['total_pathogenic'] / (df_ml_clean['total_benign'] + 1)
df_ml_clean['has_diseases'] = (df_ml_clean['disease_count'] > 0).astype(int)
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/val/test split, so this flag carries a small amount of information
# from the held-out rows.
density_median = df_ml_clean['mutation_density'].median()
df_ml_clean['high_mutation_density'] = (df_ml_clean['mutation_density'] > density_median).astype(int)

print("Created additional features:")
print("  - benign_ratio")
print("  - pathogenic_to_benign")
print("  - has_diseases")
print("  - high_mutation_density")

# One-hot encode chromosome. drop_first=True drops one level to avoid a
# redundant (perfectly collinear) indicator column.
df_encoded = pd.get_dummies(df_ml_clean, columns=['chromosome'], prefix='chr', drop_first=True)
# Count only the generated dummies ('chr_<level>'), not any other column that
# merely happens to start with 'chr'.
chr_cols = [c for c in df_encoded.columns if c.startswith('chr_')]
print(f"\nOne-hot encoded chromosome (created {len(chr_cols)} features)")

In [None]:
# CELL 6: Prepare Features and Target

banner = "=" * 40
print("\n" + banner)
print("PREPARING FEATURES AND TARGET")
print(banner)

# Everything except the target and the identifier/bookkeeping columns is
# treated as a model input.
exclude_cols = ['gene_name', 'risk_level', 'created_at']
feature_cols = list(filter(lambda name: name not in exclude_cols, df_encoded.columns))

X = df_encoded[feature_cols].copy()
y = df_encoded['risk_level'].copy()

print(f"Features: {len(feature_cols)} columns")
print("Target: risk_level")
print(f"Samples: {len(X)}")

# Fill gaps with each column's median.
# NOTE(review): the medians are computed on the full dataset, before the
# train/val/test split, so held-out rows influence the imputed values.
column_medians = X.median()
X = X.fillna(column_medians)

# Show at most the first ten feature names, then summarise the remainder.
preview_limit = 10
print("\nFeature columns:")
for position, name in enumerate(feature_cols[:preview_limit], start=1):
    print(f"  {position}. {name}")
remaining = len(feature_cols) - preview_limit
if remaining > 0:
    print(f"  ... and {remaining} more")

print("\nTarget distribution:", y.value_counts(), sep="\n")
print("\nTarget proportions:", y.value_counts(normalize=True), sep="\n")

In [None]:
# CELL 7: Train/Val/Test Split

banner = "=" * 40
print("\n" + banner)
print("TRAIN/VAL/TEST SPLIT")
print(banner)

# Map the risk_level labels to integer codes; `le` is kept so the codes can be
# translated back to class names in reports.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Encoded classes: {le.classes_}")

# Split: 70% train, 15% val, 15% test.
# Step 1 holds out 30% of the rows; step 2 halves that hold-out into
# validation and test. Both steps stratify on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

n_total = len(X)
print(f"Train set: {len(X_train)} samples ({len(X_train) / n_total * 100:.1f}%)")
print(f"Val set:   {len(X_val)} samples ({len(X_val) / n_total * 100:.1f}%)")
print(f"Test set:  {len(X_test)} samples ({len(X_test) / n_total * 100:.1f}%)")

# Class balance of the training partition, reported with the original labels.
print("\nTrain set class distribution:")
train_dist = pd.Series(y_train).value_counts()
n_train = len(y_train)
for class_code, n_rows in train_dist.items():
    print(f"  {le.classes_[class_code]}: {n_rows} ({n_rows / n_train * 100:.1f}%)")

In [None]:
# CELL 8: Feature Scaling

banner = "=" * 40
print("\n" + banner)
print("FEATURE SCALING")
print(banner)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to all three partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("Applied StandardScaler to all features")
print(f"Feature mean (after scaling): {X_train_scaled.mean():.4f}")
print(f"Feature std (after scaling): {X_train_scaled.std():.4f}")

# Persist the fitted preprocessing objects next to the model artifacts so the
# same transformations can be reapplied outside this notebook.
scaler_path = MODELS_DIR / 'scaler.pkl'
encoder_path = MODELS_DIR / 'label_encoder.pkl'
joblib.dump(scaler, scaler_path)
joblib.dump(le, encoder_path)
print(f"\nSaved scaler to: {scaler_path}")
print(f"Saved label encoder to: {encoder_path}")

In [None]:
# Model 1 - Logistic Regression (Baseline)

print("\n" + "="*40)
print("MODEL 1: LOGISTIC REGRESSION (BASELINE)")
print("="*40)

# `multi_class` is deliberately not passed: the argument is deprecated since
# scikit-learn 1.5 and slated for removal, at which point passing it raises a
# TypeError. For a multiclass target the default already fits a multinomial
# model, so the baseline itself is unchanged.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
# Logistic regression is scale-sensitive, so it is trained on the standardized
# features (the tree-based models below use the raw ones).
lr_model.fit(X_train_scaled, y_train)

lr_train_pred = lr_model.predict(X_train_scaled)
lr_val_pred = lr_model.predict(X_val_scaled)

# Accuracy and support-weighted F1 on both partitions; the train/val gap is
# used later as an overfitting indicator.
lr_train_acc = accuracy_score(y_train, lr_train_pred)
lr_val_acc = accuracy_score(y_val, lr_val_pred)
lr_train_f1 = f1_score(y_train, lr_train_pred, average='weighted')
lr_val_f1 = f1_score(y_val, lr_val_pred, average='weighted')

print(f"Train Accuracy: {lr_train_acc:.4f}")
print(f"Train F1-Score: {lr_train_f1:.4f}")
print(f"Val Accuracy:   {lr_val_acc:.4f}")
print(f"Val F1-Score:   {lr_val_f1:.4f}")

print("\nClassification Report (Validation):")
print(classification_report(y_val, lr_val_pred, target_names=le.classes_))

In [None]:
# Model 2 - Random Forest

banner = "=" * 40
print("\n" + banner)
print("MODEL 2: RANDOM FOREST")
print(banner)

# Tree ensembles do not need standardized inputs, so the raw (unscaled)
# feature matrices are used here.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

rf_train_pred, rf_val_pred = (rf_model.predict(features) for features in (X_train, X_val))

# Training-partition metrics.
rf_train_acc = accuracy_score(y_train, rf_train_pred)
rf_train_f1 = f1_score(y_train, rf_train_pred, average='weighted')

# Validation-partition metrics.
rf_val_acc = accuracy_score(y_val, rf_val_pred)
rf_val_f1 = f1_score(y_val, rf_val_pred, average='weighted')

print(f"Train Accuracy: {rf_train_acc:.4f}")
print(f"Train F1-Score: {rf_train_f1:.4f}")
print(f"Val Accuracy:   {rf_val_acc:.4f}")
print(f"Val F1-Score:   {rf_val_f1:.4f}")

rf_val_report = classification_report(y_val, rf_val_pred, target_names=le.classes_)
print("\nClassification Report (Validation):")
print(rf_val_report)

In [None]:
# Model 3 - XGBoost

banner = "=" * 40
print("\n" + banner)
print("MODEL 3: XGBOOST")
print(banner)

# Default booster settings with a fixed seed, trained on the raw (unscaled)
# features.
xgb_settings = {'random_state': 42, 'n_jobs': -1, 'eval_metric': 'mlogloss'}
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

xgb_train_pred = xgb_model.predict(X_train)
xgb_val_pred = xgb_model.predict(X_val)

# Accuracy first, then support-weighted F1, each as a (train, val) pair.
xgb_train_acc, xgb_val_acc = (
    accuracy_score(y_train, xgb_train_pred),
    accuracy_score(y_val, xgb_val_pred),
)
xgb_train_f1, xgb_val_f1 = (
    f1_score(y_train, xgb_train_pred, average='weighted'),
    f1_score(y_val, xgb_val_pred, average='weighted'),
)

print(f"Train Accuracy: {xgb_train_acc:.4f}")
print(f"Train F1-Score: {xgb_train_f1:.4f}")
print(f"Val Accuracy:   {xgb_val_acc:.4f}")
print(f"Val F1-Score:   {xgb_val_f1:.4f}")

print("\nClassification Report (Validation):")
print(classification_report(y_val, xgb_val_pred, target_names=le.classes_))

In [None]:
# Model 4 - LightGBM

banner = "=" * 40
print("\n" + banner)
print("MODEL 4: LIGHTGBM")
print(banner)

# verbose=-1 is passed to keep LightGBM's training log out of the cell output.
lgbm_model = LGBMClassifier(
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model.fit(X_train, y_train)

# Score the training partition.
lgbm_train_pred = lgbm_model.predict(X_train)
lgbm_train_acc = accuracy_score(y_train, lgbm_train_pred)
lgbm_train_f1 = f1_score(y_train, lgbm_train_pred, average='weighted')

# Score the validation partition.
lgbm_val_pred = lgbm_model.predict(X_val)
lgbm_val_acc = accuracy_score(y_val, lgbm_val_pred)
lgbm_val_f1 = f1_score(y_val, lgbm_val_pred, average='weighted')

print(f"Train Accuracy: {lgbm_train_acc:.4f}")
print(f"Train F1-Score: {lgbm_train_f1:.4f}")
print(f"Val Accuracy:   {lgbm_val_acc:.4f}")
print(f"Val F1-Score:   {lgbm_val_f1:.4f}")

print("\nClassification Report (Validation):")
print(classification_report(y_val, lgbm_val_pred, target_names=le.classes_))

In [None]:
# Model Comparison

banner = "=" * 40
print("\n" + banner)
print("MODEL COMPARISON")
print(banner)

# One row per candidate: (name, train accuracy, val accuracy, train F1, val F1).
model_scores = [
    ('Logistic Regression', lr_train_acc, lr_val_acc, lr_train_f1, lr_val_f1),
    ('Random Forest', rf_train_acc, rf_val_acc, rf_train_f1, rf_val_f1),
    ('XGBoost', xgb_train_acc, xgb_val_acc, xgb_train_f1, xgb_val_f1),
    ('LightGBM', lgbm_train_acc, lgbm_val_acc, lgbm_train_f1, lgbm_val_f1),
]
comparison_df = pd.DataFrame(
    model_scores,
    columns=['Model', 'Train_Accuracy', 'Val_Accuracy', 'Train_F1', 'Val_F1'],
)
# Train-minus-validation F1: a larger gap points to more overfitting.
comparison_df['Overfit_Gap'] = comparison_df['Train_F1'] - comparison_df['Val_F1']

# Rank the candidates by validation F1, best first.
comparison_df = comparison_df.sort_values('Val_F1', ascending=False)
print("\nModel Performance Comparison:")
print(comparison_df.to_string(index=False))

# Visualization: accuracy on the left panel, weighted F1 on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panel_specs = [
    (['Train_Accuracy', 'Val_Accuracy'], 'Model Accuracy Comparison', 'Accuracy', {}),
    (['Train_F1', 'Val_F1'], 'Model F1-Score Comparison', 'F1-Score (Weighted)',
     {'color': ['green', 'orange']}),
]
for panel, (metric_cols, panel_title, y_label, plot_kwargs) in zip(axes, panel_specs):
    comparison_df.plot(x='Model', y=metric_cols, kind='bar', ax=panel, rot=45, **plot_kwargs)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_ylabel(y_label)
    panel.set_ylim([0, 1])
    panel.legend(['Train', 'Validation'])
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(VIZ_DIR / 'model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Select best model: the first row after sorting has the highest validation F1.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
best_val_f1 = top_row['Val_F1']
print(f"\nBest Model: {best_model_name}")
print(f"Best Val F1-Score: {best_val_f1:.4f}")

# Map to actual model object
model_map = dict(zip(
    ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM'],
    [lr_model, rf_model, xgb_model, lgbm_model],
))
best_model = model_map[best_model_name]

In [None]:
# Hyperparameter Tuning (Best Model)

print("\n" + "="*40)
print(f"HYPERPARAMETER TUNING: {best_model_name}")
print("="*40)

# Pick the search space and a fresh, untrained estimator for the model family
# that won the comparison. Any name other than the three tree-based models
# falls through to logistic regression.
if best_model_name == 'LightGBM':
    param_grid = {
        'num_leaves': [20, 31, 50],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 7, 10],
        'min_child_samples': [10, 20, 30]
    }
    base_model = LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1)

elif best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    base_model = RandomForestClassifier(random_state=42, n_jobs=-1)

elif best_model_name == 'XGBoost':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5]
    }
    base_model = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss')
else:
    param_grid = {
        'C': [0.1, 1.0, 10.0],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'saga']
    }
    # `multi_class` is not passed: it is deprecated since scikit-learn 1.5 and
    # slated for removal; the default already fits a multinomial model for
    # multiclass targets with both solvers in the grid.
    base_model = LogisticRegression(max_iter=1000, random_state=42)

# Size of the full grid = product of the per-parameter option counts. (Raising
# the first list's length to the number of parameters is only right when every
# list has the same length, which is not the case for the Random Forest and
# Logistic Regression grids.)
n_combinations = int(np.prod([len(options) for options in param_grid.values()]))

print("Starting RandomizedSearchCV...")
print(f"Parameter grid: {n_combinations} combinations")

random_search = RandomizedSearchCV(
    base_model,
    param_distributions=param_grid,
    # Never request more draws than the grid contains (the logistic regression
    # grid has only 6 combinations).
    n_iter=min(20, n_combinations),
    cv=5,
    scoring='f1_weighted',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Use scaled data for Logistic Regression, raw for tree-based
if best_model_name == 'Logistic Regression':
    random_search.fit(X_train_scaled, y_train)
else:
    random_search.fit(X_train, y_train)

print("\nBest parameters found:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest CV F1-Score: {random_search.best_score_:.4f}")

# best_estimator_ is the winning configuration refit on the full training set
# (RandomizedSearchCV's default refit behaviour).
final_model = random_search.best_estimator_

In [None]:
# Final Model Evaluation

banner = "=" * 40
print("\n" + banner)
print("FINAL MODEL EVALUATION")
print(banner)

# Predictions on all sets. The tuned model must see the same feature
# representation it was trained on: standardized for logistic regression,
# raw for the tree-based models.
if best_model_name == 'Logistic Regression':
    eval_features = (X_train_scaled, X_val_scaled, X_test_scaled)
else:
    eval_features = (X_train, X_val, X_test)

final_train_pred, final_val_pred, final_test_pred = (
    final_model.predict(features) for features in eval_features
)

# Support-weighted F1 per partition (train, validation, test).
final_train_f1, final_val_f1, final_test_f1 = (
    f1_score(truth, predicted, average='weighted')
    for truth, predicted in (
        (y_train, final_train_pred),
        (y_val, final_val_pred),
        (y_test, final_test_pred),
    )
)

print(f"Train F1-Score: {final_train_f1:.4f}")
print(f"Val F1-Score:   {final_val_f1:.4f}")
print(f"Test F1-Score:  {final_test_f1:.4f}")

test_report = classification_report(y_test, final_test_pred, target_names=le.classes_)
print("\nTest Set Classification Report:")
print(test_report)

In [None]:
# Save Final Model

banner = "=" * 40
print("\n" + banner)
print("SAVING FINAL MODEL")
print(banner)

# Artifact locations, all inside the models directory.
model_path = MODELS_DIR / 'best_model.pkl'
features_path = MODELS_DIR / 'feature_names.json'
info_path = MODELS_DIR / 'model_info.json'

# Save model
joblib.dump(final_model, model_path)
print(f"Saved model to: {model_path}")

# Save feature names: a single-record JSON file holding the ordered list of
# input columns.
feature_names = {'features': feature_cols}
features_frame = pd.DataFrame([feature_names])
features_frame.to_json(features_path, orient='records')
print(f"Saved feature names to: {features_path}")

# Save model info: the chosen model family, its scores on each partition, the
# tuned hyperparameters and basic dataset facts.
model_info = dict(
    model_name=best_model_name,
    train_f1=final_train_f1,
    val_f1=final_val_f1,
    test_f1=final_test_f1,
    best_params=random_search.best_params_,
    n_features=len(feature_cols),
    n_samples=len(X),
    classes=le.classes_.tolist(),
)
info_frame = pd.DataFrame([model_info])
info_frame.to_json(info_path, orient='records', indent=2)
print(f"Saved model info to: {info_path}")

print("\n" + banner)
print("ML MODEL TRAINING COMPLETE!")
print(banner)
print(f"Best Model: {best_model_name}")
print(f"Test F1-Score: {final_test_f1:.4f}")
print(f"Model saved to: {MODELS_DIR}")
print(banner)