# In [1]:
# genetic_disease_model.py (converted from .ipynb)

import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Train a random-forest classifier that predicts a coarse risk level
# ('High' / 'Moderate' / 'Low') derived from the "Genetic Disorder" column,
# report accuracy on a held-out 20% split, and persist the fitted model
# together with the encoded feature-column layout.

# Load dataset. The location can be overridden through the
# GENETIC_DISORDERS_CSV environment variable so the script is not tied to one
# machine; when the variable is unset or empty the original path is used.
DEFAULT_DATA_PATH = "/home/sanglap/Documents/Project/my_project/archive/train_genetic_disorders.csv"
df = pd.read_csv(os.environ.get("GENETIC_DISORDERS_CSV") or DEFAULT_DATA_PATH)

# Drop rows whose target column ("Genetic Disorder") is missing
df = df.dropna(subset=["Genetic Disorder"])

# Create a risk level target based on disorder type
risk_mapping = {
    'Mitochondrial genetic inheritance disorders': 'High',
    'Single-gene inheritance disorders': 'Moderate',
    'Multifactorial genetic inheritance disorders': 'Low',
}

# Map the disorder to risk level; disorders absent from risk_mapping become NaN
df['Risk Level'] = df['Genetic Disorder'].map(risk_mapping)

# Drop rows with unmapped risk
df = df.dropna(subset=['Risk Level'])

# Define features
features = [
    "Mother's age", "Father's age", 'Blood test result', 'Birth defects', 'Gender',
    "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'White Blood cell count (thousand per microliter)', 'Blood cell count (mcL)',
    'Folic acid details (peri-conceptional)', 'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)', 'H/O substance abuse',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'No. of previous abortion',
]

# Keep only rows where every selected feature is present
df = df.dropna(subset=features)

# One-hot encode the non-numeric feature columns and prepare data
X = pd.get_dummies(df[features])
y = df['Risk Level']

# Split data (fixed seed so the 80/20 split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model on the held-out split
preds = model.predict(X_test)
preds_proba = model.predict_proba(X_test)
# Highest class probability per test row; not used further in this script,
# kept as a module-level name for interactive inspection.
pred_confidence = [max(prob) for prob in preds_proba]
acc = accuracy_score(y_test, preds)
print(f"Validation Accuracy: {acc*100:.2f}%")

# Save the encoded column layout and the model together, only after training
# and evaluation succeeded, so a failed run cannot leave a fresh feature file
# next to an older (or missing) model file.
# NOTE(review): presumably the inference code loads model_features.pkl to
# rebuild the same one-hot column order -- confirm against the consumer.
joblib.dump(X.columns, "model_features.pkl")
joblib.dump(model, "genetic_risk_model.pkl")


# Output: Validation Accuracy: 83.31%


# Output: ['genetic_risk_model.pkl']