In [22]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

# === TRAINING PHASE ===

# Load dataset
df = pd.read_csv("C:/Users/ASUS/Desktop/tech/patients.csv")

# Preprocess blood pressure
bp_split = df["blood_pressure"].str.split("/", expand=True)
df["systolic_bp"] = pd.to_numeric(bp_split[0], errors='coerce')
df["diastolic_bp"] = pd.to_numeric(bp_split[1], errors='coerce')
df.drop(columns=["blood_pressure", "patient_id"], inplace=True)

# Encode target
df["readmitted_30_days"] = df["readmitted_30_days"].map({"Yes": 1, "No": 0})

# Features and target
X = df.drop(columns=["readmitted_30_days"])
y = df["readmitted_30_days"]

# One-hot encoding
X_encoded = pd.get_dummies(X, drop_first=True)

# Standardize numeric columns
numeric_cols = ["age", "cholesterol", "bmi", "medication_count", "length_of_stay", "systolic_bp", "diastolic_bp"]
scaler = StandardScaler()
X_encoded[numeric_cols] = scaler.fit_transform(X_encoded[numeric_cols])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Handle imbalance using SMOTE
print("Before SMOTE:", y_train.value_counts().to_dict())
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
print("After SMOTE:", y_train.value_counts().to_dict())

# Train model with class weights
model = RandomForestClassifier(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Evaluate
print("Model evaluation:")
print(classification_report(y_test, model.predict(X_test)))
print("ROC AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# Save model and scaler
joblib.dump(model, "readmission_model.pkl")
joblib.dump(scaler, "scaler.pkl")
required_features = model.feature_names_in_
joblib.dump(required_features, "required_features.pkl")
print("✅ Model and scaler saved")

# === PREDICTION PHASE ===

# Sample input
age = int(input("Enter age: "))
gender = input("Enter gender (Male/Female/Other): ")
bp = input("Enter blood pressure (e.g., 130/80): ")
cholesterol = float(input("Enter cholesterol level: "))
bmi = float(input("Enter BMI: "))
diabetes = input("Diabetes? (Yes/No): ")
hypertension = input("Hypertension? (Yes/No): ")
medication_count = int(input("Number of medications: "))
length_of_stay = int(input("Length of stay (days): "))
discharge_destination = input("Discharge destination (e.g., Home, Nursing_Facility): ")

# Process new input
try:
    systolic, diastolic = map(int, bp.split("/"))
except:
    print("Invalid blood pressure format. Use format like 130/80.")
    exit()

# Assemble input into DataFrame
data = {
    "age": [age],
    "cholesterol": [cholesterol],
    "bmi": [bmi],
    "medication_count": [medication_count],
    "length_of_stay": [length_of_stay],
    "systolic_bp": [systolic],
    "diastolic_bp": [diastolic],
    "gender": [gender],
    "diabetes": [diabetes],
    "hypertension": [hypertension],
    "discharge_destination": [discharge_destination]
}
df_input = pd.DataFrame(data)

# One-hot encode
df_encoded = pd.get_dummies(df_input, columns=["gender", "diabetes", "hypertension", "discharge_destination"], drop_first=True)

# Load model, scaler, features
model = joblib.load("readmission_model.pkl")
scaler = joblib.load("scaler.pkl")
required_features = joblib.load("required_features.pkl")

# Align columns
df_encoded = df_encoded.reindex(columns=required_features, fill_value=0)

# Scale
df_encoded[numeric_cols] = scaler.transform(df_encoded[numeric_cols])

# Predict
risk_score = model.predict_proba(df_encoded)[:, model.classes_.tolist().index(1)][0]
high_risk = risk_score >= 0.5

# Output
print("\n===== RESULT =====")
print(f"Risk Score: {risk_score:.2f}")
print("High Risk: ✅ YES, follow-up needed" if high_risk else "High Risk: ❌ NO, routine care")

# Debug Info
print("\nDebug Info:")
print("- Model classes:", model.classes_)
print("- Raw prediction probabilities:", model.predict_proba(df_encoded)[0])


Before SMOTE: {0: 21095, 1: 2905}
After SMOTE: {0: 21095, 1: 21095}
Model evaluation:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      5231
           1       0.19      0.09      0.12       769

    accuracy                           0.84      6000
   macro avg       0.54      0.52      0.52      6000
weighted avg       0.79      0.84      0.81      6000

ROC AUC: 0.5633247228995691
✅ Model and scaler saved

===== RESULT =====
Risk Score: 0.11
High Risk: ❌ NO, routine care

Debug Info:
- Model classes: [0 1]
- Raw prediction probabilities: [0.89 0.11]
