In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [None]:
# Load the patient table from the CSV that sits next to the notebook.
data = pd.read_csv("patient_data.csv")

# Preview the first rows.
print(data.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()


In [None]:
# Report how many values are missing in each column before anything is changed.
print(data.isnull().sum())

# Rows without a target label cannot be used for supervised training, and
# mean-imputing the label would invent values that are not real classes.
# Drop those rows instead of filling them.
data = data.dropna(subset=['disease']).reset_index(drop=True)

# Fill the remaining gaps in numeric columns with the column mean.
# NOTE(review): the means are computed over all rows, including those that
# later land in the test split; move imputation after the split if that matters.
data = data.fillna(data.mean(numeric_only=True))

# Encode the categorical features as integers. LabelEncoder numbers the labels
# in sorted order (so 'No' -> 0 and 'Yes' -> 1 if those are the stored values).
# Each column gets its own encoder, kept in `label_encoders`, so the mapping of
# every column can still be inspected or inverted later.
label_encoders = {}
for col in ['smoking', 'alcohol_intake', 'family_history']:
    # The numeric mean fill above does not touch text columns; fill their gaps
    # with the most frequent label so NaN is not encoded as a class of its own.
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mode().iloc[0])
    label_enc = LabelEncoder()
    data[col] = label_enc.fit_transform(data[col])
    label_encoders[col] = label_enc


In [None]:
# Separate the predictors from the target column.
features = data.drop('disease', axis=1)
y = data['disease']

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Fitting the scaler on the full table leaks test-set
# statistics into training and makes the reported metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply that
# same transformation to both partitions.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=features.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=features.columns,
    index=X_test_raw.index,
)

# Still defined for any later cell that refers to them: the full feature table
# transformed with the scaler that was fitted on the training rows.
scaled_features = scaler.transform(features)
X = pd.DataFrame(scaled_features, columns=features.columns)


In [None]:
# Fit five classifiers on the training split and store each model's
# predictions for the test split. The tree-based models draw random numbers
# while fitting, so they are seeded with the same value as the train/test
# split; without a seed their scores change on every run.

# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Decision Tree (seeded for reproducible results)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Random Forest (seeded: bootstrap samples and feature subsets are random)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)


In [None]:
# Map each model's display name to its test-set predictions
# (note: the values are prediction arrays, not the fitted estimators).
models = dict(zip(
    ("Logistic Regression", "Decision Tree", "Random Forest", "SVM", "KNN"),
    (y_pred_log, y_pred_dt, y_pred_rf, y_pred_svm, y_pred_knn),
))

# Label/scorer pairs in the order they are printed; the labels are padded so
# the colons line up in the printed report.
metric_rows = (
    ("Accuracy ", accuracy_score),
    ("Precision", precision_score),
    ("Recall   ", recall_score),
    ("F1 Score ", f1_score),
)

# Print one block of scores per model, each ending with a divider line.
for name, y_pred in models.items():
    print(f"\n{name} Evaluation:")
    for label, metric in metric_rows:
        print(f"{label}: {metric(y_test, y_pred):.2f}")
    print("-" * 40)


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Draw one confusion-matrix figure per model on an explicitly created axes,
# titled with the model's name.
for name, y_pred in models.items():
    print(f"\n{name} - Confusion Matrix")
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    plt.show()


In [None]:
import joblib

# Persist the trained random forest to disk. The notebook never defines
# `best_rf`, so the original call raised NameError on a fresh run; `rf_model`
# is the random forest fitted in the training cell.
# NOTE(review): the model was trained on standardized features, so whoever
# loads this file will presumably need the fitted `scaler` as well — confirm
# whether it should be saved alongside.
joblib.dump(rf_model, 'disease_prediction_model.pkl')
