<a href="https://colab.research.google.com/github/smruthyunjaya05/MLE/blob/main/Healthcare_Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard library
import warnings

# Third-party: data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session so plots and
# tables are not interleaved with library warning text.
warnings.filterwarnings('ignore')



In [None]:
# Load the raw diabetes dataset into `df` for exploration.
# A relative path is used (same as the other cells that read this file) so the
# notebook is not tied to Colab's /content directory; the CSV is expected to sit
# in the kernel's working directory.
df = pd.read_csv("Healthcare-Diabetes.csv")


In [None]:
# --- Structural overview of the raw frame ---
print("Shape:", df.shape)
print("Missing values:", df.isnull().sum())

# Drop the row identifier BEFORE counting duplicates. While 'Id' is present,
# two rows with identical measurements still differ in 'Id', so duplicated()
# would report 0 whenever 'Id' is unique per row (presumably it is -- confirm).
if 'Id' in df.columns:
    df.drop(columns='Id', inplace=True)

print("Duplicate rows:", df.duplicated().sum())

display(df.head())
display(df.describe().T)

# --- Univariate distributions: one histogram per column ---
df.hist(figsize=(15, 10), bins=20)
plt.show()

# --- Pairwise correlation between all columns ---
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.show()

# --- Pregnancy count split by outcome class ---
sns.countplot(x='Pregnancies', data=df, hue='Outcome')
plt.show()

# (x, y) column pairs to inspect as scatter plots, coloured by 'Outcome'.
pairs = [
    ("Age", "BloodPressure"),
    ("Age", "Glucose"),
    ("Age", "SkinThickness"),
    ("BloodPressure", "BMI"),
    ("Insulin", "Glucose"),
    ("Pregnancies", "Glucose"),
    ("Pregnancies", "Insulin"),
    ("Pregnancies", "SkinThickness"),
    ("Pregnancies", "BMI"),
    ("Pregnancies", "BloodPressure"),
    ("Pregnancies", "DiabetesPedigreeFunction"),
    ("DiabetesPedigreeFunction", "Insulin"),
    ("DiabetesPedigreeFunction", "SkinThickness"),
    ("DiabetesPedigreeFunction", "BMI"),
    ("DiabetesPedigreeFunction", "BloodPressure"),
    ("DiabetesPedigreeFunction", "Glucose"),
    ("DiabetesPedigreeFunction", "Age")
]

# Scatter plots: one figure per pair, shown immediately so figures do not stack.
for x, y in pairs:
    sns.scatterplot(data=df, x=x, y=y, hue='Outcome')
    plt.show()


In [None]:
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler

# Reload from disk so this cell does not depend on what earlier cells did to `df`.
df = pd.read_csv("Healthcare-Diabetes.csv")
df.drop(columns='Id', errors='ignore', inplace=True)

# Zeros in these columns are treated as missing readings and imputed.
# The median is taken AFTER the zeros become NaN (median() skips NaN), so the
# placeholder zeros do not pull the fill value down.
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
observed = df[zero_cols].replace(0, np.nan)
df[zero_cols] = observed.fillna(observed.median())

# Bucket age into decades and one-hot encode (the first bucket is the dropped
# baseline). The last edge is open-ended: with a closed edge at 80, pd.cut
# returns NaN for any older age, which get_dummies encodes as all zeros --
# indistinguishable from the '21-30' baseline. Labels are kept so the dummy
# column names do not change; '71-80' therefore also absorbs ages above 80.
# NOTE(review): ages <= 20 still fall outside the bins -- confirm none occur.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[20, 30, 40, 50, 60, 70, np.inf],
    labels=['21-30', '31-40', '41-50', '51-60', '61-70', '71-80'],
)
df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)

# Interaction feature: product of BMI and glucose (computed on imputed values).
df['BMI_Glucose'] = df['BMI'] * df['Glucose']

# Standardise the continuous columns; the dummy columns and 'Outcome' are left as is.
scale_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'BMI_Glucose']
df[scale_cols] = StandardScaler().fit_transform(df[scale_cols])

df.head()


In [None]:
!pip install -q xgboost lightgbm

import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

df = pd.read_csv("Healthcare-Diabetes.csv")
df.drop(columns='Id', inplace=True)
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_cols] = df[zero_cols].replace(0, np.nan).fillna(df[zero_cols].median())

X = df.drop('Outcome', axis=1)
y = df['Outcome']
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Outlier removal using IQR
Q1, Q3 = X.quantile(0.25), X.quantile(0.75)
IQR = Q3 - Q1
mask = ~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
X, y = X[mask], y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

models = {
    "LogReg": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    results[name] = accuracy_score(y_test, model.predict(X_test))

for model, acc in results.items():
    print(f"{model}: {acc:.4f}")
