In [1]:
# Load the autoreload extension and reload all imported modules before each
# cell runs, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd

# Load the health-assessment data from a CSV in the current working directory.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column that looks
# like a row index saved with the file — confirm, and consider index_col=0.
df = pd.read_csv('./health.csv')

# Display the first 5 rows to verify the data was loaded correctly
df.head()

Unnamed: 0,patientid,healthassessmentid,assessment_date,n_assessment,assessment_number,final_assessment,assessment_year,assessment_type,assessment_quarter,assessment_month,...,imd_idaopi_score,glucose,tot_cholesterol,alcohol_consumption,regular_exercise,t2dm_nice,prediabetic,first_to_last,days_between_ha,years_between_ha
0,0x0001EC929E1AB41684A7B5528628E4B7,0x97B5EB4313702E21828DD59E5BC4F085,13oct2017,1,1,1,2017,1,4,10,...,0.024,3.8,4.68,1,0,0,0,,,
1,0x0004022CC26B79C37471E953F521D0DC,0x786B53CCD0FF6397100F4080465628E0,01may2018,1,1,1,2018,4,2,5,...,0.11,5.3,6.2,1,1,0,0,,,
2,0x00044057F135CC6526BE752AD83115F6,0x6B04800E0E852F8C997D9AD8019B868E,13apr2018,1,1,1,2018,3,2,4,...,0.12,5.6,6.1,2,1,0,1,,,
3,0x000527A13F12637CFD34616163AFBC48,0xB0E4800F18AD1367A01658AA1A7736EF,12may2014,2,1,0,2014,4,2,5,...,0.071,5.9,4.6,0,1,0,1,,,
4,0x000527A13F12637CFD34616163AFBC48,0x6A1A149F1B99B942F3C3938A85799151,21mar2016,2,2,1,2016,4,1,3,...,0.071,5.3,4.4,0,1,0,0,2.0,679.0,2.0


In [5]:
# Number of rows in the loaded frame
df.shape[0]

117219

In [6]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Seed NumPy's global RNG so repeated runs are reproducible
np.random.seed(42)

# 1. Exploratory Data Analysis
print("Dataset shape:", df.shape)
print("\nTarget variable distribution:")
print(df['prediabetic'].value_counts(normalize=True).mul(100))

# Per-column missing-value counts and their share of all rows (in percent)
missing_values = df.isna().sum()
missing_percent = (missing_values / len(df)) * 100
missing_data = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percent,
})
print("\nFeatures with >50% missing values:")
mostly_missing = missing_data.loc[missing_data['Percentage'] > 50]
print(mostly_missing.sort_values('Percentage', ascending=False).head())

# Bar chart of how many rows fall into each target class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='prediabetic', data=df, ax=ax)
ax.set_title('Distribution of Prediabetic Status')
ax.set_xlabel('Prediabetic (0=No, 1=Yes)')
ax.set_ylabel('Count')
plt.show()

# 2. Data Preprocessing
# Remove features with too many missing values (>50%) and non-predictive columns
cols_to_drop = missing_data[missing_data['Percentage'] > 50].index.tolist()
# Identifiers and the raw date string are not usable as features. 'Unnamed: 0'
# appears to be the row index saved with the CSV; leaving it in would feed the
# models an arbitrary row number. Only columns actually present are dropped.
non_predictive_cols = ['Unnamed: 0', 'patientid', 'healthassessmentid', 'assessment_date']
cols_to_drop += [c for c in non_predictive_cols if c in df.columns and c not in cols_to_drop]
X = df.drop(columns=cols_to_drop + ['prediabetic'])
y = df['prediabetic']
# NOTE(review): features such as 'glucose' / 't2dm_nice' may be what the
# 'prediabetic' label is derived from — confirm they do not leak the target.

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
# 'number' covers every numeric dtype (int32, float32, ...), not just int64/float64,
# so no numeric column silently falls into the dropped remainder.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# (added in 1.2, old name removed in 1.4); try the new name first and fall
# back so the notebook runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', onehot_encoder)
        ]), categorical_cols)
    ])

# Split the data (stratified so both sets keep the target's class proportions)
# NOTE(review): the same patientid can occur on several rows, so a row-level
# split may place one patient in both train and test — consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply preprocessing: statistics are learned on the training split only
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Check for class imbalance and apply SMOTE if needed.
# Comparing the largest and smallest class is equivalent to the two-sided 0/1
# ratio test for a binary target, without hard-coding the label values.
class_counts = y_train.value_counts()
if class_counts.max() / class_counts.min() > 3:
    print("\nApplying SMOTE to handle class imbalance...")
    smote = SMOTE(random_state=42)
    # Only the training data is resampled; the test set keeps its real distribution
    X_train_preprocessed, y_train = smote.fit_resample(X_train_preprocessed, y_train)
    print("Class distribution after SMOTE:", np.bincount(y_train))
# 3. Model Building and Evaluation
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, print its test-set metrics and plot its confusion matrix.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; ``predict_proba`` is used
        when present.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels every metric is computed on.
    model_name : str
        Label used in the printed report and the plot titles.

    Returns
    -------
    tuple
        ``(model, accuracy, precision, recall, f1, auc_roc)``. ``auc_roc`` is
        ``None`` when the model has no ``predict_proba``.

    Notes
    -----
    For models with ``feature_importances_`` this calls ``feature_importance``
    with the notebook-level ``X`` and ``preprocessor`` variables, not with the
    arguments of this function.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions; the positive-class probability is column 1
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Calculate AUC-ROC if possible
    auc_roc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    # Compare against None explicitly: a truthiness test would also hide a
    # legitimately computed AUC of 0.0.
    if auc_roc is not None:
        print(f"AUC-ROC: {auc_roc:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # Feature importance (if available)
    if hasattr(model, 'feature_importances_'):
        feature_importance(model, X, preprocessor, model_name)

    return model, accuracy, precision, recall, f1, auc_roc

def _transformed_feature_names(preprocessor):
    """Return the output column names of a fitted ColumnTransformer, in order."""
    names = []
    for _name, transformer, cols in preprocessor.transformers_:
        # A transformer recorded as 'drop' (e.g. the default remainder) and an
        # empty column selection (left unfitted) contribute no output columns.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if len(cols) == 0:
            continue

        # Pipelines that end in a fitted one-hot encoder expand each input
        # column into one output column per learned category.
        steps = getattr(transformer, 'named_steps', None)
        ohe = steps.get('onehot') if steps is not None else None
        categories = getattr(ohe, 'categories_', None)
        if categories is not None:
            for col, col_categories in zip(cols, categories):
                names.extend(f"{col}_{c}" for c in col_categories)
        else:
            names.extend(cols)
    return names


def feature_importance(model, X, preprocessor, model_name):
    """Plot the 20 largest feature importances of a fitted model.

    Parameters
    ----------
    model : estimator
        Fitted model; nothing happens unless it has ``feature_importances_``.
    X
        Unused; kept so existing callers keep working.
    preprocessor
        Fitted ColumnTransformer whose ``transformers_`` are used to rebuild
        the names of the transformed columns.
    model_name : str
        Label used in the plot title and in the diagnostic message.

    Returns
    -------
    None
        The plot is shown as a side effect. If the number of rebuilt names
        does not match the number of importances, a message is printed and
        no plot is drawn.
    """
    if not hasattr(model, 'feature_importances_'):
        return

    feature_names = _transformed_feature_names(preprocessor)
    importances = model.feature_importances_

    # Labelling importances with misaligned names would be misleading, so skip
    # the plot — but say so instead of failing silently.
    if len(importances) != len(feature_names):
        print(
            f"Skipping feature-importance plot for {model_name}: "
            f"{len(importances)} importances vs {len(feature_names)} feature names."
        )
        return

    # Create DataFrame of features and importances
    feature_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_imp = feature_imp.sort_values('Importance', ascending=False)

    # Plot top 20 features
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_imp.head(20))
    plt.title(f'Top 20 Feature Importances - {model_name}')
    plt.tight_layout()
    plt.show()

# Candidate classifiers, keyed by the display name used in reports and plots
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'Random Forest': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42),
}

# Fit and score every candidate; each entry holds
# (model, accuracy, precision, recall, f1, auc_roc)
results = {}
for name, model in models.items():
    results[name] = evaluate_model(
        model, X_train_preprocessed, y_train, X_test_preprocessed, y_test, name
    )

# Rank by AUC-ROC, counting a missing AUC as 0; ties go to the model evaluated first
auc_by_model = {
    candidate: (outcome[5] if outcome[5] is not None else 0)
    for candidate, outcome in results.items()
}
best_model_name = max(auc_by_model, key=auc_by_model.get)
best_model = results[best_model_name][0]
print(f"\nBest performing model: {best_model_name}")

# 4. ROC Curve Comparison
# One curve per model that can output probabilities, all on a shared axes
fig, ax = plt.subplots(figsize=(10, 8))
for name, result in results.items():
    model = result[0]
    if not hasattr(model, "predict_proba"):
        continue
    y_pred_proba = model.predict_proba(X_test_preprocessed)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})')

# Diagonal reference line for a classifier that guesses at random
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
plt.show()

# 5. Save best model results
# results[...] is (model, accuracy, precision, recall, f1, auc_roc); pair every
# entry after the model itself with its metric name
metric_names = ('accuracy', 'precision', 'recall', 'f1', 'auc_roc')
best_metrics = {'model_name': best_model_name}
best_metrics.update(zip(metric_names, results[best_model_name][1:]))

print("\nBest Model Performance Summary:")
for metric, value in best_metrics.items():
    # The name is not numeric, and a metric that could not be computed is None
    if metric == 'model_name' or value is None:
        continue
    print(f"{metric}: {value:.4f}")

ModuleNotFoundError: No module named 'seaborn'