In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# Adjust the path to include the Helpers directory
sys.path.append('../Helpers')

# Import custom functions from data_helpers.py
from data_helpers import load_config, load_data

# Load configuration settings from a JSON file
config = load_config('../config/config.json')  # Adjust path as needed

# Ensure the config was loaded successfully
if not config:
    raise Exception("Failed to load configuration.")

# Load the dataset based on the path specified in the configuration
data_path = config['data_path']
df = load_data(data_path)

# Ensure the data was loaded successfully
if diabetes_data is None:
    raise Exception("Failed to load the data.")

# Exploratory Data Analysis (EDA) insights (summarized for brevity)
# - The 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', and 'BMI' features have zero values that are biologically 
#- implausible and likely missing data.
# - The 'Age' and 'Pregnancies' features could provide interaction terms that are relevant for diabetes risk prediction.

# Handling Missing Values: Replace zeros with NaN for biologically implausible values
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df[col].replace(0, np.nan, inplace=True)

# Advanced imputation strategy: KNNImputer to account for multivariate aspects
imputer = KNNImputer(n_neighbors=5)

# Feature Creation based on domain knowledge and EDA
df['BMI_Age_Interaction'] = df['BMI'] * df['Age']  # Interaction term example

# Splitting the dataset into features and target variable
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling and Polynomial Features to capture non-linear relationships
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)

# Pipeline for preprocessing and model training
pipeline = Pipeline(steps=[('imputer', imputer),
                           ('scaler', scaler),
                           ('poly', poly),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Training the model
pipeline.fit(X_train, y_train)

# Model Evaluation
predictions = pipeline.predict(X_test)
probabilities = pipeline.predict_proba(X_test)[:, 1]  # For ROC AUC score

# Classification report
print("Classification Report:\n", classification_report(y_test, predictions))

# Confusion Matrix
cm = confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# ROC AUC Score
roc_auc = roc_auc_score(y_test, probabilities)
print("ROC AUC Score:", roc_auc)



2024-02-27 13:10:20,315 - INFO - Configuration loaded successfully.
2024-02-27 13:10:20,347 - INFO - Data loaded successfully from ..//data//raw//diabetes.csv


NameError: name 'df' is not defined