In [None]:
import pandas as pd

# Load the dataset
file_path = 'diabetes_data.csv'
data = pd.read_csv(file_path)

# Inspect the dataset to get a sense of its structure
data.head(), data.info(), data.describe(), data.shape

In [None]:
# Calculate missing values and their percentages
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100

# Create a summary DataFrame for missing values
missing_summary = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': missing_percentage
}).sort_values(by='Percentage (%)', ascending=False)


missing_summary

In [None]:
# Re-clean the dataset
numerical_columns = ['age', 'BMI', 'hypertension', 'diabetes_pedigree_function', 'weight', 'sleep_duration', 'family_diabetes' , 'pregnancies']
categorical_columns = ['gender', 'diet_type', 'social_media_usage', 'stress_level', 'physical_activity_level', 'alcohol_consumption']
target_column = 'diabetes'

# Impute missing values based on column types and importance

# 1. Drop columns with high missing percentages or less importance
columns_to_drop = ['star_sign']  # Subject to review based on importance
data_cleaned = data.drop(columns=columns_to_drop)

# 2. Impute numerical columns with mean
numerical_columns = ['age', 'BMI','hypertension','diabetes_pedigree_function', 'weight','family_diabetes', 'sleep_duration', 'pregnancies']
for col in numerical_columns:
    data_cleaned[col] = data_cleaned[col].fillna(data_cleaned[col].mean())

# 3. Impute categorical columns with mode or "Unknown"
categorical_columns = ['gender', 'social_media_usage','diet_type', 'stress_level', 'physical_activity_level', 'alcohol_consumption']
for col in categorical_columns:
   data_cleaned[col] =  data_cleaned[col].fillna(data_cleaned[col].mode()[0])

# Impute missing values in categorical columns with "Unknown"
categorical_columns = ['gender', 'social_media_usage','diet_type', 'stress_level', 'physical_activity_level', 'alcohol_consumption']
for col in categorical_columns:
    data_cleaned[col].fillna("Unknown", inplace=True)

# 4. Drop rows with missing values in the target column (diabetes)
data_cleaned = data_cleaned[data_cleaned['diabetes'].notna()]

# Confirm changes
missing_cleaned_summary = data_cleaned.isnull().sum()
missing_cleaned_summary

# Shape of cleaned data and total missing values count
data_cleaned.shape, missing_cleaned_summary.sum()  

#file can be used here as well
# data_cleaned.to_csv('cleaned_diabetes_data.csv')

In [None]:

# Detect and remove outliers using IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]

# Outlier analysis and removal for significant numerical variables
numerical_vars = ['age', 'BMI', 'diabetes_pedigree_function', 'weight', 'sleep_duration', 'pregnancies']
outliers_summary = {}

for column in numerical_vars:
    # Detect outliers using IQR
    outliers = detect_outliers_iqr(data, column)
    outliers_summary[column] = len(outliers)
    # Remove outliers from the dataset
    data_cleaned = data_cleaned[~data_cleaned.index.isin(outliers.index)]

# Create a summary of outliers removed
outliers_df = pd.DataFrame(list(outliers_summary.items()), columns=['Variable', 'Outliers Removed'])
print(outliers_df)
data_cleaned.to_csv('cleaned_diabetes_data.csv')


                     Variable  Outliers Removed
0                         age                 0
1                         BMI               523
2  diabetes_pedigree_function                 0
3                      weight                 0
4              sleep_duration              1936
5                 pregnancies                 0


In [None]:
import numpy as np
from scipy.stats import zscore
import matplotlib.pyplot as plt
# Visualize the distributions of cleaned numerical data
numerical_vars = ['age', 'BMI', 'diabetes_pedigree_function', 'weight', 'sleep_duration', 'pregnancies']

# Plot histograms for each numerical variable
for column in numerical_vars:
    plt.figure(figsize=(8, 5))
    plt.hist(data[column], bins=30, edgecolor='k', alpha=0.7)
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


In [None]:
# Compute correlations between numerical variables and the target (diabetes)
correlation_matrix = data[numerical_vars + ['diabetes']].corr()

# Extract correlations with the target variable (diabetes)
diabetes_correlation = correlation_matrix['diabetes'].sort_values(ascending=False)

# Visualize the correlations using a bar plot
plt.figure(figsize=(10, 6))
diabetes_correlation[:-1].plot(kind='bar', color='skyblue', edgecolor='k')
plt.title("Correlation of Variables with Diabetes")
plt.xlabel("Variables")
plt.ylabel("Correlation Coefficient")
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

diabetes_correlation


In [None]:
# Compute the correlation matrix for all numerical variables
correlation_matrix = data[numerical_vars + ['diabetes']].corr()

# Create a heatmap to visualize correlations
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix, 
    annot=True, 
    fmt=".2f", 
    cmap="coolwarm", 
    square=True, 
    cbar=True
)
plt.title("Correlation Heatmap of Numerical Variables")
plt.tight_layout()
plt.show()


In [32]:
from sklearn.impute import SimpleImputer

# Impute missing values for numerical and categorical columns
numerical_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Apply imputers to respective columns
X[numerical_features] = numerical_imputer.fit_transform(X[numerical_features])
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])

# Retry train-test split after imputing missing values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Retry preprocessing and Logistic Regression pipeline
logreg_pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = logreg_pipeline.predict(X_test)
y_pred_proba = logreg_pipeline.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
classification_rep = classification_report(y_test, y_pred)

# Display results
results = {
    'Accuracy': accuracy,
    'ROC AUC': roc_auc,
    'Classification Report': classification_rep
}
results


NameError: name 'X' is not defined

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values for numerical and categorical columns
numerical_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Apply imputers to respective columns
X[numerical_features] = numerical_imputer.fit_transform(X[numerical_features])
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])

# Retry train-test split after imputing missing values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Retry preprocessing and Logistic Regression pipeline
logreg_pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = logreg_pipeline.predict(X_test)
y_pred_proba = logreg_pipeline.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
classification_rep = classification_report(y_test, y_pred)

# Display results
results = {
    'Accuracy': accuracy,
    'ROC AUC': roc_auc,
    'Classification Report': classification_rep
}
results


NameError: name 'X' is not defined