In [None]:
# 3. Visualize important relationships in the data
#
# Draws a 3x3 dashboard of the diabetes dataset: class balance (pie),
# diagnosis counts by gender / age group / family history (bars),
# BMI / fasting glucose / HbA1c / physical activity by diagnosis (box plots),
# and a correlation heatmap of selected numeric features.
# Side effect: adds an 'age_group' column to df.
print("\n" + "="*50)
print("Step 3: Data Visualization")
print("="*50)

# Create a figure with multiple subplots
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
fig.suptitle('Diabetes Dataset - Important Relationships', fontsize=16, fontweight='bold')

# Shared class styling. Every panel below presents the classes in ascending
# order of the target value (0 = no diabetes, 1 = diabetes).
class_labels = ['No Diabetes', 'Diabetes']
class_colors = ['lightgreen', 'lightcoral']

# 1. Distribution of the target variable
# value_counts() orders by frequency, not by class value, so the hard-coded
# labels/colours would be swapped whenever class 1 is the majority.
# Sorting by index pins the order to 0, 1 (same order crosstab uses below).
class_counts = df['diagnosed_diabetes'].value_counts().sort_index()
axes[0, 0].pie(class_counts,
               labels=class_labels,
               autopct='%1.1f%%',
               colors=class_colors)
axes[0, 0].set_title('Distribution of Diabetes Diagnosis')

# Age groups for panel 3. Bins are left-closed (right=False) with an
# open-ended top bin so the labels mean what they say:
# '<30' = [0, 30), '30-50' = [30, 50), '50-70' = [50, 70), '70+' = [70, inf).
# With the default right-closed bins, age 30 landed in '<30' and ages above
# 100 (or exactly 0) became NaN and silently dropped out of the chart.
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 50, 70, float('inf')],
                         labels=['<30', '30-50', '50-70', '70+'], right=False)

# Diagnosis counts per category; crosstab columns come out sorted (0, 1)
gender_diabetes = pd.crosstab(df['gender'], df['diagnosed_diabetes'])
age_diabetes = pd.crosstab(df['age_group'], df['diagnosed_diabetes'])
family_history = pd.crosstab(df['family_history_diabetes'], df['diagnosed_diabetes'])

# 2, 3, 9. Grouped bar charts of diagnosis counts
count_panels = [
    (axes[0, 1], gender_diabetes, 'Diabetes Diagnosis by Gender', 'Gender'),
    (axes[0, 2], age_diabetes, 'Diabetes Diagnosis by Age Group', 'Age Group'),
    (axes[2, 2], family_history, 'Diabetes Diagnosis by Family History',
     'Family History of Diabetes (0=No, 1=Yes)'),
]
for panel_ax, counts, panel_title, panel_xlabel in count_panels:
    counts.plot(kind='bar', ax=panel_ax, color=class_colors)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Count')
    panel_ax.legend(class_labels)

# 4, 5, 6, 8. Box plots of a numeric feature split by diabetes status
box_panels = [
    (axes[1, 0], 'bmi', 'BMI Distribution by Diabetes Status', 'BMI'),
    (axes[1, 1], 'glucose_fasting', 'Fasting Glucose by Diabetes Status', 'Fasting Glucose'),
    (axes[1, 2], 'hba1c', 'HbA1c Levels by Diabetes Status', 'HbA1c'),
    (axes[2, 1], 'physical_activity_minutes_per_week',
     'Physical Activity by Diabetes Status', 'Physical Activity (min/week)'),
]
for panel_ax, feature, panel_title, panel_ylabel in box_panels:
    sns.boxplot(x='diagnosed_diabetes', y=feature, data=df, ax=panel_ax, palette='Set2')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Diabetes Diagnosis (0=No, 1=Yes)')
    panel_ax.set_ylabel(panel_ylabel)

# 7. Correlation heatmap for key features (plus the target itself)
key_features = ['age', 'bmi', 'glucose_fasting', 'glucose_postprandial',
                'hba1c', 'cholesterol_total', 'triglycerides', 'diabetes_risk_score']
correlation_matrix = df[key_features + ['diagnosed_diabetes']].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, ax=axes[2, 0], cbar_kws={'shrink': 0.8})
axes[2, 0].set_title('Correlation Heatmap of Key Features')

plt.tight_layout()
plt.show()

# Additional visualizations
print("\nAdditional visualizations:")

# Scatter plot: BMI vs fasting glucose, coloured by diabetes status
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(x='bmi', y='glucose_fasting', hue='diagnosed_diabetes',
                             data=df, palette='viridis', alpha=0.7)
plt.title('BMI vs Fasting Glucose Colored by Diabetes Status')
plt.xlabel('BMI')
plt.ylabel('Fasting Glucose')

# Relabel the legend seaborn already built instead of calling
# plt.legend(labels=[...]). A labels-only legend() call pairs the labels with
# whatever artists sit on the axes, in order, so 'No' would be attached to the
# main scatter collection rather than to the hue entries (wrong colours, or a
# missing 'Yes' entry).
# Assumes the legend entries are the hue levels 0 and 1 in ascending order.
hue_legend = scatter_ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Diabetes')
    for legend_text, status in zip(hue_legend.get_texts(), ['No', 'Yes']):
        legend_text.set_text(status)
plt.show()

# Rank features with a Random Forest fitted only for this chart
# (fit() returns the estimator itself, so construction and fitting are chained).
rf_for_importance = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(X_train_scaled, y_train)

# Importance scores and the column names they are paired with by position.
# NOTE(review): assumes X_train_scaled keeps the column order of X - confirm.
feature_names = X.columns
importances = rf_for_importance.feature_importances_

# argsort is ascending, so the last 15 positions are the 15 largest scores,
# ordered so the most important feature ends up as the top bar.
indices = np.argsort(importances)[-15:]  # Top 15 features
bar_positions = range(len(indices))
bar_labels = [feature_names[i] for i in indices]

# Horizontal bar chart of the selected importances
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(bar_positions, importances[indices], color='steelblue')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(bar_labels)
importance_ax.set_xlabel('Relative Importance')
importance_ax.set_title('Top 15 Feature Importances for Diabetes Prediction')
importance_fig.tight_layout()
plt.show()