## Exploratory Data Analysis (EDA) – Gradient Boosting Classifier Model

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's 'whitegrid' style to the plots drawn in the rest of the notebook
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline


In [None]:
# Read the raw CSV into a DataFrame and preview its first rows
csv_path = "data/student_depression_dataset.csv"  # adjust path if needed
df = pd.read_csv(csv_path)
df.head()


In [None]:
# Structural overview: dimensions, column dtypes, missing values, duplicate rows
print("Shape:", df.shape)
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nDuplicates:", df.duplicated().sum())


In [None]:
# Drop every row that contains at least one null value
df = df.dropna()
# Clean column names: trim surrounding whitespace, replace spaces with
# underscores, and remove question marks.
# regex=False forces literal matching: "?" is a regex quantifier, and the
# default of str.replace's `regex` argument differs between pandas releases.
df.columns = (
    df.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace("?", "", regex=False)
)
df.head()


In [None]:
# Bar chart of how many rows fall into each value of the target column
target_ax = sns.countplot(x='Depression', data=df)
target_ax.set_title('Depression Status Distribution')
plt.show()


In [None]:
# Descriptive statistics for every column, whatever its dtype
summary = df.describe(include='all')
print(summary)

# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical
object_frame = df.select_dtypes(include='object')
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
categorical_cols = list(object_frame.columns)
numerical_cols = list(numeric_frame.columns)

print("\nCategorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


In [None]:
# One histogram with a KDE overlay per numerical column, each in its own figure
for feature in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[feature], kde=True, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()


In [None]:
# One count plot per categorical column, bars ordered from most to least frequent
for feature in categorical_cols:
    by_frequency = df[feature].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=feature, order=by_frequency, palette='pastel', ax=ax)
    ax.set_title(f'Category Distribution: {feature}')
    # Tilt the category labels so long names do not overlap
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


In [None]:
# Boxplots of each numerical column, grouped by the target column
target_col = 'Depression'
for col in numerical_cols:
    # If the target is itself numeric it appears in numerical_cols; a box plot
    # of a column grouped by its own values is degenerate, so skip it.
    if col == target_col:
        continue
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x=target_col, y=col, palette='coolwarm')
    plt.title(f'{col} vs {target_col}')
    plt.show()


In [None]:
# Per categorical column: category counts split by the 'Depression' value,
# with categories ordered from most to least frequent
for feature in categorical_cols:
    by_frequency = df[feature].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(
        data=df,
        x=feature,
        hue='Depression',
        order=by_frequency,
        palette='muted',
        ax=ax,
    )
    ax.set_title(f'{feature} vs Depression')
    # Tilt the category labels so long names do not overlap
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


In [None]:
# Annotated heatmap of pairwise correlations between the int64/float64 columns
numeric_only = df.select_dtypes(include=['int64', 'float64'])
corr = numeric_only.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
