In [None]:
# ============================================
# SECTION 3.1 – DATASET DESCRIPTION AND ANALYSIS
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide defaults: show up to 100 columns and use the whitegrid plot theme
pd.options.display.max_columns = 100
plt.style.use('seaborn-v0_8-whitegrid')

# Read the CSV from the current working directory
csv_path = "AdultIncome.csv"
df = pd.read_csv(csv_path)

print("Dataset shape:", df.shape)
df.head()

In [None]:
# Schema overview: dtypes and non-null counts as loaded
print("\n--- Data Info ---")
df.info()

# Treat a literal '?' anywhere in the frame as a missing value
df = df.replace(to_replace='?', value=np.nan)

# Per-column null counts, keeping only columns that have at least one null,
# largest first; the first ten are shown as the cell output
print("\n--- Missing Values ---")
null_counts = df.isna().sum()
missing = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
missing.head(10)

In [None]:
# Normalise headers: trim surrounding whitespace and turn dots into underscores
df.columns = df.columns.map(lambda name: name.strip().replace(".", "_"))

# Show the cleaned names in chunks of five per printed line
cols = list(df.columns)
for start in range(0, len(cols), 5):
    print(cols[start:start + 5])

In [None]:
# Bar chart of row counts per income class
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='income', hue='income', palette='cool', legend=False, ax=ax)
ax.set_title("Class Distribution: <=50K vs >50K")
ax.set_xlabel("Income Category")
ax.set_ylabel("Count")
plt.show()

# Same information as class proportions
print(df['income'].value_counts(normalize=True))

In [None]:
# Names of the columns pandas currently stores with a numeric dtype
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
print("Numeric features:", numeric_cols)

# Summary statistics, one row per numeric feature
df[numeric_cols].describe().transpose()

In [None]:
# One boxplot per (feature, title, y-axis label), each split by income class
boxplot_specs = [
    ('age', "Age Distribution by Income Group", "Age"),
    ('hours_per_week', "Weekly Working Hours by Income Group", "Hours per Week"),
]

for feature, heading, y_label in boxplot_specs:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x='income', y=feature, hue='income', palette='pastel', legend=False)
    plt.title(heading)
    plt.xlabel("Income Category")
    plt.ylabel(y_label)
    plt.show()

In [None]:
# ============================================
# SECTION 3.2 – DATA PRE-PROCESSING AND HANDLING IMBALANCE
# ============================================
# Tally nulls once, then report only the columns that actually contain some
null_totals = df.isnull().sum()
print("Missing values per column:")
print(null_totals[null_totals > 0])

# Discard every row that has at least one missing value (mutates df in place)
df.dropna(inplace=True)
print("\nAfter cleaning, dataset shape:", df.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns (everything pandas does not store as a number)
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("Categorical columns:", cat_cols)

# Apply label encoding with ONE encoder per column and keep them all.
# Refitting a single shared encoder would discard every mapping except the
# last one, leaving no way to translate integer codes back to the original
# categories (e.g. via label_encoders[col].classes_ / inverse_transform).
label_encoders = {}
le = LabelEncoder()  # keeps `le` bound even when cat_cols is empty (cell re-run)
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print("\nAfter encoding:")
print(df.head())

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y = df['income']
X = df.drop(columns='income')

# 80/20 split with a fixed seed, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Target class distribution (train):", y_train.value_counts(normalize=True))

In [None]:
from imblearn.over_sampling import SMOTE, SMOTENC

# Oversample the minority class on the training split only.
#
# The categorical predictors were label-encoded to integer codes earlier.
# Plain SMOTE builds synthetic rows by interpolating between neighbours, which
# would produce fractional, meaningless category codes. SMOTENC is told which
# column positions are categorical and handles them separately.
# `cat_cols` is the list of non-numeric columns captured before encoding;
# names that are not predictors in X_train (such as the target) are skipped.
categorical_idx = [
    X_train.columns.get_loc(col) for col in cat_cols if col in X_train.columns
]

if categorical_idx:
    smote = SMOTENC(categorical_features=categorical_idx, random_state=42)
else:
    # No categorical predictors known (e.g. cat_cols was recomputed after
    # encoding and is empty): fall back to the original plain SMOTE.
    smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before resampling:", y_train.value_counts())
print("After resampling:", y_train_resampled.value_counts())

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = X_train_resampled.copy()
X_test_scaled = X_test.copy()

# Label encoding has already turned every column numeric, so selecting by dtype
# here would also pick up the integer-coded categoricals. Use `numeric_cols`,
# captured before encoding, and keep only the names that are predictors.
num_cols = [col for col in numeric_cols if col in X_train.columns]

# Apply scaling only to the genuinely numeric columns.
# The scaler is fit on the (resampled) training data and reused on the test set.
X_train_scaled[num_cols] = scaler.fit_transform(X_train_resampled[num_cols])
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

print("Numeric features scaled successfully.")

In [None]:
# ============================================
# SECTION 3.3 - FEATURE ENGINEERING
# ============================================

# Correlation of every column with the target (income), ordered by strength.
# Sorting on the absolute value keeps the signed coefficients but stops strongly
# NEGATIVE correlates from falling out of the top-10 cut below.
corr = df.corr()['income'].sort_values(key=lambda s: s.abs(), ascending=False)
print("Top correlated features with income:")
print(corr.head(10))

# Visualize correlations with income; the target's own row (r = 1) is dropped
plt.figure(figsize=(7,5))
corr.head(10).drop('income').plot(kind='barh', color='lightblue')
plt.title("Top Features Correlated with Income")
plt.xlabel("Correlation Coefficient")
plt.gca().invert_yaxis()  # strongest feature at the top of the chart
plt.show()

In [None]:
# Net capital: gains minus losses
df['total_capital'] = df['capital_gain'] - df['capital_loss']

# Bucket weekly hours into right-closed intervals:
# (0, 35] Part-time, (35, 45] Full-time, (45, 60] Overtime, (60, 100] Extreme
hours_labels = ['Part-time', 'Full-time', 'Overtime', 'Extreme']
hours_group = pd.cut(
    df['hours_per_week'],
    bins=[0, 35, 45, 60, 100],
    labels=hours_labels
)

# Encode the bucket as its ordinal position (Part-time=0 ... Extreme=3).
# Label-encoding the stringified names would order them alphabetically
# (Extreme < Full-time < Overtime < Part-time) and scramble the natural ranking.
# Hours outside (0, 100] fall in no bucket and are encoded as -1.
df['working_hours_group'] = hours_group.cat.codes.astype(int)
print(df[['hours_per_week', 'working_hours_group', 'total_capital']].head())

In [None]:
from lightgbm import LGBMClassifier

# Prepare dataset for feature importance evaluation
X = df.drop('income', axis=1)
y = df['income']

# Rank features on the training partition only. Fitting on every row would let
# the rows later used for testing influence feature selection. The split uses
# the same ratio, seed and stratification as the other splits in this notebook.
importance_split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_importance_train, y_importance_train = importance_split[0], importance_split[2]

model = LGBMClassifier(random_state=42)
model.fit(X_importance_train, y_importance_train)

importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Plot top 15 features
plt.figure(figsize=(7,5))
sns.barplot(x=importances.head(15), y=importances.head(15).index, color='lightgreen')
plt.title("Top 15 Important Features (LightGBM)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Hand-picked subset of predictors used by every model below
selected_features = [
    'education_num',
    'capital_gain',
    'hours_per_week',
    'age',
    'marital_status',
    'occupation',
    'total_capital',
]

y = df['income']
X_selected = df.loc[:, selected_features]

print("Selected feature set:", list(X_selected.columns))

In [None]:
# ============================================
# SECTION 3.4 – MODEL IMPLEMENTATION
# ============================================

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Re-split using only the selected predictors: 80/20, fixed seed, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
# Baseline logistic regression; class_weight='balanced' reweights the classes
log_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Hard labels plus the positive-class probability (second column) for ROC-AUC
y_pred_log = log_model.predict(X_test)
y_proba_log = log_model.predict_proba(X_test)[:, 1]

# Label-based metrics share the same (y_true, y_pred) call shape
acc_log, prec_log, rec_log, f1_log = (
    metric(y_test, y_pred_log)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from probabilities, not from hard labels
auc_log = roc_auc_score(y_test, y_proba_log)

log_summary = (
    f"Logistic Regression → Accuracy: {acc_log:.3f}, Precision: {prec_log:.3f}, Recall: {rec_log:.3f}, "
    f"F1: {f1_log:.3f}, ROC-AUC: {auc_log:.3f}"
)
print(log_summary)

In [None]:
# Baseline LightGBM with hand-set hyperparameters (no search is run in this cell)
lgb_params = dict(n_estimators=400, learning_rate=0.05, num_leaves=31, random_state=42)
lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Hard labels plus the positive-class probability (second column) for ROC-AUC
y_pred_lgb = lgb_model.predict(X_test)
y_proba_lgb = lgb_model.predict_proba(X_test)[:, 1]

# Label-based metrics share the same (y_true, y_pred) call shape
acc_lgb, prec_lgb, rec_lgb, f1_lgb = (
    metric(y_test, y_pred_lgb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from probabilities, not from hard labels
auc_lgb = roc_auc_score(y_test, y_proba_lgb)

lgb_summary = (
    f"LightGBM → Accuracy: {acc_lgb:.3f}, Precision: {prec_lgb:.3f}, Recall: {rec_lgb:.3f}, "
    f"F1: {f1_lgb:.3f}, ROC-AUC: {auc_lgb:.3f}"
)
print(lgb_summary)

In [None]:
# One row per baseline model, one column per test-set metric
results_cls = pd.DataFrame(
    [
        ['Logistic Regression', acc_log, prec_log, rec_log, f1_log, auc_log],
        ['LightGBM Classifier', acc_lgb, prec_lgb, rec_lgb, f1_lgb, auc_lgb],
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
)
print(results_cls)

In [None]:
# ============================================
# SECTION 3.5 – MODEL TUNING AND OPTIMIZATION
# ============================================

from sklearn.model_selection import GridSearchCV

# Search space for logistic regression: regularisation strength and solver (L2 only)
param_grid_log = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['lbfgs', 'liblinear'],
    penalty=['l2'],
)

# 5-fold grid search scored by ROC-AUC, using all available cores
base_log = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
grid_log = GridSearchCV(
    base_log,
    param_grid_log,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_log.fit(X_train, y_train)

print("Best Logistic Regression parameters:", grid_log.best_params_)
print("Best cross-validated ROC-AUC:", grid_log.best_score_)

# Score the refit best estimator on the held-out test split
best_log = grid_log.best_estimator_
y_pred_log_tuned = best_log.predict(X_test)
y_proba_log_tuned = best_log.predict_proba(X_test)[:, 1]

acc_log_tuned, prec_log_tuned, rec_log_tuned, f1_log_tuned = (
    metric(y_test, y_pred_log_tuned)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc_log_tuned = roc_auc_score(y_test, y_proba_log_tuned)

tuned_log_summary = (
    f"Tuned Logistic Regression → Accuracy: {acc_log_tuned:.3f}, Precision: {prec_log_tuned:.3f}, "
    f"Recall: {rec_log_tuned:.3f}, F1: {f1_log_tuned:.3f}, ROC-AUC: {auc_log_tuned:.3f}"
)
print(tuned_log_summary)

In [None]:
# Search space for LightGBM: 3 x 2 x 3 x 3 = 54 candidate settings
param_grid_lgb = dict(
    num_leaves=[20, 31, 40],
    learning_rate=[0.05, 0.1],
    n_estimators=[200, 400, 600],
    max_depth=[6, 8, 10],
)

# 3-fold grid search scored by ROC-AUC, using all available cores
base_lgb = LGBMClassifier(random_state=42)
grid_lgb = GridSearchCV(
    base_lgb,
    param_grid_lgb,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_lgb.fit(X_train, y_train)

print("Best LightGBM parameters:", grid_lgb.best_params_)
print("Best cross-validated ROC-AUC:", grid_lgb.best_score_)

# Score the refit best estimator on the held-out test split
best_lgb_cls = grid_lgb.best_estimator_
y_pred_lgb_tuned = best_lgb_cls.predict(X_test)
y_proba_lgb_tuned = best_lgb_cls.predict_proba(X_test)[:, 1]

acc_lgb_tuned, prec_lgb_tuned, rec_lgb_tuned, f1_lgb_tuned = (
    metric(y_test, y_pred_lgb_tuned)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc_lgb_tuned = roc_auc_score(y_test, y_proba_lgb_tuned)

tuned_lgb_summary = (
    f"Tuned LightGBM → Accuracy: {acc_lgb_tuned:.3f}, Precision: {prec_lgb_tuned:.3f}, "
    f"Recall: {rec_lgb_tuned:.3f}, F1: {f1_lgb_tuned:.3f}, ROC-AUC: {auc_lgb_tuned:.3f}"
)
print(tuned_lgb_summary)

In [None]:
# One row per tuned model, one column per test-set metric
results_tuned = pd.DataFrame(
    [
        ['Logistic Regression (Tuned)', acc_log_tuned, prec_log_tuned, rec_log_tuned, f1_log_tuned, auc_log_tuned],
        ['LightGBM (Tuned)', acc_lgb_tuned, prec_lgb_tuned, rec_lgb_tuned, f1_lgb_tuned, auc_lgb_tuned],
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
)
print(results_tuned)

In [None]:
# ============================================
# SECTION 3.6 – EVALUATION AND PERFORMANCE METRICS
# ============================================

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Test-set confusion matrices for both tuned models
cm_log = confusion_matrix(y_test, y_pred_log_tuned)
cm_lgb = confusion_matrix(y_test, y_pred_lgb_tuned)

# (matrix, colour map, figure title) — drawn in this order
cm_panels = (
    (cm_log, 'Blues', "Confusion Matrix - Logistic Regression (Tuned)"),
    (cm_lgb, 'Greens', "Confusion Matrix - LightGBM (Tuned)"),
)
for matrix, colour_map, heading in cm_panels:
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap=colour_map)
    plt.title(heading)
    plt.show()

In [None]:
from sklearn.metrics import roc_curve

# ROC points for each tuned model from its positive-class probabilities
fpr_log, tpr_log, _ = roc_curve(y_test, y_proba_log_tuned)
fpr_lgb, tpr_lgb, _ = roc_curve(y_test, y_proba_lgb_tuned)

roc_series = [
    (fpr_log, tpr_log, f"Logistic Regression (AUC = {auc_log_tuned:.3f})"),
    (fpr_lgb, tpr_lgb, f"LightGBM (AUC = {auc_lgb_tuned:.3f})"),
]

plt.figure(figsize=(6, 6))
for fpr, tpr, legend_text in roc_series:
    plt.plot(fpr, tpr, label=legend_text)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall points for each tuned model from its positive-class probabilities
prec_log_curve, rec_log_curve, _ = precision_recall_curve(y_test, y_proba_log_tuned)
prec_lgb_curve, rec_lgb_curve, _ = precision_recall_curve(y_test, y_proba_lgb_tuned)

# Recall on the x-axis, precision on the y-axis
pr_series = [
    (rec_log_curve, prec_log_curve, "Logistic Regression"),
    (rec_lgb_curve, prec_lgb_curve, "LightGBM"),
]

plt.figure(figsize=(6, 6))
for recall_pts, precision_pts, legend_text in pr_series:
    plt.plot(recall_pts, precision_pts, label=legend_text)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve Comparison")
plt.legend()
plt.show()

In [None]:
# ============================================
# SECTION 3.7 – MODEL EXPLAINABILITY
# ============================================

# Coefficients of the tuned logistic regression, labelled by feature name.
# NOTE(review): the model was fit on the selected columns without any scaling,
# so coefficient magnitudes depend on each feature's units.
coef = pd.Series(best_log.coef_[0], index=X_selected.columns)
coef_sorted = coef.sort_values()  # ascending: most negative first

fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(
    x=coef_sorted.values,
    y=coef_sorted.index,
    hue=coef_sorted.index,
    palette="coolwarm",
    legend=False,
    ax=ax,
)
ax.set_title("Feature Coefficients - Logistic Regression (Tuned)")
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
plt.show()

# Same values as text
print(coef_sorted)

In [None]:
import shap

# Explain tuned LightGBM classifier
explainer = shap.TreeExplainer(best_lgb_cls)
shap_values = explainer.shap_values(X_selected)

# The return shape for a binary classifier differs between SHAP releases:
# some return a list [class 0, class 1], some a 3-D array
# (rows, features, classes), others a single 2-D (rows, features) matrix.
# Normalise to the positive-class 2-D matrix so that the plots below, and any
# later dependence plot that reuses `shap_values`, receive one consistent shape.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 1]

# Global importance
shap.summary_plot(shap_values, X_selected, plot_type="bar", max_display=10)

In [None]:
# SHAP dependence plot for education_num, drawn from the values computed above
shap.dependence_plot(
    "education_num",
    shap_values=shap_values,
    features=X_selected,
)

In [None]:
# ============================================
# SECTION 4.1 - CLASSIFICATION METRIC COMPARISON (BEFORE VS AFTER TUNING)
# ============================================

# Test-set metrics for each model before and after tuning, one row per variant
metrics_cls = pd.DataFrame(
    [
        ['Logistic Regression', acc_log, f1_log, auc_log],
        ['Logistic Regression (Tuned)', acc_log_tuned, f1_log_tuned, auc_log_tuned],
        ['LightGBM', acc_lgb, f1_lgb, auc_lgb],
        ['LightGBM (Tuned)', acc_lgb_tuned, f1_lgb_tuned, auc_lgb_tuned],
    ],
    columns=['Model', 'Accuracy', 'F1-Score', 'ROC-AUC'],
)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (metric column, bar colour, panel title) — one panel per metric, left to right
panel_specs = [
    ('Accuracy', 'lightgreen', 'Accuracy Comparison'),
    ('F1-Score', 'skyblue', 'F1-Score Comparison'),
    ('ROC-AUC', 'salmon', 'ROC-AUC Comparison'),
]
for panel, (metric_name, bar_colour, heading) in zip(axes, panel_specs):
    metrics_cls.plot(x='Model', y=metric_name, kind='bar', color=bar_colour, ax=panel)
    panel.set_title(heading)

# Only the accuracy panel uses a fixed, zoomed-in y-range
axes[0].set_ylim(0.7, 0.9)

plt.suptitle('Classification Model Performance Before and After Tuning', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# ============================================
# SECTION 4.1 - CLASSIFICATION ACTUAL VS PREDICTED (CONFUSION MATRIX)
# ============================================

# Confusion matrix of the tuned LightGBM predictions on the test split
cm = confusion_matrix(y_test, y_pred_lgb_tuned)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot(cmap='Greens')
plt.title('LightGBM Classifier – Actual vs Predicted Income Labels')
plt.show()