

# Load Data


In [None]:
import pandas as pd

# Read the CSV into the working DataFrame and preview its first rows.
csv_path = '/content/adult.csv'
df = pd.read_csv(csv_path)
display(df.head())

# Explore Data
Visualize the key findings from the initial data exploration of the dataset at "/content/adult.csv".

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" under the report.
df.info()

# Summary statistics for every column, numeric and non-numeric alike.
display(df.describe(include='all'))

In [None]:
# Group the column names by dtype and print each group.
# 'number' matches every numeric dtype instead of only int64, so float or
# narrower-integer columns are not silently left out of the numeric list.
numerical_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include=['object']).columns

print("Numerical variables suitable for visualization:")
for col in numerical_cols:
    print(f"- {col}")

print("\nCategorical variables suitable for visualization:")
for col in categorical_cols:
    print(f"- {col}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the age column with a KDE curve, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['age'], kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

# Columns to chart; each one gets its own horizontal count plot whose bars
# follow the order returned by value_counts() for that column.
categorical_cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex']

for col in categorical_cols:
    level_order = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, y=col, order=level_order, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel('Frequency')
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Keep only rows whose native country is among the ten most frequent values,
# then plot the counts in that same frequency order.
top_countries = df['native.country'].value_counts().nlargest(10).index
top_country_rows = df[df['native.country'].isin(top_countries)]

fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=top_country_rows, y='native.country', order=top_countries, ax=ax)
ax.set_title('Distribution of Top 10 Native Countries')
ax.set_xlabel('Frequency')
ax.set_ylabel('Native Country')
plt.show()

## Summary

### Data Analysis Key Findings

*   The dataset exhibits an imbalanced income distribution, with more individuals earning $\le$50K than >50K.
*   Higher education levels and being in a 'Married-civ-spouse' marital status are strongly associated with a higher likelihood of earning >50K.
*   Certain workclasses ('Self-emp-inc', 'Federal-gov') and occupations ('Exec-managerial', 'Prof-specialty') show a higher proportion of individuals earning >50K.
*   There is a notable gender income disparity, with a significantly higher proportion of males earning >50K compared to females.
*   Working more hours per week generally correlates with a higher chance of earning >50K.
*   Non-zero capital gains are strongly associated with having an income >50K.



In [None]:
# Number of rows for each distinct value of the income column.
income_counts = df['income'].value_counts()
display(income_counts)

### Model 3 Preprocessing: Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Reload the CSV, turn "?" entries into NaN, and drop every row that has at
# least one missing value.
df = pd.read_csv("/content/adult.csv").replace("?", np.nan).dropna()

# The income column is the target; all remaining columns are features.
X = df.drop(columns="income")
y = df["income"]

# Feature names grouped by dtype, used to route columns to a transformer.
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# Numeric columns are standardized.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Object columns are one-hot encoded; handle_unknown="ignore" keeps transform
# from raising on categories that were not present when the encoder was fit.
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Apply each transformer to its own group of columns.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Preprocessing completed.")
for size_label, features in (("Train size:", X_train), ("Test size:", X_test)):
    print(size_label, features.shape)

### Model 3 Implementation: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Full model: the preprocessor followed by a logistic-regression classifier.
# n_jobs is deliberately not passed: a two-class lbfgs fit is a single
# optimization problem, so there is nothing for extra workers to parallelize.
log_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=500, solver="lbfgs")),
])

log_reg_pipeline.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_logreg = log_reg_pipeline.predict(X_test)

# predict_proba columns follow the order of classes_; look ">50K" up by label
# so the positive-class probability does not depend on a hard-coded position
# (a missing label raises ValueError instead of silently using the wrong column).
positive_col = list(log_reg_pipeline.classes_).index(">50K")
y_prob_logreg = log_reg_pipeline.predict_proba(X_test)[:, positive_col]  # probability of >50K

print("Logistic Regression model trained and predictions generated.")

### Model 3 Evaluation: Logistic Regression

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc
)
import matplotlib.pyplot as plt

# Score the held-out predictions; ">50K" is the positive class for the
# precision, recall and F1 calculations.
accuracy = accuracy_score(y_test, y_pred_logreg)
precision, recall, f1 = (
    metric(y_test, y_pred_logreg, pos_label=">50K")
    for metric in (precision_score, recall_score, f1_score)
)

print("=== Logistic Regression Performance ===")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")

# Fix the label order so rows/columns are always [<=50K, >50K].
class_order = ["<=50K", ">50K"]
cm = confusion_matrix(y_test, y_pred_logreg, labels=class_order)
print("\nConfusion Matrix (rows=true, cols=pred):")
print(cm)

# Per-class precision / recall / F1 table.
report_text = classification_report(y_test, y_pred_logreg)
print("\nClassification Report:")
print(report_text)

# Encode the string labels as 0/1 so ">50K" is the positive class for the ROC.
label_to_int = {"<=50K": 0, ">50K": 1}
y_test_binary = y_test.map(label_to_int)

fpr, tpr, thresholds = roc_curve(y_test_binary, y_prob_logreg)
roc_auc = auc(fpr, tpr)

# ROC curve with the diagonal drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, label=f"LogReg AUC = {roc_auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Logistic Regression")
ax.legend()
ax.grid(True)
plt.show()