In [2]:
! pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od
# Fetch the Adult Census Income dataset from Kaggle (prompts for Kaggle credentials).
ADULT_DATASET_URL = "https://www.kaggle.com/datasets/uciml/adult-census-income"
od.download(ADULT_DATASET_URL)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

In [1]:
 #--- 1. Setup, Imports, and Data Loading ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [None]:
import pandas as pd

# Column names applied to the CSV. Because `header=0` is passed together with `names`,
# the header row shipped in the file is discarded and replaced by these names.
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# Location of the CSV after the Kaggle download.
# NOTE(review): absolute Colab-style path — adjust when running outside /content.
DATA_PATH = '/content/adult-census-income/adult.csv'

data_raw = pd.read_csv(
    DATA_PATH,
    header=0,
    names=COLUMNS,
    skipinitialspace=True,
    # The Adult dataset marks unknown values with "?" (see the dataset description).
    # Parsing them as NaN lets the imputation step fill them in instead of treating
    # "?" as a real category.
    na_values='?',
)

# Keep the raw frame untouched and derive the de-duplicated frame from it, so re-running
# this cell always starts from the same input. `.copy()` yields an independent frame, so
# adding columns to `data` later does not trigger pandas' SettingWithCopyWarning.
data = data_raw.drop_duplicates().copy()
print(f"Data loaded successfully. Total records after cleaning: {len(data)}")
print("Initial Data Head:")
print(data.head())

In [None]:
# --- Feature and Target Separation ---
# Encode the income label as integers and keep the encoded column on `data`.
le = LabelEncoder()
data['income_encoded'] = le.fit_transform(data['income'])

# Target vector and feature matrix (both label columns are removed from the features).
y = data['income_encoded']
X = data.drop(columns=['income', 'income_encoded'])

# Split the feature names by dtype: numeric columns vs. object (string) columns.
NUMERICAL_FEATURES = list(X.select_dtypes(include=np.number).columns)
CATEGORICAL_FEATURES = list(X.select_dtypes(include='object').columns)



In [None]:
# Hold out 20% of the rows for testing; `stratify=y` keeps the class ratio of `y`
# in both splits and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for _split_name, _split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"{_split_name} set size: {_split_frame.shape[0]} samples")

In [None]:
# --- Manual Preprocessing (Impute, Scale, and Encode) ---
# Every transformer below is fitted on the training split only and then applied to both
# splits, so no statistic computed from the test split ends up in a fitted transformer.

print("\n--- Step 3: Manual Preprocessing (Imputing, Scaling & Encoding) ---")

# A. Imputation: column mean for numerical features, most frequent value for categorical ones.
num_imputer = SimpleImputer(strategy='mean').fit(X_train[NUMERICAL_FEATURES])
cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[CATEGORICAL_FEATURES])

# The imputed values are written back into X_train / X_test themselves.
for _split in (X_train, X_test):
    _split[NUMERICAL_FEATURES] = num_imputer.transform(_split[NUMERICAL_FEATURES])
    _split[CATEGORICAL_FEATURES] = cat_imputer.transform(_split[CATEGORICAL_FEATURES])

# B. One-hot encoding of the categorical features. Categories that were not seen during
# fitting are ignored at transform time (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(X_train[CATEGORICAL_FEATURES])
_ohe_columns = ohe.get_feature_names_out(CATEGORICAL_FEATURES)

X_train_cat_encoded = pd.DataFrame(
    ohe.transform(X_train[CATEGORICAL_FEATURES]),
    columns=_ohe_columns,
    index=X_train.index,
)
X_test_cat_encoded = pd.DataFrame(
    ohe.transform(X_test[CATEGORICAL_FEATURES]),
    columns=_ohe_columns,
    index=X_test.index,
)

# C. Standardisation of the numerical features.
scaler = StandardScaler().fit(X_train[NUMERICAL_FEATURES])

X_train_num_scaled = pd.DataFrame(
    scaler.transform(X_train[NUMERICAL_FEATURES]),
    columns=NUMERICAL_FEATURES,
    index=X_train.index,
)
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test[NUMERICAL_FEATURES]),
    columns=NUMERICAL_FEATURES,
    index=X_test.index,
)

# D. Final design matrices: scaled numerical columns followed by the one-hot columns.
# The original row index is kept on every piece so the column-wise concat aligns rows.
X_train_final = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
X_test_final = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis=1)

print(f"Final training feature shape: {X_train_final.shape}")
print(f"Final testing feature shape: {X_test_final.shape}")

In [None]:
# --- 4. Model Training and Comparison ---

# Fitted estimators keyed by their display name; filled by train_and_evaluate_model.
trained_models = {}
# One metrics dict per evaluated model; the model cells append to this list.
all_metrics = []


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, register it in ``trained_models`` and report test-set metrics.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for ROC AUC when present.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for every metric.
    model_name : str
        Display name; also the key under which the fitted model is stored in
        ``trained_models`` and the value of ``'model'`` in the returned dict.

    Returns
    -------
    dict
        ``{'model', 'accuracy', 'roc_auc', 'f1_50k_plus'}`` where ``roc_auc`` is NaN
        when ``y_test`` contains a single class.
    """
    print(f"\n--- Training {model_name} ---")
    model.fit(X_train, y_train)
    trained_models[model_name] = model

    y_pred = model.predict(X_test)

    # ROC AUC ranks samples by a continuous score. Prefer class-1 probabilities, then the
    # decision function (e.g. SVMs fitted without probability estimates); hard labels are
    # only the last resort because they collapse the ROC curve to a single point.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    # roc_auc_score is undefined when only one class is present in y_test.
    if len(np.unique(y_test)) > 1:
        roc_auc = roc_auc_score(y_test, y_score)
    else:
        roc_auc = np.nan

    print(f"\n--- {model_name} Evaluation ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # The dict form feeds the returned F1 score; the text form is what gets printed.
    class_names = ['<=50K', '>50K']
    print("\nClassification Report (Focus on Precision/Recall for >50K):")
    report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)
    print(classification_report(y_test, y_pred, target_names=class_names))

    return {
        'model': model_name,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'f1_50k_plus': report['>50K']['f1-score'],
    }

In [None]:
# Model 1: Logistic Regression
# class_weight='balanced' reweights the classes inversely to their frequency in y_train.
lr_model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
metrics_lr = train_and_evaluate_model(
    model=lr_model,
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
    model_name="Logistic Regression",
)
all_metrics.append(metrics_lr)
print("Logistic Regression Metrics:", metrics_lr, sep="\n")

In [None]:
# Model 2: Decision Tree Classifier
# Depth is capped at 10; class_weight='balanced' reweights the classes as for model 1.
dt_model = DecisionTreeClassifier(
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
metrics_dt = train_and_evaluate_model(
    model=dt_model,
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
    model_name="Decision Tree Classifier",
)
all_metrics.append(metrics_dt)
print("Decision Tree Classifier Metrics:", metrics_dt, sep="\n")

In [None]:
# Final Comparison and Best Model Selection
print("\n" + "="*50)
print("FINAL MODEL PERFORMANCE COMPARISON TABLE")
print("="*50)

if not all_metrics:
    raise RuntimeError("No models have been evaluated yet - run the model training cells first.")

# Re-running a model cell appends a second entry for the same model to `all_metrics`;
# keep only the most recent entry per model so the table has one row per model.
comparison_df = (
    pd.DataFrame(all_metrics)
    .drop_duplicates(subset='model', keep='last')
    .set_index('model')
    .sort_values(by='roc_auc', ascending=False)
)
print(comparison_df)

# After the descending sort the first index label is the model with the highest ROC AUC
# (rows whose ROC AUC is NaN are placed last by sort_values).
best_model_name = comparison_df.index[0]
best_model = trained_models[best_model_name]

print(f"\n--> The BEST performing model (based on ROC AUC) is: {best_model_name}")

In [None]:
# --- 5. Custom Prediction (Answering the Problem Statement) ---

# The preprocessing was done step by step rather than in a single pipeline object, so a
# new raw record has to be passed through the same fitted imputers, scaler and one-hot
# encoder before the model can score it.

def predict_new_income_manual(model, new_data):
    """Print the predicted income class for a single new record.

    The record is pushed through the fitted preprocessing objects of this notebook
    (``num_imputer``, ``cat_imputer``, ``scaler``, ``ohe``) and then scored by ``model``.

    Parameters
    ----------
    model : fitted classifier trained on the columns of ``X_train_final``.
    new_data : dict
        Maps raw feature names (the columns of ``X``) to values. Keys that are absent
        become missing values and are filled in by the fitted imputers.

    Returns
    -------
    None
        The result is printed.
    """
    # 1. One-row frame with the training column order; absent keys become NaN.
    new_df = pd.DataFrame([new_data], columns=X.columns)

    # Ensure categorical columns are of 'object' dtype to prevent TypeError with isnan check in OneHotEncoder
    for col in CATEGORICAL_FEATURES:
        new_df[col] = new_df[col].astype('object')

    # 2. Impute with the statistics learned on the training split. For a complete record
    #    this leaves the values unchanged; for an incomplete one it keeps NaN away from
    #    the scaler and the model.
    new_num = pd.DataFrame(
        num_imputer.transform(new_df[NUMERICAL_FEATURES]),
        columns=NUMERICAL_FEATURES,
    )
    new_cat = pd.DataFrame(
        cat_imputer.transform(new_df[CATEGORICAL_FEATURES]),
        columns=CATEGORICAL_FEATURES,
    )

    # 3. Apply the fitted scaler / encoder.
    new_num_scaled = pd.DataFrame(scaler.transform(new_num), columns=NUMERICAL_FEATURES)
    new_cat_encoded = pd.DataFrame(ohe.transform(new_cat), columns=ohe.get_feature_names_out(CATEGORICAL_FEATURES))

    # 4. Same column layout as X_train_final: scaled numerical columns, then one-hot columns.
    X_new_final = pd.concat([new_num_scaled, new_cat_encoded], axis=1)

    # 5. Predict.
    prediction_class = model.predict(X_new_final)[0]

    # 6. Interpret the result.
    income_map = {0: '<=50K', 1: '>50K'}
    predicted_income = income_map[prediction_class]

    # Label the output with the model that was actually passed in (looked up by identity
    # in the registry) instead of assuming it is the globally selected best model.
    model_label = next(
        (name for name, fitted in trained_models.items() if fitted is model),
        type(model).__name__,
    )

    print("\n" + "="*50)
    print(f"PREDICTION RESULT (Using {model_label}):")
    print("="*50)
    print(f"Predicted Income Class: {predicted_income}")
    # Mirrors train_and_evaluate_model: not every classifier exposes predict_proba.
    if hasattr(model, 'predict_proba'):
        prediction_proba = model.predict_proba(X_new_final)[0][1]
        print(f"Probability of earning >$50K: {prediction_proba:.2f}")

# Example input for a new, single individual. Every raw feature column is supplied.
sample_person = {
    'age': 45,
    'workclass': 'Private',
    'fnlwgt': 140359,
    # 'education' and 'education-num' describe the same attribute; in the Adult dataset
    # education-num 14 corresponds to 'Masters' (13 is 'Bachelors'), so keep them in sync.
    'education': 'Masters',
    'education-num': 14,
    'marital-status': 'Married-civ-spouse',
    'occupation': 'Exec-managerial',
    'relationship': 'Husband',
    'race': 'White',
    'sex': 'Male',
    'capital-gain': 5000,
    'capital-loss': 0,
    'hours-per-week': 50,
    'native-country': 'United-States',
}

# Run the prediction function with the best model
predict_new_income_manual(best_model, sample_person)

In [None]:
# --- 6. Visualization and Reporting ---
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

print("\n" + "="*50)
print("--- Step 6: Visualization and Reporting ---")
print("="*50)

# 1. ROC AUC comparison: one bar per model.
fig_auc, ax_auc = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=comparison_df.index,
    y=comparison_df['roc_auc'],
    hue=comparison_df.index,  # hue mirrors x so the palette is applied per bar
    palette='viridis',
    legend=False,             # the legend would only repeat the x tick labels
    ax=ax_auc,
)
ax_auc.set_title('Model Performance Comparison (ROC AUC Score)')
ax_auc.set_ylabel('ROC AUC Score')
ax_auc.set_xlabel('Model')
ax_auc.set_ylim(0.5, 1.0)
plt.show()


# 2. Confusion matrix of the best model on the test split.
# The predictions are computed here; no earlier cell keeps them.
best_y_pred = best_model.predict(X_test_final)
cm = confusion_matrix(y_test, best_y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['Actual <=50K', 'Actual >50K'],
    columns=['Predicted <=50K', 'Predicted >50K'],
)

fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_df,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    linecolor='black',
    linewidths=0.5,
    ax=ax_cm,
)
ax_cm.set_title(f'Confusion Matrix for {best_model_name}')
ax_cm.set_ylabel('Actual Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()