In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used when loading the
# CSV later in this notebook lives under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [16]:
import pandas as pd

# Absolute location of the Adult Income CSV on the mounted Google Drive.
path = "/content/drive/MyDrive/CSSM502/Assignment 6/Dataset/adult income.csv"

# Parse the file with pandas' default options, so the "?" placeholders used by
# the dataset remain ordinary strings at this stage.
df = pd.read_csv(filepath_or_buffer=path)

# Show the first five rows as the cell output.
df.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [18]:
import pandas as pd

# Overall size of the raw frame as (rows, columns).
print("Shape:", df.shape)

# Class balance of the target, first as absolute counts and then as shares.
# dropna=False gives missing labels (if any) their own row in the tally.
income_col = df["income"]
for heading, as_ratio in (
    ("\nIncome distribution:", False),
    ("\nIncome distribution (ratio):", True),
):
    print(heading)
    print(income_col.value_counts(normalize=as_ratio, dropna=False))

# Count cells holding a literal "?" in each column and list only the columns
# that contain at least one, most affected first.
print("\nHow many '?' per column:")
q_counts = df.eq("?").sum().sort_values(ascending=False)
print(q_counts.loc[q_counts.gt(0)])

print("\nColumns:", list(df.columns))


Shape: (48842, 15)

Income distribution:
income
<=50K    37155
>50K     11687
Name: count, dtype: int64

Income distribution (ratio):
income
<=50K    0.760718
>50K     0.239282
Name: proportion, dtype: float64

How many '?' per column:
occupation        2809
workclass         2799
native-country     857
dtype: int64

Columns: ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']


In [20]:
# Raw labels of the target column and the binary codes they are mapped to.
income_codes = {"<=50K": 0, ">50K": 1}

# Turn every literal "?" into a missing value, then keep only the rows that
# are complete in every column.
rows_complete = df.replace("?", pd.NA).dropna()

# Encode the target. Series.map yields NaN for any label that is not a key of
# the mapping, so check for that explicitly instead of letting NaN targets
# flow silently into the train/test split and the models.
income_encoded = rows_complete["income"].map(income_codes)
unmapped = income_encoded.isna()
if unmapped.any():
    unexpected = rows_complete.loc[unmapped, "income"].unique().tolist()
    raise ValueError(f"Unexpected income labels: {unexpected}")

# assign() builds a new frame rather than writing a column into the result of
# dropna(); the target keeps its original position among the columns.
df_clean = rows_complete.assign(income=income_encoded.astype("int64"))

print("New shape:", df_clean.shape)
print("\nIncome distribution after cleaning:")
print(df_clean["income"].value_counts(normalize=True))


New shape: (45222, 15)

Income distribution after cleaning:
income
0    0.752156
1    0.247844
Name: proportion, dtype: float64


In [21]:
# Target vector and feature matrix (every column except the target).
y = df_clean["income"]
X = df_clean.drop(columns="income")

# Partition the feature columns for preprocessing. "number" matches every
# numeric dtype (not only int64/float64), and all remaining columns are
# treated as categorical, so each feature lands in exactly one list. With
# hard-coded dtype names, a column of any other dtype would appear in neither
# list and a ColumnTransformer built from these lists would drop it by default.
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Numerical columns:", numerical_cols)
print("\nCategorical columns:", categorical_cols)



Numerical columns: ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

Categorical columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing (only constructed here, not fitted): standardise
# the numeric features and one-hot encode the categorical ones. With
# handle_unknown="ignore", categories unseen during fit do not raise at
# transform time.
scale_numeric = ("num", StandardScaler(), numerical_cols)
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (36177, 14)
Test shape: (9045, 14)


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Baseline model: the preprocessing step followed by logistic regression,
# with the solver's iteration cap set to 1000.
log_reg_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=1000)),
])

# Fit on the training split, then predict the held-out test split.
y_pred_lr = log_reg_pipeline.fit(X_train, y_train).predict(X_test)

# Test-set accuracy and F1 (f1_score is called with its default settings).
acc_lr, f1_lr = accuracy_score(y_test, y_pred_lr), f1_score(y_test, y_pred_lr)

print("Logistic Regression Results")
print("Accuracy:", acc_lr)
print("F1-score:", f1_lr)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_lr))



Logistic Regression Results
Accuracy: 0.845992260917634
F1-score: 0.658159509202454

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      6803
           1       0.73      0.60      0.66      2242

    accuracy                           0.85      9045
   macro avg       0.80      0.76      0.78      9045
weighted avg       0.84      0.85      0.84      9045



In [24]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree behind the same preprocessing step. No growth limits are
# passed, so the tree uses the library defaults; the seed keeps it repeatable.
tree_model = DecisionTreeClassifier(random_state=42)
dt_pipeline = Pipeline([("preprocess", preprocessor), ("model", tree_model)])

# Fit on the training split, then predict the held-out test split.
y_pred_dt = dt_pipeline.fit(X_train, y_train).predict(X_test)

# Test-set accuracy and F1 (f1_score is called with its default settings).
acc_dt, f1_dt = accuracy_score(y_test, y_pred_dt), f1_score(y_test, y_pred_dt)

print("Decision Tree Results")
print("Accuracy:", acc_dt)
print("F1-score:", f1_dt)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_dt))


Decision Tree Results
Accuracy: 0.8059701492537313
F1-score: 0.6172300981461287

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      6803
           1       0.60      0.63      0.62      2242

    accuracy                           0.81      9045
   macro avg       0.74      0.75      0.74      9045
weighted avg       0.81      0.81      0.81      9045



In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Random forest behind the same preprocessing step; n_jobs=-1 asks for all
# available cores and the seed keeps the forest repeatable.
forest = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_pipeline = Pipeline([("preprocess", preprocessor), ("model", forest)])

# Deliberately small grid: 1 x 2 x 2 = 4 candidates. The "model__" prefix
# routes each parameter to the pipeline step named "model".
param_grid_fast = dict(
    model__n_estimators=[200],
    model__max_depth=[None, 20],
    model__min_samples_split=[2, 5],
)

# 3-fold cross-validated search on the training split only, ranked by F1.
grid_search_fast = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_fast,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_fast.fit(X_train, y_train)

print("Best parameters:", grid_search_fast.best_params_)
print("Best CV F1-score:", grid_search_fast.best_score_)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best CV F1-score: 0.6851136697444214


In [31]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Take the winning pipeline from the grid search and score it on the held-out
# test split, which the search itself was not fitted on.
best_rf = grid_search_fast.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Test-set accuracy and F1 (f1_score is called with its default settings).
acc_rf, f1_rf = accuracy_score(y_test, y_pred_rf), f1_score(y_test, y_pred_rf)

print("Random Forest (Tuned) Results")
print("Accuracy:", acc_rf)
print("F1-score:", f1_rf)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_rf))


Random Forest (Tuned) Results
Accuracy: 0.8551686014372581
F1-score: 0.6817298347910593

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.93      0.91      6803
           1       0.75      0.63      0.68      2242

    accuracy                           0.86      9045
   macro avg       0.82      0.78      0.79      9045
weighted avg       0.85      0.86      0.85      9045



In [32]:
import pandas as pd

# One row per model: (name, test accuracy, test F1), in the order the models
# were trained above.
model_scores = [
    ("Logistic Regression", acc_lr, f1_lr),
    ("Decision Tree", acc_dt, f1_dt),
    ("Random Forest (Tuned)", acc_rf, f1_rf),
]
results = pd.DataFrame(model_scores, columns=["Model", "Accuracy", "F1-score"])

# Display the comparison table as the cell output.
results


Unnamed: 0,Model,Accuracy,F1-score
0,Logistic Regression,0.845992,0.65816
1,Decision Tree,0.80597,0.61723
2,Random Forest (Tuned),0.855169,0.68173
