# Libraries

In [1]:
# Dataset.
import pandas as pd

# Algorithms.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

# Visualization.
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Loading Dataset

In [2]:
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income'
]

df = pd.read_csv('Data/adult.data', names = columns)
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# EDA

In [None]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

## Defining Models

In [5]:
models = {
    LogisticRegression(max_iter = 10000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(n_neighbors = 5),
    SVC()
}

## Model Evaluation

In [6]:
def model_evaluation(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred) * 100
    precision = precision_score(y_test, y_pred, zero_division = 1) * 100

    print(model.__class__.__name__)
    print(f'Test Accuracy: { accuracy:.2f}%')
    print(f'Test Precision: { precision:.2f}%')

    return {
        'Model' : model,
        'Prediction' : y_pred,
        'Accuracy' : accuracy,
        'Precision' : precision
    }

## Confusion Matrix

In [7]:
def plot_confusion_matrix(cm, model_name = str()):
    labels = ['<= 50K', '> 50K']

    # Annoting values inside cells.
    annotations = list()

    for y in range(len(labels)):
        for x in range(len(labels)):
            annotations.append(dict(
                x = labels[x],
                y = labels[y],
                text = str(cm[y][x]),
                showarrow = True,
                font = dict(color = 'black')
            ))

    go.Figure(data = go.Heatmap(
        z = cm,
        x = labels,
        y = labels,
        colorscale = 'Greens',
        colorbar = dict(title = 'Count'),
        hoverongaps = True,
    )).update_layout(
        annotations = annotations,
        title = f'Confusion Matrix: { model_name.__class__.__name__ }',
        xaxis_title = 'Predicted',
        yaxis_title = 'Actual',
        width = 640,
        height = 400
    ).show()

# Numerical Features Only

In [8]:
numeric_columns = list(df.describe().columns)
numeric_columns

['age',
 'fnlwgt',
 'education_num',
 'capital_gain',
 'capital_loss',
 'hours_per_week']

In [9]:
X = df[numeric_columns]
y = df.income.apply(lambda x : 1 if x == ' >50K' else 0)
X, y

(       age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week
 0       39   77516             13          2174             0              40
 1       50   83311             13             0             0              13
 2       38  215646              9             0             0              40
 3       53  234721              7             0             0              40
 4       28  338409             13             0             0              40
 ...    ...     ...            ...           ...           ...             ...
 32556   27  257302             12             0             0              38
 32557   40  154374              9             0             0              40
 32558   58  151910              9             0             0              40
 32559   22  201490              9             0             0              20
 32560   52  287927              9         15024             0              40
 
 [32561 rows x 6 columns],
 0        0
 1        0

## Train/Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## Standarizing Numeric Data

In [11]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.fit_transform(X_test)

In [12]:
numeric_models = list()

for model in models:
    try:
        numeric_models.append(model_evaluation(model, X_train, X_test, y_train, y_test))
        print()
    except ValueError:
        pass

SVC
Test Accuracy: 79.34%
Test Precision: 97.51%

DecisionTreeClassifier
Test Accuracy: 76.74%
Test Precision: 51.85%

KNeighborsClassifier
Test Accuracy: 77.18%
Test Precision: 54.95%

LogisticRegression
Test Accuracy: 81.35%
Test Precision: 70.09%

RandomForestClassifier
Test Accuracy: 80.41%
Test Precision: 61.38%



In [13]:
for model in numeric_models:
    plot_confusion_matrix(confusion_matrix(y_test, model['Prediction']), model['Model'])

# PCA + Feature Dimensionality Reduction

In [14]:
categorical_columns = [column for column in df.columns if column not in numeric_columns]
categorical_columns

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'income']

## One-Hot Encoding

In [15]:
preprocessor = ColumnTransformer(transformers = [
    ('num', StandardScaler(), numeric_columns),
    ('cat', OneHotEncoder(drop = 'first'), categorical_columns)
])

In [16]:
def pipeline(model, n = 10):
    return Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = n)),
        ('classifier', model)
    ])

In [17]:
def model_evaluation(pipeline, X_train, X_test, y_train, y_test):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred) * 100
    precision = precision_score(y_test, y_pred) * 100

    print(pipeline.__class__.__name__)
    print(f'Accuracy: { accuracy:.2f}%')
    print(f'Precision: { precision:.2f}%')

    return {
        'Model' : pipeline,
        'Prediction' : y_pred,
        'Accuracy' : accuracy,
        'Precision' : precision
    }

In [18]:
fe_models = list()

for model in models:
    try:
        fe_models.append(model_evaluation(model, X_train, X_test, y_train, y_test))
        print()
    except ValueError:
        pass

SVC
Accuracy: 79.34%
Precision: 97.51%

DecisionTreeClassifier
Accuracy: 76.70%
Precision: 51.76%

KNeighborsClassifier
Accuracy: 77.18%
Precision: 54.95%

LogisticRegression
Accuracy: 81.35%
Precision: 70.09%

RandomForestClassifier
Accuracy: 80.45%
Precision: 61.59%



In [19]:
for model in fe_models:
    plot_confusion_matrix(confusion_matrix(y_test, model['Prediction']), model['Model'])

# Comparison

In [20]:
def compare_models(models, accuracies_1, precisions_1, accuracies_2, precisions_2):

    dataset = pd.DataFrame({
        'Model' : models,
        'Accuracy1' : accuracies_1,
        'Precision1' : precisions_1,
        'Accuracy2' : accuracies_2,
        'Precision2' : precisions_2
    })

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x = dataset['Model'],
        y = dataset['Accuracy1'],
        name = 'Numeric Models Accuracy',
        text = dataset['Accuracy1'],
        textposition = 'auto',
        marker_color = 'green'
    ))

    fig.add_trace(go.Bar(
        x = dataset['Model'],
        y = dataset['Precision1'],
        name = 'Numeric Models Precision',
        text = dataset['Precision1'],
        textposition = 'auto',
        marker_color = 'blue'
    ))

    fig.add_trace(go.Bar(
        x = dataset['Model'],
        y = dataset['Accuracy2'],
        name = 'Feature Reduction Accuracy',
        text = dataset['Accuracy2'],
        textposition = 'auto',
        marker_color = 'lightgreen'
    ))

    fig.add_trace(go.Bar(
        x = dataset['Model'],
        y = dataset['Precision2'],
        name = 'Feature Reduction Precision',
        text = dataset['Precision2'],
        textposition = 'auto',
        marker_color = 'lightblue'
    ))

    fig.update_layout(
        title = 'Model Comparison',
        xaxis_title = 'Models',
        yaxis_title = 'Score',
        yaxis = dict(range = [35, 100]),
        barmode = 'group',
        template = 'plotly_dark',
        width = 1280,
        height = 800
    )

    fig.show()

In [21]:
compare_models([model['Model'].__class__.__name__ for model in fe_models],
               [model['Accuracy'] for model in numeric_models],
               [model['Precision'] for model in numeric_models],
               [model['Accuracy'] for model in fe_models],
               [model['Precision'] for model in fe_models])

# Conclusion

##### Only a few models - `Decision Tree` and `Random Forest` have any noticible difference yet is not anymore than 0.20%.
##### This concludes that even after applying the dimensionality reduction, it is not feasible to achieve any significant difference in the accuracies.