## Import libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, plot_confusion_matrix

## Import the data

For this class, we'll be exploring the UCI Adult dataset. The goal is to predict whether someone makes more than $50,000 a year or not. It is a classification problem.

The labels (`<=50K`/`>50K`) are in the `income` column. All the other columns can be used as input data. We have wide-ranging information on age, race, sex, education, occupation, etc.

In [77]:
col_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income'
]

data = pd.read_table('adult.data', sep = ',', index_col = False, names = col_names)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Drop the sentitive columns

(Run this entire notebook first, and record the accuracy when we're using the entire dataset. Then go back to this cell and write code to drop a few columns. Run the notebook again and compare the accuracy scores.)

In [78]:
# ---------------------------
# Choose 3 or more columns you think we should drop
# ---------------------------
columns_to_drop = [
    'marital-status',
    'fnlwgt', ## final weight
    'capital-loss'
]

# ---------------------------
# Write your code here to drop the columns you chose
# ---------------------------

for column in columns_to_drop:
    del data[column]

In [79]:
# Function to turn categorical data into binary columns
# More about OneHotEncoder here: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
def encode_column(dataframe, col_name):
    encoder = OneHotEncoder()
    encoded_column = encoder.fit_transform(dataframe[col_name].values.reshape(-1, 1)).toarray().T
    for index, category in enumerate(encoder.categories_[0]):
        dataframe[category] = np.array(encoded_column[index])

# These are the columns containing categorical data that we need to encode
columns_to_encode = [
    'workclass',
    'education',
    'marital-status',
    'fnlwgt',
    'capital-loss',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country'
]

# There's no need to encode the columns we've dropped (they're gone!), so let's remove those from the list
for col_name in columns_to_drop:
    columns_to_encode.remove(col_name)

# Encode the remaining columns
for col_name in columns_to_encode:
    encode_column(data, col_name)
    data = data.drop(col_name, axis = 1)

In [80]:
# Split the dataframe into the inputs (X) and labels (y)
X = data.drop('income', axis = 1)
y = data['income'].apply(lambda x: 1 if x == ' >50K' else 0)

# Split them further into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [81]:
# Train a simple logistic regression model and print out the accuracy on the test set
clf = LogisticRegression()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)

# Print out the accuracy score
print('The accuracy score is:', accuracy_score(y_test, preds), '\n')

# Draw a confusion matrix
y_actu = pd.Series(y_test, name='Actual')
y_pred = pd.Series(preds, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
df_confusion = df_confusion / df_confusion.sum(axis=1)
print(df_confusion)

The accuracy score is: 0.8353976865595251 

Predicted         0         1
Actual                       
0          0.806294  0.610335
1          0.261968  0.174581


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
