In [1]:
import os
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xlearn as xl

from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
seed = 42
np.random.seed(seed)

pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',None)

In [2]:
data = pd.read_csv("data/adult.data")
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','target']
# data.drop(columns=['fnlwgt'], inplace=True)
print(data.shape)
data.head()

(32560, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [3]:
data['target'].value_counts()/data.shape[0]

 <=50K    0.759183
 >50K     0.240817
Name: target, dtype: float64

### PREPROCESSING

Some preprocessing

### DROPPING THE NA VALUES

In [4]:
numerical_features = ['age','education-num','capital-gain','capital-loss','hours-per-week', 'fnlwgt']
categorical_features = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
print(len(numerical_features), len(categorical_features))

for f in categorical_features:
    data[f] = data[f].str.strip()

target_encoder = {data.target.unique()[0]:0,data.target.unique()[1]:1}
data['target'] = data['target'].map(target_encoder)

print(data.shape)
data.head()

6 8
(32560, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [5]:
x, y = train_test_split(data, test_size=0.2, random_state=seed, stratify=data.target.values)
x, z = train_test_split(x, test_size=0.1, random_state=seed, stratify=x.target.values)

train_indices = x.index.values
val_indices = z.index.values
test_indices = y.index.values
print("length of train data",len(x))
print("length of val data",len(z))
print("length of test data",len(y))

length of train data 23443
length of val data 2605
length of test data 6512


In [6]:
# dropping the target from the data
target = data['target']
data.drop(columns=['target'], inplace=True)

one_hot_encoded_data = pd.get_dummies(data,columns=categorical_features)
print(one_hot_encoded_data.shape)
one_hot_encoded_data.head()

(32560, 108)


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native-country_?,native-country_Cambodia,native-country_Canada,native-country_China,native-country_Columbia,native-country_Cuba,native-country_Dominican-Republic,native-country_Ecuador,native-country_El-Salvador,native-country_England,native-country_France,native-country_Germany,native-country_Greece,native-country_Guatemala,native-country_Haiti,native-country_Holand-Netherlands,native-country_Honduras,native-country_Hong,native-country_Hungary,native-country_India,native-country_Iran,native-country_Ireland,native-country_Italy,native-country_Jamaica,native-country_Japan,native-country_Laos,native-country_Mexico,native-country_Nicaragua,native-country_Outlying-US(Guam-USVI-etc),native-country_Peru,native-country_Philippines,native-country_Poland,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,50,83311,13,0,0,13,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,38,215646,9,0,0,40,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,53,234721,7,0,0,40,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,28,338409,13,0,0,40,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,37,284582,14,0,0,40,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


## POLY2 

In [7]:
T1 = datetime.datetime.now()
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

X = pf.fit_transform(one_hot_encoded_data.values)[:,one_hot_encoded_data.shape[1]:]

X_train, y_train = X[train_indices], target.iloc[train_indices].values
X_val, y_val = X[val_indices], target.iloc[val_indices].values
X_test, y_test = X[test_indices], target.iloc[test_indices].values

# standard scaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

print("Training Shape",X_train.shape, y_train.shape)
print("Validation Shape",X_val.shape, y_val.shape)
print("Test Shape",X_test.shape, y_test.shape)

lr = LogisticRegression(penalty='l2', C=0.5, solver='saga')
print("Model Fitting")
lr.fit(X_train, y_train)

poly2_val_preds = lr.predict_proba(X_val)[:,1]
poly2_test_preds = lr.predict_proba(X_test)[:,1]

T2 = datetime.datetime.now()

print("-----Poly2-----")
print("log loss on validation data",log_loss(y_val, poly2_val_preds))
print("log loss on test data",log_loss(y_test, poly2_test_preds))
print("F1 score on validation data",f1_score(y_val, (poly2_val_preds>0.5).astype(int)))
print("F1 score on test data",f1_score(y_test, (poly2_test_preds>0.5).astype(int)))
print("Seconds =",round((T2-T1).total_seconds()))

Training Shape (23443, 5778) (23443,)
Validation Shape (2605, 5778) (2605,)
Test Shape (6512, 5778) (6512,)
Model Fitting
-----Poly2-----
log loss on validation data 0.35391623323978527
log loss on test data 0.33072202879757556
F1 score on validation data 0.662240663900415
F1 score on test data 0.6816806722689075
Seconds = 225


