In [10]:
# data prep
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA, TruncatedSVD

# algorithms
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# visualizations
import matplotlib.pyplot as plt
%matplotlib inline

In [93]:
# Load the raw census files.

# The column names were prepared separately and are stored one per line in a
# text file; the file handle gets its own name so it is not confused with the
# resulting list.
with open('columns.txt','r') as names_file:
    cols = names_file.read().rstrip().split("\n")

# Uncomment to inspect the parsed column names.
# print(cols)

# Markers that pandas should parse as missing values (shared by both files).
na_markers = ['?',' ',' ?',' ?',' ? ','']

# Train and test are read with the same column names and NA markers.
train, test = (
    pd.read_csv(path, na_values=na_markers, names=cols)
    for path in ('census/census-income.data', 'census/census-income.test')
)

# Report the dimensions of each frame.
for ds in [train, test]:
    print("Rows: {} Columns: {}".format(*ds.shape))

Rows: 199523 Columns: 42
Rows: 99762 Columns: 42


In [94]:
# Fix the target in the training set: turn the raw labels into signed integers.
# The "." and " " characters are removed and any leading/trailing "+" is
# stripped, which leaves text that parses as an int (presumably the raw labels
# look like " - 50000." / " 50000+." -- confirm against the data file).
# Casting to str first keeps this cell re-runnable: on a second run the column
# already holds ints, which have no string methods to call directly.
train['target'] = (
    train['target']
    .astype(str)
    .str.replace(".", "", regex=False)
    .str.strip("+")
    .str.replace(" ", "", regex=False)
    .astype('int64')
)
# Spot-check: dtype should be int64 and negative values should be present.
train.target.sample(5)

72959   -50000
47147    50000
96629   -50000
48268   -50000
50924   -50000
Name: target, dtype: int64

In [13]:
# Check the proportions of the target variable.
# The counts column is named explicitly: pandas >= 2.0 labels the result of
# value_counts() "count", so relying on an implicit "target" column name
# raises a KeyError there.
vc = train['target'].value_counts().to_frame('target')
# Share of each class as a percentage of all training rows.
vc['%'] = np.round((vc['target'] / len(train) * 100), 2)
# Append a totals row (column sums) as a quick consistency check.
vc.loc['total'] = vc.sum()
# highly imbalanced dataset
vc

Unnamed: 0,target,%
-50000,187141.0,93.79
50000,12382.0,6.21
total,199523.0,100.0


In [86]:
# descriptive features that I think are important
def aboutData(df, niu_label=' Not in universe'):
    """Build a per-column summary table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    niu_label : str, optional
        Placeholder value counted in the ``#NotInUniverse`` column. The
        default keeps the leading space used by the original comparison.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``#NA`` (null count),
        ``#Count`` (non-null count), ``#Distinct`` (``nunique``),
        ``#NotInUniverse`` (cells equal to ``niu_label``), ``dtype`` and
        ``skew`` (NaN for columns excluded from the numeric skew).
    """
    t = pd.DataFrame(df.isnull().sum(), columns=['#NA'])
    t['#Count'] = len(df) - t['#NA']
    t['#Distinct'] = df.nunique()
    # Vectorised equality test; cells that cannot equal the label (numbers,
    # missing values) simply compare as False.
    t['#NotInUniverse'] = df.eq(niu_label).sum()
    t['dtype'] = df.dtypes
    # numeric_only=True is required on pandas >= 2.0, where skew() no longer
    # silently skips object columns and raises a TypeError instead.
    t['skew'] = df.skew(numeric_only=True)
    return t

# Optional sanity check, left disabled. To put the entire dataset through the
# function, concatenate train and test first:
# pd.concat([train,test], ignore_index=True)
# Manual per-column count of ' Not in universe' values, for cross-checking the
# '#NotInUniverse' column of the summary:
# for col in cols:
#     if train[col].dtype == 'object':
#         nius = len(train[train[col] == ' Not in universe'])
#         print("Column: {} \tNumber of NIU: {}".format(col, nius))

# Build the per-column summary for the training set and display it.
info = aboutData(train)
info

Unnamed: 0,#NA,#Count,#Distinct,#NotInUniverse,dtype,skew
age,0,199523,91,0,int64,0.37329
class of worker,0,199523,9,100245,object,
detailed industry recode,0,199523,52,0,int64,0.516688
detailed occupation recode,0,199523,47,0,int64,0.829238
education,0,199523,17,0,object,
wage per hour,0,199523,1240,0,int64,8.935097
enroll in edu inst last wk,0,199523,3,186943,object,
marital stat,0,199523,7,0,object,
major industry code,0,199523,24,0,object,
major occupation code,0,199523,15,100684,object,


In [87]:
# Missing values: show only the summary rows for columns with at least one NA.
info[info['#NA'].gt(0)]
# Observations:
# - very high share of 'Not in universe' values in some columns: most likely drop
# - about half of the migration code values are missing: most likely drop
# - some NAs in the country of birth columns: impute / feature engineer

Unnamed: 0,#NA,#Count,#Distinct,#NotInUniverse,dtype,skew
state of previous residence,708,198815,50,183750,object,
migration code-change in msa,99696,99827,9,1516,object,
migration code-change in reg,99696,99827,8,1516,object,
migration code-move within reg,99696,99827,9,1516,object,
migration prev res in sunbelt,99696,99827,3,84054,object,
country of birth father,6713,192810,42,0,object,
country of birth mother,6119,193404,42,0,object,
country of birth self,3393,196130,42,0,object,


In [90]:
# Collect the columns with too many 'Not in universe' values as drop candidates.
NIU_DROP_THRESHOLD = 80000  # columns with more NIU rows than this are listed
too_many_niu = info['#NotInUniverse'] > NIU_DROP_THRESHOLD
drops = list(info.loc[too_many_niu].index)
drops
# Work on missing values is paused here; EDA comes next.

['class of worker',
 'enroll in edu inst last wk',
 'major occupation code',
 'member of a labor union',
 'reason for unemployment',
 'region of previous residence',
 'state of previous residence',
 'migration prev res in sunbelt',
 'family members under 18',
 "fill inc questionnaire for veteran's admin"]

In [128]:
# EDA
# Split the columns into groups by type.

# Special cases: stored as int/float in the dataset but treated as categorical.
# These are every column whose name contains 'recode' plus three named columns.
spcase = [name for name in train.columns if 'recode' in name] + [
    'year',
    'own business or self employed',
    'veterans benefits',
]

# Nominal features: all object-dtype columns followed by the special cases.
nomial = train.select_dtypes(include=['object']).columns.tolist() + spcase

# Continuous features: int/float columns that are not special cases.
continuous = [
    name
    for name in train.select_dtypes(include=['int','float']).columns
    if name not in spcase
]

print("nomials ({}) : {}\n".format(len(nomial), nomial))
print("continuous ({}) : {}\n".format(len(continuous), continuous))

nomials (33) : ['class of worker', 'education', 'enroll in edu inst last wk', 'marital stat', 'major industry code', 'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union', 'reason for unemployment', 'full or part time employment stat', 'tax filer stat', 'region of previous residence', 'state of previous residence', 'detailed household and family stat', 'detailed household summary in household', 'migration code-change in msa', 'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago', 'migration prev res in sunbelt', 'family members under 18', 'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship', "fill inc questionnaire for veteran's admin", 'detailed industry recode', 'detailed occupation recode', 'year', 'own business or self employed', 'veterans benefits']

continuous (9) : ['age', 'wage per hour', 'capital gains', 'capital losses', 'dividends from stocks', 'instance weight'

In [None]:
# nomial features by target
for col in 