In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold

import os
import logging
import warnings

!pip install wget
import wget
# Download the UCI "adult" dataset files into the current working directory.
# A file that already exists locally is skipped, so re-running this cell does
# not hit the network again and does not leave renamed duplicate copies behind
# (the wget package picks a new name when the target file already exists).
files = ['Index', 'adult.data', 'adult.names', 'adult.test','old.adult.names']
base_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
for file in files:
  url = base_url + file
  if os.path.exists(file):
    continue
  # Explicit `out` keeps the local name equal to the entry in `files`.
  wget.download(url, out=file)



In [None]:
# Column names assigned positionally when reading the adult CSV files;
# the last entry, 'income', is the label column.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Columns left as parsed by read_csv (not cast to 'category').
columns_int = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# Columns that get cast to the pandas 'category' dtype after loading.
columns_cat = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]

# Every column except the 'income' label, in the same order as `columns`.
features = [name for name in columns if name != 'income']

In [None]:
# Read the training split; header=None means no line of the file is treated
# as a header, so the column names come from `columns`.
train = pd.read_csv('adult.data', names=columns, header=None)
# Cast every column listed in `columns_cat` to the 'category' dtype.
train = train.astype(dict.fromkeys(columns_cat, 'category'))
print(train.dtypes)
train.head()

age                  int64
workclass         category
fnlwgt               int64
education         category
education-num        int64
marital-status    category
occupation        category
relationship      category
race              category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
income            category
dtype: object


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Read the test split; header=0 consumes the file's first line as a header
# row, and `names` then replaces it with `columns`.
test = pd.read_csv('adult.test', names=columns, header=0)

# Income labels in the test file end with a '.', unlike the training labels;
# `ss` translates each test label to its training-set spelling.
train_s = [' <=50K', ' >50K']
test_s = [' <=50K.', ' >50K.']
ss = {dotted: plain for dotted, plain in zip(test_s, train_s)}
# Strict lookup: a label missing from `ss` raises KeyError.
test['income'] = test['income'].apply(lambda label: ss[label])

# Cast every column listed in `columns_cat` to the 'category' dtype.
test = test.astype(dict.fromkeys(columns_cat, 'category'))
print(test.dtypes)
test.head()

age                  int64
workclass         category
fnlwgt               int64
education         category
education-num        int64
marital-status    category
occupation        category
relationship      category
race              category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
income            category
dtype: object


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
# Lookup table from raw 'native-country' label to a numeric value.
# Keys keep the leading space found in the raw labels.
# NOTE(review): the source and units of these numbers are not stated here;
# presumably a per-country economic indicator. TODO confirm.
s2 = {
    ' Trinadad&Tobago': 49.47,
    ' Guatemala': 129.8,
    ' Laos': 15.44,
    ' Peru': 448.8,
    ' Thailand': 1467,
    ' Holand-Netherlands': 3791,
    ' Outlying-US(Guam-USVI-etc)': 0,
    ' Vietnam': 162.9,
    ' Ireland': 571.7,
    ' Ecuador': 227.1,
    ' Columbia': 817,
    ' Japan': 49070,
    ' France': 13940,
    ' Cuba': 284.5,
    ' Yugoslavia': 1200,
    ' Dominican-Republic': 146.6,
    ' Nicaragua': 38.63,
    ' Hungary': 431.7,
    ' Puerto-Rico': 396.9,
    ' Cambodia': 27.91,
    ' Jamaica': 54.19,
    ' Germany': 22050,
    ' Honduras': 46.42,
    ' Mexico': 5278,
    ' Haiti': 21.68,
    ' Greece': 1166,
    ' South': 1398,
    ' India': 3273,
    ' Philippines': 640.8,
    ' El-Salvador': 76.79,
    ' England': 12396.9,
    ' Poland': 1108,
    ' Canada': 5781,
    ' Taiwan': 305.2,
    ' Iran': 718.4,
    ' United-States': 72870,
    ' Scotland': 0,
    ' Portugal': 996.9,
    ' China': 5643,
    ' Italy': 10990,
    ' ?': 0,
    ' Hong': 1358.1,
}

In [None]:
# Replace each 'native-country' label with its numeric value from `s2`,
# in both the train and test frames.
for frame in (train, test):
    # A numeric column means this cell has already run on the frame; looking
    # the numbers up in `s2` again would raise KeyError, so skip it.
    if pd.api.types.is_numeric_dtype(frame['native-country']):
        continue
    # Strict lookup: a label missing from `s2` raises KeyError.
    # The cast to float fixes the result dtype: applying a function to a
    # categorical column may return either a categorical or a plain numeric
    # column, depending on whether the mapped values are unique.
    frame['native-country'] = (
        frame['native-country'].apply(lambda country: s2[country]).astype(float)
    )
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,72870.0,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,72870.0,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,72870.0,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,72870.0,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,284.5,<=50K


In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [None]:
# Show the distinct values of the 'education' column as a plain Python list.
list(train['education'].unique())

[' Bachelors',
 ' HS-grad',
 ' 11th',
 ' Masters',
 ' 9th',
 ' Some-college',
 ' Assoc-acdm',
 ' Assoc-voc',
 ' 7th-8th',
 ' Doctorate',
 ' Prof-school',
 ' 5th-6th',
 ' 10th',
 ' 1st-4th',
 ' Preschool',
 ' 12th',
 nan]