In [49]:
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split

In [36]:
data = pd.read_feather('./data/uci_adult/raw_data.feather')
print(f'Dataframe shape: {data.shape}')
data.head(3)

Dataframe shape: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


## Immediate data partitioning

In [55]:
# get train set
data_train, data_rem = train_test_split(data, 
                                        test_size=5000, 
                                        shuffle=True, 
                                        random_state=2011)
# get validation, test sets
data_valid, data_test = train_test_split(data_rem, 
                                         test_size=0.5, 
                                         shuffle=True, 
                                         random_state=2011)

print(f'Number of obs in train,valid,test sets:')
print(f'({data_train.shape[0]}, {data_valid.shape[0]}, {data_test.shape[0]})')

with open('./data/uci_adult/train_valid_test.pkl','wb') as f:
    pickle.dump([data_train,data_valid,data_test],f)

Number of obs in train,valid,test sets:
(27561, 2500, 2500)


In [63]:
# Missing data
assert data_train.isnull().sum().sum()==0, 'Training data has missingness'
assert data_valid.isnull().sum().sum()==0, 'Validation data has missingness'
assert data_test.isnull().sum().sum()==0, 'Test data has missingness'

In [64]:
data_train.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
3502,40,Private,572751,Prof-school,15,Married-civ-spouse,Craft-repair,Husband,White,Male,5178,0,40,Mexico,>50K
28622,57,Private,201991,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
2102,30,Private,343699,HS-grad,9,Divorced,Craft-repair,Unmarried,White,Female,0,0,40,United-States,<=50K


In [72]:
# Handle duplicates
if sum(data_train.duplicated())>0:
    _ = sum(data_train.duplicated())
    print(f"{_} duplicates rows found, keeping first")
    data_train = data_train.drop_duplicates()

15 duplicates rows found, keeping first


In [75]:
assert sum(data_valid.duplicated())==0, 'Validation data has duplicates'
assert sum(data_test.duplicated())==0, 'Test data has duplicates'

In [76]:
data_train.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
3502,40,Private,572751,Prof-school,15,Married-civ-spouse,Craft-repair,Husband,White,Male,5178,0,40,Mexico,>50K
28622,57,Private,201991,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
2102,30,Private,343699,HS-grad,9,Divorced,Craft-repair,Unmarried,White,Female,0,0,40,United-States,<=50K


In [79]:
data_train.workclass.value_counts()

workclass
 Private             19215
 Self-emp-not-inc     2152
 Local-gov            1757
 ?                    1575
 State-gov            1088
 Self-emp-inc          958
 Federal-gov           783
 Without-pay            11
 Never-worked            7
Name: count, dtype: int64

In [81]:
data_train.education.value_counts()

education
 HS-grad         8854
 Some-college    6169
 Bachelors       4530
 Masters         1485
 Assoc-voc       1181
 11th             997
 Assoc-acdm       910
 10th             776
 7th-8th          540
 Prof-school      479
 9th              431
 12th             373
 Doctorate        362
 5th-6th          275
 1st-4th          140
 Preschool         44
Name: count, dtype: int64

In [82]:
data_train.education_num.value_counts()

education_num
9     8854
10    6169
13    4530
14    1485
11    1181
7      997
12     910
6      776
4      540
15     479
5      431
8      373
16     362
3      275
2      140
1       44
Name: count, dtype: int64

In [83]:
data_train.marital_status.value_counts()

marital_status
 Married-civ-spouse       12648
 Never-married             9039
 Divorced                  3778
 Separated                  861
 Widowed                    846
 Married-spouse-absent      354
 Married-AF-spouse           20
Name: count, dtype: int64

In [84]:
data_train.occupation.value_counts()

occupation
 Prof-specialty       3516
 Craft-repair         3475
 Exec-managerial      3414
 Adm-clerical         3200
 Sales                3094
 Other-service        2789
 Machine-op-inspct    1682
 ?                    1582
 Transport-moving     1342
 Handlers-cleaners    1154
 Farming-fishing       835
 Tech-support          798
 Protective-serv       534
 Priv-house-serv       125
 Armed-Forces            6
Name: count, dtype: int64

In [86]:
data_train.relationship.value_counts()

relationship
 Husband           11148
 Not-in-family      7086
 Own-child          4237
 Unmarried          2928
 Wife               1320
 Other-relative      827
Name: count, dtype: int64

In [87]:
data_train.sex.value_counts()

sex
 Male      18417
 Female     9129
Name: count, dtype: int64

In [91]:
data_train.hours_per_week.value_counts()

hours_per_week
40    12841
50     2364
45     1551
60     1262
35     1092
      ...  
94        1
61        1
87        1
92        1
81        1
Name: count, Length: 94, dtype: int64

In [94]:
data_train.native_country.value_counts()

native_country
 United-States                 24664
 Mexico                          528
 ?                               493
 Philippines                     172
 Germany                         121
 Canada                          104
 Puerto-Rico                      98
 El-Salvador                      86
 Cuba                             82
 India                            80
 England                          77
 Jamaica                          72
 South                            69
 Italy                            66
 China                            66
 Dominican-Republic               60
 Vietnam                          59
 Japan                            54
 Guatemala                        53
 Poland                           52
 Columbia                         49
 Taiwan                           45
 Haiti                            41
 Iran                             38
 Nicaragua                        32
 Portugal                         31
 France                

In [95]:
data_train.salary.value_counts()

salary
 <=50K    20902
 >50K      6644
Name: count, dtype: int64