## Statistical Learning and Deep Learning, 2020 Fall
### Homework 2

b06705208 資管四 朱紹瑜

### Q1
First, read the training data and the testing data.

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn import preprocessing

# column names, in the order the fields appear in the raw files
col_name = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 
           'marital-status', 'occupation', 'relationship', 'race', 'gender', 
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# column order used from here on; the label 'income' stays last
col_sequence = ['capital-loss', 'hours-per-week', 'capital-gain', 'educational-num', 'age', 'fnlwgt', 
              'relationship', 'race', 'gender', 'occupation', 'education', 'native-country', 'workclass', 
               'marital-status', 'income']

# training set: read without a header row, name the columns, then reorder them
train = pd.read_csv('adult.data', header=None)
train.columns = col_name
train = train.loc[:, col_sequence]

# testing set: same three steps
test = pd.read_csv('adult.test', header=None)
test.columns = col_name
test = test.loc[:, col_sequence]

Then, remove every row that contains a missing value (missing entries are stored as ` ?` in the raw files).

In [2]:
# Missing entries are stored as ' ?' (with a leading space). Turn them into
# NaN, drop every row that contains one, and renumber the remaining rows from 0.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
train = train.replace(' ?', np.nan).dropna().reset_index(drop=True)
test = test.replace(' ?', np.nan).dropna().reset_index(drop=True)
print(train.shape)
print(test.shape)

(30162, 15)
(15060, 15)


Split $x$, $y$. Convert '>50K' and '<=50K' into 1 and 0.

In [3]:
# Features are every column but the last; the label is the last column ('income').
# The label frames are copied so the assignments below modify independent
# objects rather than slices of train/test (avoids SettingWithCopyWarning).
x_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1:].copy()
x_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1:].copy()

# The label strings keep their leading space, and the test file's labels end with '.'.
convert_train = {' >50K': 1, ' <=50K': 0}
convert_test = {' >50K.': 1, ' <=50K.': 0}
y_train['income'] = y_train['income'].map(convert_train)
y_test['income'] = y_test['income'].map(convert_test)

# Series.map yields NaN for any value missing from the dictionary, so make an
# unexpected label spelling fail here instead of silently producing NaN labels.
assert y_train['income'].notna().all(), 'unmapped label in training data'
assert y_test['income'].notna().all(), 'unmapped label in testing data'

print(f'x_train: {x_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'x_test: {x_test.shape}')
print(f'y_test: {y_test.shape}')

x_train: (30162, 14)
y_train: (30162, 1)
x_test: (15060, 14)
y_test: (15060, 1)


For all categorical features, apply 1-of-K encoding.

In [4]:
# Split the features by dtype: object columns are treated as categorical,
# int64 columns as numerical.
x_train_cat = x_train.select_dtypes(include='object')
x_train_num = x_train.select_dtypes(include='int64')
x_test_cat = x_test.select_dtypes(include='object')
x_test_num = x_test.select_dtypes(include='int64')
print(x_train_cat.shape, x_train_num.shape)
print(x_test_cat.shape, x_test_num.shape)

# 1-of-K encode the categorical columns. The dtype is pinned to uint8 because
# pandas >= 2.0 defaults to bool, and combining bool indicators with the float
# numerical columns later would make DataFrame.to_numpy() return an object array.
x_train_cat = pd.get_dummies(x_train_cat, dtype=np.uint8)
x_test_cat = pd.get_dummies(x_test_cat, dtype=np.uint8)

# Remove every space from the generated column names (the raw values carry a
# leading space). regex=False makes the literal match explicit.
x_train_cat.columns = x_train_cat.columns.str.replace(' ', '', regex=False)
x_test_cat.columns = x_test_cat.columns.str.replace(' ', '', regex=False)
print(x_train_cat.shape, x_train_num.shape)
print(x_test_cat.shape, x_test_num.shape)

(30162, 8) (30162, 6)
(15060, 8) (15060, 6)
(30162, 98) (30162, 6)
(15060, 97) (15060, 6)


Remove one-hot features that appear fewer than 10 times in the training data, and keep the same columns in the testing data.

In [5]:
# Keep only the indicator columns that are set in at least 10 training rows.
x_train_cat = x_train_cat.loc[:, x_train_cat.sum() >= 10]

# Give the test indicators the same columns, in the same order, as the training
# indicators. reindex (rather than plain column selection) drops test-only
# categories and adds an all-zero column for any kept category that never
# occurs in the test set, instead of raising KeyError.
x_test_cat = x_test_cat.reindex(columns=x_train_cat.columns, fill_value=0)
print(x_train_cat.shape)
print(x_test_cat.shape)

(30162, 96)
(15060, 96)


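Standardize the numerical features. The scaler is fitted on the training data only and then applied to both the training and the testing data.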

In [6]:
# Fit the scaler on the training values only, then reuse it for the test values
x_scaler = preprocessing.StandardScaler()
x_train_val = x_scaler.fit_transform(x_train_num.to_numpy())
x_test_val = x_scaler.transform(x_test_num.to_numpy())

# Wrap the scaled arrays back into DataFrames under the original column names
x_train_num = pd.DataFrame(x_train_val, columns=x_train_num.columns)
x_test_num = pd.DataFrame(x_test_val, columns=x_test_num.columns)

Combine the data into a dictionary.

In [7]:
# Put the numerical and one-hot blocks side by side (numerical columns first).
x_train = pd.concat([x_train_num, x_train_cat], axis=1)
x_test = pd.concat([x_test_num, x_test_cat], axis=1)

# concat(axis=1) aligns on the row index; mismatched indexes would silently
# add NaN-filled rows, so check that the row counts are unchanged.
assert len(x_train) == len(x_train_num) == len(x_train_cat), 'train row index mismatch'
assert len(x_test) == len(x_test_num) == len(x_test_cat), 'test row index mismatch'

# Build the dictionary in one place ('num_col' was previously assigned twice).
# The labels are stored as row vectors of shape (1, n_samples).
adult50 = {
    'num_col': x_train_num.columns.to_list(),
    'x_train': x_train.to_numpy(),
    'x_test': x_test.to_numpy(),
    'y_train': np.reshape(y_train.to_numpy(), (1, -1)),
    'y_test': np.reshape(y_test.to_numpy(), (1, -1)),
    'columnname': x_train.columns.tolist(),
}

Compare our result with `adult_m50k.pickle`.

In [8]:
import pickle
dsfile = 'adult_m50k.pickle'
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open(dsfile, 'rb') as fh1:
    adult50kp = pickle.load(fh1)

# metadata: key set, full column list, numerical column list
print('test dictionary keys: ', adult50kp.keys() == adult50.keys())
print('test columnname: ', adult50kp['columnname'] == adult50['columnname'])
print('test num_col: ', adult50kp['num_col'] == adult50['num_col'])

# arrays: count the element-wise differences for each entry
elems = ['x_train', 'x_test', 'y_train', 'y_test']
for aelem in elems:
    cnomatch = np.sum(adult50kp[aelem] != adult50[aelem])
    if cnomatch != 0:
        print(aelem, "%d elements no match!" % cnomatch)
        continue
    print(aelem, "match!")

test dictionary keys:  True
test columnname:  [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
test num_col:  True
x_train match!
x_test match!
y_train match!
y_test match!
