Pre-process

In [11]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing 

In [12]:
# read dataframe
train_data = pd.read_csv('./adult/adult.data', header= None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'])
test_data = pd.read_csv('./adult/adult.test', header= None, skiprows=1, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'])
originTest = test_data

print("Original data: Train", train_data.shape, ", Test", test_data.shape)

# drop duplicate value
train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)

print("After Dropping: Train", train_data.shape, ", Test", test_data.shape)

Original data: Train (32561, 15) , Test (16281, 15)
After Dropping: Train (32537, 15) , Test (16276, 15)


In [13]:
# education and education-num have same meaning
train_data.drop(['education'], axis = 1, inplace = True)
test_data.drop(['education'], axis = 1, inplace = True)

# fnlwgt is not important feature
train_data.drop(['fnlwgt'], axis = 1, inplace = True)
test_data.drop(['fnlwgt'], axis = 1, inplace = True)

# remove the space
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# replace the NAN into mode value
train_data['workclass'] = train_data['workclass'].replace("?", train_data['workclass'].mode()[0])
train_data['occupation'] = train_data['occupation'].replace("?",train_data['occupation'].mode()[0])
train_data['native-country'] = train_data['native-country'].replace("?",train_data['native-country'].mode()[0])


test_data['workclass'] = test_data['workclass'].replace("?", test_data['workclass'].mode()[0])
test_data['occupation'] = test_data['occupation'].replace("?",test_data['occupation'].mode()[0])
test_data['native-country'] = test_data['native-country'].replace("?",test_data['native-country'].mode()[0])

train_data.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# check if there is Nan or not
train_data.isnull().sum(axis=0)

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [15]:
# Label Encoding
# tranfer the value of class(income) into int(1 or 0)
# >50K is 1, <=50K is 0
train_data['income'] = train_data['income'].apply(lambda x: 1 if x == ">50K" else 0)
test_data['income'] = test_data['income'].apply(lambda x: 1 if x == ">50K" else 0)

# One Hot Encoding (Dummies)
trainData_dum = pd.get_dummies(train_data, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], dtype=int)
testData_dum = pd.get_dummies(test_data, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], dtype=int)
pd.DataFrame(trainData_dum)

#在讀熱編碼後會依照有名目之欄位產生資料，train_data比test_data多出了該欄位，故將test_data新增該欄位，讓兩個資料集欄位相同。
testData_dum['native-country_Holand-Netherlands'] = 0

# Normalization(z-score)
numerical_columns = ['age','education-num','capital-gain','capital-loss']
scaler = preprocessing.StandardScaler()
trainData_dum[numerical_columns] = scaler.fit_transform(trainData_dum[numerical_columns])
testData_dum[numerical_columns] = scaler.transform(testData_dum[numerical_columns])


## replace "&" into "and" (for graph)
trainData_dum = trainData_dum.rename(columns={'native-country_Trinadad&Tobago': 'native-country_Trinadad_and_Tobago'})
testData_dum = testData_dum.rename(columns={'native-country_Trinadad&Tobago': 'native-country_Trinadad_and_Tobago'})


In [16]:
trainData_dum

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad_and_Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.030390,1.134777,0.148292,-0.216743,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.836973,1.134777,-0.145975,-0.216743,13,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.042936,-0.420679,-0.145975,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1.056950,-1.198407,-0.145975,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,-0.776193,1.134777,-0.145975,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849519,0.745913,-0.145975,-0.216743,38,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32557,0.103716,-0.420679,-0.145975,-0.216743,40,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32558,1.423579,-0.420679,-0.145975,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32559,-1.216148,-0.420679,-0.145975,-0.216743,20,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [17]:
testData_dum

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad_and_Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_Holand-Netherlands
0,-0.996171,-1.198407,-0.145975,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,-0.042936,-0.420679,-0.145975,-0.216743,50,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,-0.776193,0.745913,-0.145975,-0.216743,40,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.397019,-0.031815,0.894653,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,-1.509451,-0.031815,-0.145975,-0.216743,30,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.030390,1.134777,-0.145975,-0.216743,36,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
16277,1.863534,-0.420679,-0.145975,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
16278,-0.042936,1.134777,-0.145975,-0.216743,50,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
16279,0.397019,1.134777,0.592400,-0.216743,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [18]:
# put "class" to the last column

# column = td.pop("hours-per-week")
# td.insert(td.shape[1], "hours-per-week", column)

# columns = td.columns.tolist()

# t_column = ted.pop("hours-per-week")
# ted.insert(ted.shape[1], "hours-per-week", t_column)


td = trainData_dum
X_train = td.drop('hours-per-week', axis=1)
y_train = td['hours-per-week']

ted = testData_dum
X_test = ted.drop('hours-per-week', axis=1)
y_test = ted['hours-per-week']

In [19]:
import xgboost as xgb
import time

startT = time.time()

# build XGBRegressor model
xgbrModel = xgb.XGBRegressor()
# Training
xgbrModel.fit(X_train,y_train.ravel())

endT = time.time()
print('執行時間:', endT-startT)

執行時間: 3.1376051902770996


In [20]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Prediction
predicted = xgbrModel.predict(X_test)
  
r2 = r2_score(y_test, predicted)
print('R2 Score:{0}'.format(r2))

rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('RMSE:{0}'.format(rmse))

mape = mean_absolute_percentage_error(y_test, predicted)
print('MAPE:{0}'.format(mape))

R2 Score:0.20146421761885602
RMSE:11.150908020809547
MAPE:0.32979648224386615
