In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt 

import os

#print(os.listdir("../input/us-census-data"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test CSVs of the US census (Adult) dataset from the Kaggle input directory.
# NOTE(review): read_csv takes the first row as the header by default, and the next cell
# overwrites df_train.columns. If these files ship without a header row, the first record
# is consumed as column labels -- confirm, and pass header=None if so.
df_train = pd.read_csv("../input/us-census-data/adult-training.csv")
df_test = pd.read_csv("../input/us-census-data/adult-test.csv")

# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Give the training frame readable column names.
# The final column, "income", is the one later used as the label.
df_train.columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]
df_train.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Print the same overview for the test frame.
# NOTE(review): in the visible cells df_test is only loaded and inspected; it is never
# renamed, encoded, or used for evaluation.
df_test.info()

In [None]:
# Summary restricted to the object-dtype (string) columns via include=['O'];
# the numeric int/float columns are excluded from this view.
df_train.describe(include=['O'])

## 数据预处理
### 转为类别数据
将object数据转为类别数据，Ordinal Encoding to Categoricals

In [None]:
# Replace every object-dtype column by the integer codes of its categories.
# The column list is computed up front, then each column is converted through the
# pandas "category" dtype; missing values receive the code -1.
object_columns = [column for column in df_train.columns
                  if df_train[column].dtype == 'object']

for feature in object_columns:
    df_train[feature] = df_train[feature].astype('category').cat.codes

df_train.head()

In [None]:
# Re-check the dtypes now that the object columns have been replaced by integer codes.
df_train.info()

### 数据标准化
Standard Scaler

In [None]:


# Separate the feature columns from the label, which is the last column of the frame.
# `columns=` is used instead of the positional axis argument (`drop(label, 1)`):
# pandas 2.0 made every argument after `labels` keyword-only, so the old call raises TypeError.
X_df = df_train.drop(columns=df_train.columns[-1])
y_df = df_train.iloc[:, -1]

In [None]:
# Label vector as a NumPy array.
y = y_df.to_numpy()

# Feature matrix, standardized column by column with a freshly fitted StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(X_df.to_numpy())

## Feature Selection(特征选取)
- Use RFE (recursive feature elimination) with a DecisionTreeClassifier to select the features
- Keep 10 features (15 columns in total; the last column is the label)

In [None]:
from sklearn.feature_selection import RFE

# Reference fit of a seeded decision tree on all features; `relval` keeps its
# feature importances for inspection.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X, y)

relval = tree.feature_importances_

names = X_df.columns.tolist()

# Recursive feature elimination down to 10 features, driven by a decision tree
# (the variable is called `lr` for historical reasons only).
# The estimator is seeded: an unseeded tree breaks ties randomly, so the set of
# selected features could change from run to run.
lr = DecisionTreeClassifier(random_state=0)
selector = RFE(lr, n_features_to_select=10)
selector.fit(X, y.ravel())

# ranking_ holds integer ranks (1 = selected); print (rank, column name) pairs in rank order.
print("feature after order: ", sorted(zip(map(int, selector.ranking_), names)))

In [None]:
# Boolean mask over the feature columns: True where RFE kept the feature.
selected_mask = selector.get_support()

# Restrict the feature frame to the selected columns and show their names.
X_df_new = X_df.loc[:, selected_mask]
X_df_new.columns

In [None]:
# Split the selected features into training and test sets.
# - The split is seeded so that reruns produce the same partition.
# - The scaler is fitted on the training rows only and then applied to the test rows,
#   so test-set statistics do not leak into the preprocessing.
# X_new is the unscaled float matrix of the selected features.
X_new = np.array(X_df_new.astype(float))
X_train, X_test, y_train, y_test = train_test_split(X_new, y, random_state=0)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 建立Logistic Regression模型

In [None]:
from sklearn import metrics

# Baseline model: logistic regression with default settings, fitted on the training split.
lr = LogisticRegression()
lr_clf = lr.fit(X_train, y_train.ravel())

# Score the baseline on the held-out split and report its accuracy.
y_pred = lr_clf.predict(X_test)
lr_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Logistic Regression %s" % lr_accuracy)

## 建立XGBoost模型


- A brute-force scan over all parameters is possible; here are some tricks
- max_depth is usually 6, 7 or 8
- the learning rate is usually around 0.05, but small changes can make a big difference
- tuning min_child_weight, subsample and colsample_bytree is the main way to fight overfitting
- n_estimators is the number of boosting rounds
- ensembling xgboost models trained with multiple seeds may reduce variance

- learning rate:通过减少每一步的权重，可以提高模型的鲁棒性
- colsample_bytree,用来控制每棵随机采样的列数的占比(每一列是一个特征)
- max_depth,max_depth越大，模型会学到更具体更局部的样本
- min_child_weight 决定最小叶子节点样本权重和
- subsample 这个参数控制对于每棵树，随机采样的比例,一般0.5-1
- objective 目标函数的选择要根据问题确定，如果是回归问题 ，一般是 reg:linear , reg:logistic , count:poisson 如果是分类问题，一般是binary:logistic ,rank:pairwise
- nthread 线程数（例如 `'nthread': [4]`）；使用超线程时，xgboost 可能反而变慢

### GridSearchCV(栅格搜索交叉验证）
调用sklearn.model_selection的GridSearchCV进行模型调参
- 需要传入4个参数
- 第1个参数是模型对象，
- 第2个参数是参数表格，数据类型为字典，
- 第3个关键字参数cv的数据类型是交叉验证对象，
- 第4个关键字参数scoring是字符串str或评分函数对象

In [None]:
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Fit the scaler on the training split only and reuse its statistics for the test split.
# (Calling fit_transform on the test split would standardize it with its own mean/variance,
# i.e. differently from the data the model was trained on.)
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Step 1: coarse grid over max_depth and min_child_weight (step size 2).
# The key must be spelled `min_child_weight`; with a hyphen it is not the XGBoost
# parameter and the grid would not actually tune it.
cv_params = {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]}

# Settings held fixed during this search ('seed' makes the runs repeatable).
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0,
              'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}

# The fixed settings must be unpacked as keyword arguments; passing the dict
# positionally does not configure the classifier with them.
xgbm_gsearch1 = GridSearchCV(estimator=XGBClassifier(**ind_params), param_grid=cv_params,
                             cv=5, scoring='accuracy', n_jobs=-1)

xgb_optimized_clf = xgbm_gsearch1.fit(X_train_std, y_train.ravel())

print('the best parameters: ', xgb_optimized_clf.best_params_)

# Mean cross-validated accuracy (+/- two standard deviations) for every grid point.
means = xgb_optimized_clf.cv_results_['mean_test_score']
stds = xgb_optimized_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, xgb_optimized_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

In [None]:
# Step 2: refine the search around the best max_depth reported by the coarse grid (7),
# now with step size 1 for both max_depth and min_child_weight.
# NOTE(review): the earlier result was obtained while the min_child_weight key was
# misspelled, so its reported best value for that parameter should be re-checked.
cv_params = {'max_depth': [5, 6, 7], 'min_child_weight': [1, 2, 3]}

# Settings held fixed during this search ('seed' makes the runs repeatable).
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0,
              'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}

# Unpack the fixed settings as keyword arguments; passing the dict positionally
# does not configure the classifier with them.
xgbm_gsearch1 = GridSearchCV(estimator=XGBClassifier(**ind_params), param_grid=cv_params,
                             cv=5, scoring='accuracy', n_jobs=-1)

xgb_optimized_clf = xgbm_gsearch1.fit(X_train_std, y_train.ravel())

print('the best parameters: ', xgb_optimized_clf.best_params_)

# Mean cross-validated accuracy (+/- two standard deviations) for every grid point.
means = xgb_optimized_clf.cv_results_['mean_test_score']
stds = xgb_optimized_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, xgb_optimized_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

In [None]:
# Predict the held-out split with the fitted grid search object
# (GridSearchCV delegates predict to its refitted best estimator).
y_pred = xgb_optimized_clf.predict(X_test_std)


# Per-class precision, recall, F1 and support.
print(classification_report(y_test, y_pred))

- sklearn.metrics的classification_report函数用于显示主要分类指标的文本报告．在报告中显示每个类的准确率，召回率，F1值等信息,support列为每个标签的出现次数
- 可以看出，这次参数的准确率为85%，召回率为86%
- 根据得出的最佳参数为'max_depth': 7, 'min_child_weight': 1，继续修改参数

### 调XGBoost参数

In [None]:
# Step 3: tune learning_rate and subsample with the tree-shape parameters held fixed.
# The grid key must be `learning_rate` (underscore); "learning rate" with a space is not
# the XGBoost parameter, so the learning rate would not actually be varied.
cv_params_new = {'learning_rate': [0.1, 0.05, 0.01], 'subsample': [i/100 for i in range(75, 90, 5)]}

xgbm_gsearch2 = GridSearchCV(estimator = XGBClassifier(max_depth=7, min_child_weight=1, n_estimators=1000, seed=0,
                            colsample_bytree=0.8, objective='binary:logistic'),
                             param_grid = cv_params_new, scoring='accuracy', cv=5, n_jobs=-1)

xgb_optimized_clf = xgbm_gsearch2.fit(X_train_std, y_train.ravel())

print('the best parameters: ', xgb_optimized_clf.best_params_)

# Mean cross-validated accuracy (+/- two standard deviations) for every grid point.
means = xgb_optimized_clf.cv_results_['mean_test_score']
stds = xgb_optimized_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, xgb_optimized_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

In [None]:
# Predict the held-out split with the most recently fitted grid search object.
y_pred = xgb_optimized_clf.predict(X_test_std)


# Per-class precision, recall, F1 and support.
print(classification_report(y_test, y_pred))

### 防止过拟合
使用Early Stopping CV

In [None]:
import xgboost as xgb

# Training data in XGBoost's native container.
xgb_matrix = xgb.DMatrix(X_train, y_train)

# Booster parameters for the native API.
# - The key is `max_depth` (underscore); "max-depth" is not a booster parameter, so the
#   depth would silently stay at its default.
# - `n_estimators` belongs to the scikit-learn wrapper only; in the native API the number
#   of rounds is given by `num_boost_round`, so it is not listed here.
params_best = {'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.85, 'learning_rate': 0.1,
               'seed': 0, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}

# 5-fold cross-validation on the classification error; stop once the error has not
# improved for 100 consecutive rounds (at most 3000 rounds).
cv_xgb = xgb.cv(params=params_best, dtrain=xgb_matrix, num_boost_round=3000,
                nfold=5, metrics=['error'], early_stopping_rounds=100)

# Last rows of the evaluation history.
print(cv_xgb.tail(5))

In [None]:
# Train the final booster for the number of rounds kept by the early-stopped CV run:
# xgb.cv truncates its evaluation history at the best iteration, so the number of rows
# in cv_xgb is that round count. (A fixed 1000 rounds would ignore the early-stopping result.)
best_num_rounds = len(cv_xgb)
final_xgb_model = xgb.train(params=params_best, dtrain=xgb.DMatrix(X_train, y_train),
                            num_boost_round=best_num_rounds)

# Predicted probabilities on the held-out split.
test_mat = xgb.DMatrix(X_test)
y_pred = final_xgb_model.predict(test_mat)
y_pred

In [None]:
# Turn the predicted probabilities into class labels, in place, with a 0.5 cut-off.
# Both masks are computed from the probabilities before anything is overwritten.
above_cutoff = y_pred > 0.5
at_or_below_cutoff = y_pred <= 0.5
y_pred[above_cutoff] = 1
y_pred[at_or_below_cutoff] = 0

# Accuracy of the final booster on the held-out split.
xgb_accuracy = metrics.accuracy_score(y_test, y_pred)
print("XGBoost accuracy is {0:.2%}".format(xgb_accuracy))

# Feature importance scores of the booster, keyed by feature name.
feature_importance_dict = final_xgb_model.get_fscore()
feature_importance_dict

In [None]:
# Disabled sketch: rename the keys of feature_importance_dict from f0..f9 to column names.
# NOTE(review): the hard-coded names below presume one particular outcome of the RFE
# selection; verify against X_df_new.columns before re-enabling.
# feature_importance_dict.update({'age':feature_importance_dict.pop('f0')})
# feature_importance_dict.update({'workclass':feature_importance_dict.pop('f1')})
# feature_importance_dict.update({'fnlwgt':feature_importance_dict.pop('f2')})
# feature_importance_dict.update({'education-num':feature_importance_dict.pop('f3')})
# feature_importance_dict.update({'occupation':feature_importance_dict.pop('f4')})
# feature_importance_dict.update({'relationship':feature_importance_dict.pop('f5')})
# feature_importance_dict.update({'capital_gain':feature_importance_dict.pop('f6')})
# feature_importance_dict.update({'capital_loss':feature_importance_dict.pop('f7')})
# feature_importance_dict.update({'hours_per_week':feature_importance_dict.pop('f8')})
# feature_importance_dict.update({'native_country':feature_importance_dict.pop('f9')})


# Render figures inline and enlarge the seaborn font scale for readability.
%matplotlib inline
import seaborn as sns
sns.set(font_scale = 1.5)

# Bar chart of the booster's feature importance scores.
xgb.plot_importance(final_xgb_model)
plt.show()

## 显示预测结果
### 分类报告

In [None]:
# Per-class precision, recall, F1 and support for the thresholded predictions of the final booster.
print(classification_report(y_test, y_pred))

可以看出准确率达到86%，召回率达到87%

### 绘制混淆矩阵

In [None]:
import itertools
from sklearn.metrics import confusion_matrix

def plt_confusion_matirx(cm, classes, title = "Confusion Matrix",cmap = plt.cm.Reds):
    """Draw a confusion matrix as an annotated heat map on the current pyplot figure.

    Args:
        cm: 2-D array of counts; rows are drawn on the "True Classification" axis and
            columns on the "Predicted Classification" axis.
        classes: Tick labels, used for both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.

    Returns:
        None. The caller is responsible for creating the figure and calling plt.show().
    """
    plt.imshow(cm, interpolation="nearest", cmap = cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Write each count into its cell; switch to white text on cells darker than the
    # midpoint of the colour range so the number stays legible.
    thresh = cm.max()/2.0
    for i , j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j ,i , cm[i, j], horizontalalignment="center",
                 color= "white" if cm[i, j] > thresh else "black")

    plt.xlabel("Predicted Classification")
    plt.ylabel("True Classification")
    # Run tight_layout only after the axis labels exist so that they are included
    # in the layout computation instead of being clipped.
    plt.tight_layout()

# Confusion matrix of the final predictions on the held-out split, drawn on a fresh figure.
class_names = [0, 1]
cm = confusion_matrix(y_test, y_pred)

plt.figure()
# title and cmap are left at the helper's defaults ("Confusion Matrix", plt.cm.Reds).
plt_confusion_matirx(cm, classes=class_names)
plt.show()