In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_score, recall_score,accuracy_score,f1_score,roc_auc_score,mean_squared_error
from sklearn.metrics import log_loss
data=pd.read_csv(r'D:\书籍资料整理\kaggle\titanic\train.csv')

#注释一下列名
#survival	是否存活	0 = No, 1 = Yes
#pclass	票类型	1 = 1st, 2 = 2nd, 3 = 3rd
#sex	性别	
#Age	年龄	
#sibsp	泰坦尼克号上的兄弟姐妹/配偶	
#parch	# 泰坦尼克号上的父母/孩子	
#ticket	Ticket number	票号
#fare	票价	
#cabin	房间号	
#embarked	出发港	C =  瑟堡, Q = 昆士城, S = 南安普敦
data=data[['PassengerId','Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']] #去掉可能的无关值

In [5]:
sex=LabelEncoder()
sex.fit(data['Sex'])
data['Sex']=sex.transform(data['Sex'])

embarked=LabelEncoder()
embarked.fit(data['Embarked'])
data['Embarked']=embarked.transform(data['Embarked'])
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [7]:
X=data[[x for x in data.columns if x not in ['Survived','PassengerId']]]
y=data['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)
xgb_reg = xgb.XGBClassifier()

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

xgb_param = xgb_reg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)

model = xgb.XGBClassifier(n_estimators=15,max_depth=6)

param_grid = {'learning_rate': [0.005, 0.001, 0.01, 0.1, 0.2, 0.3]}
print(param_grid)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, y)
print(grid_search.best_params_)

{'learning_rate': [0.005, 0.001, 0.01, 0.1, 0.2, 0.3]}
{'learning_rate': 0.3}




In [12]:
import numpy as np
def custom_eval(preds, dtrain):
    pred_score = 1.0 / (1.0 + np.exp(-preds))
    pred = [1 if p > 0.5 else 0 for p in pred_score]
    lables = dtrain.get_label()
#     auc = roc_auc_score(lables, pred_score)
#     acc = accuracy_score(lables, pred)
    f1 = f1_score(lables, pred)
#     return [('accuracy', acc), ('auc', auc), ('f1', f1)]
    return [('f1', f1)]

In [18]:
xgb_param = xgb_reg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=50, nfold=5,
            metrics={'error'},
            feval=custom_eval, seed=0,
             callbacks=[xgb.callback.print_evaluation(show_stdv=False)
                       # ,xgb.callback.early_stop(3)
                       ])
print(cvresult.shape[0])
xgb_reg.set_params(n_estimators=22,max_depth=5)#把clf的参数设置成最好的树对应的参数

[0]	train-error:0.15494	test-error:0.18716	train-f1:0.78665	test-f1:0.74697
[1]	train-error:0.14633	test-error:0.18567	train-f1:0.80128	test-f1:0.75137
[2]	train-error:0.14858	test-error:0.18569	train-f1:0.79923	test-f1:0.74799
[3]	train-error:0.14708	test-error:0.17820	train-f1:0.80299	test-f1:0.75964
[4]	train-error:0.14110	test-error:0.19016	train-f1:0.80996	test-f1:0.73950
[5]	train-error:0.13399	test-error:0.18417	train-f1:0.81951	test-f1:0.75282




[6]	train-error:0.13137	test-error:0.18267	train-f1:0.82238	test-f1:0.75408
[7]	train-error:0.13137	test-error:0.18417	train-f1:0.82271	test-f1:0.75025
[8]	train-error:0.12313	test-error:0.19014	train-f1:0.83471	test-f1:0.74400
[9]	train-error:0.12051	test-error:0.19015	train-f1:0.83899	test-f1:0.74417
[10]	train-error:0.11340	test-error:0.18417	train-f1:0.84897	test-f1:0.75413
[11]	train-error:0.11078	test-error:0.18568	train-f1:0.85178	test-f1:0.74930
[12]	train-error:0.10816	test-error:0.19166	train-f1:0.85520	test-f1:0.74021
[13]	train-error:0.10741	test-error:0.19465	train-f1:0.85513	test-f1:0.73591
[14]	train-error:0.10292	test-error:0.19166	train-f1:0.86182	test-f1:0.74115
[15]	train-error:0.10367	test-error:0.18716	train-f1:0.86123	test-f1:0.74886
[16]	train-error:0.09956	test-error:0.18865	train-f1:0.86723	test-f1:0.74769
[17]	train-error:0.09694	test-error:0.18566	train-f1:0.87091	test-f1:0.75196
[18]	train-error:0.09394	test-error:0.18265	train-f1:0.87486	test-f1:0.75700
[19

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=22, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [39]:
xgb_reg.fit(X_train, y_train, eval_metric='auc')
y_pred = xgb_reg.predict(X_val)
y_score = xgb_reg.predict_proba(X_val)



In [40]:
print('测试集准确率:',accuracy_score(y_val, y_pred))
print('测试集精度:',precision_score(y_val, y_pred))
print('测试集召回率:',recall_score(y_val, y_pred))
print('auc:',roc_auc_score(y_val, y_score[:,1]))

测试集准确率: 0.8565022421524664
测试集精度: 0.7972972972972973
测试集召回率: 0.7763157894736842
auc: 0.8972878625134264


In [41]:
log_loss(y_val, y_pred)

4.95629075604751

In [42]:
data_test=pd.read_csv(r'D:\书籍资料整理\kaggle\titanic\test.csv')
data_test=data_test[['PassengerId','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']] #去掉可能的无关值
data_test['Sex']=sex.transform(data_test['Sex'])
data_test['Embarked']=embarked.transform(data_test['Embarked'])
data_test.head()

#Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
X_test=data_test[[x for x in data_test.columns if x not in ['PassengerId']]]

In [43]:
y_test_pre=xgb_reg.predict(X_test)
data_test['Survived']=y_test_pre
data_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,892,3,1,34.5,0,0,7.8292,1,0
1,893,3,0,47.0,1,0,7.0,2,0
2,894,2,1,62.0,0,0,9.6875,1,0
3,895,3,1,27.0,0,0,8.6625,2,0
4,896,3,0,22.0,1,1,12.2875,2,0


In [44]:
data_test=data_test[['PassengerId','Survived']]
data_test.to_csv(r'D:\书籍资料整理\kaggle\titanic\output.csv',index=False)

In [45]:
for name, score in zip([x for x in X.columns], xgb_reg.feature_importances_):
    print(name, score)

Pclass 0.16923618
Sex 0.6247363
Age 0.039216634
SibSp 0.0680493
Parch 0.025624253
Fare 0.042688314
Embarked 0.03044906
