In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_score, recall_score,accuracy_score,f1_score,roc_auc_score,mean_squared_error
from sklearn.metrics import log_loss
data=pd.read_csv(r'D:\书籍资料整理\kaggle\titanic\train.csv')

#注释一下列名
#survival	是否存活	0 = No, 1 = Yes
#pclass	票类型	1 = 1st, 2 = 2nd, 3 = 3rd
#sex	性别	
#Age	年龄	
#sibsp	泰坦尼克号上的兄弟姐妹/配偶	
#parch	# 泰坦尼克号上的父母/孩子	
#ticket	Ticket number	票号
#fare	票价	
#cabin	房间号	
#embarked	出发港	C =  瑟堡, Q = 昆士城, S = 南安普敦
data=data[['PassengerId','Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']] #去掉可能的无关值

In [2]:
sex=LabelEncoder()
sex.fit(data['Sex'])
data['Sex']=sex.transform(data['Sex'])

embarked=LabelEncoder()
embarked.fit(data['Embarked'])
data['Embarked']=embarked.transform(data['Embarked'])
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [3]:
X=data[[x for x in data.columns if x not in ['Survived','PassengerId']]]
y=data['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)
xgb_reg = xgb.XGBClassifier()

In [4]:
xgb_param = xgb_reg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=10, nfold=5,
             metrics={'error'}, seed=0,
             callbacks=[xgb.callback.print_evaluation(show_stdv=False)
                       # ,xgb.callback.early_stop(3)
                       ])
xgb_reg.set_params(n_estimators=cvresult.shape[0])#把clf的参数设置成最好的树对应的参数

[0]	train-error:0.14184	test-error:0.19613
[1]	train-error:0.13024	test-error:0.19761
[2]	train-error:0.12687	test-error:0.19314
[3]	train-error:0.12313	test-error:0.18266
[4]	train-error:0.12014	test-error:0.18566
[5]	train-error:0.11977	test-error:0.18266
[6]	train-error:0.11527	test-error:0.18862
[7]	train-error:0.11041	test-error:0.18413
[8]	train-error:0.10891	test-error:0.17815
[9]	train-error:0.10704	test-error:0.17815




XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=10, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [5]:
xgb_reg.fit(X_train, y_train, eval_metric='auc')
y_pred = xgb_reg.predict(X_val)
y_score = xgb_reg.predict_proba(X_val)



In [6]:
print('测试集准确率:',accuracy_score(y_val, y_pred))
print('测试集精度:',precision_score(y_val, y_pred))
print('测试集召回率:',recall_score(y_val, y_pred))
print('auc:',roc_auc_score(y_val, y_score[:,1]))

测试集准确率: 0.852017937219731
测试集精度: 0.7945205479452054
测试集召回率: 0.7631578947368421
auc: 0.8885159326888651


In [7]:
log_loss(y_val, y_pred)

5.111173161405854

In [8]:
data_test=pd.read_csv(r'D:\书籍资料整理\kaggle\titanic\test.csv')
data_test=data_test[['PassengerId','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']] #去掉可能的无关值
data_test['Sex']=sex.transform(data_test['Sex'])
data_test['Embarked']=embarked.transform(data_test['Embarked'])
data_test.head()

#Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
X_test=data_test[[x for x in data_test.columns if x not in ['PassengerId']]]

In [9]:
y_test_pre=xgb_reg.predict(X_test)
data_test['Survived']=y_test_pre
data_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,892,3,1,34.5,0,0,7.8292,1,0
1,893,3,0,47.0,1,0,7.0,2,0
2,894,2,1,62.0,0,0,9.6875,1,0
3,895,3,1,27.0,0,0,8.6625,2,0
4,896,3,0,22.0,1,1,12.2875,2,0


In [None]:
data_test=data_test[['PassengerId','Survived']]
data_test.to_csv(r'D:\书籍资料整理\kaggle\titanic\output.csv',index=False)

array([0.12495992, 0.7304847 , 0.03251186, 0.04635083, 0.01429804,
       0.02679391, 0.02460068], dtype=float32)

In [11]:
for name, score in zip([x for x in X.columns], xgb_reg.feature_importances_):
    print(name, score)

Pclass 0.12495992
Sex 0.7304847
Age 0.032511856
SibSp 0.046350826
Parch 0.014298041
Fare 0.026793906
Embarked 0.024600685
