In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_score, recall_score,accuracy_score,f1_score,roc_auc_score,mean_squared_error
from sklearn.metrics import log_loss
# Load the Kaggle Titanic training file and keep only the columns used in this
# notebook; every other column is dropped as possibly irrelevant.
train_csv_path = r'D:\书籍资料整理\kaggle\titanic\train.csv'
kept_columns = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age',
                'SibSp', 'Parch', 'Fare', 'Embarked']
data = pd.read_csv(train_csv_path)[kept_columns]

In [2]:
# One label encoder per categorical column. Both are kept as notebook-level
# names because the test-set cell reuses them to apply the same mapping.
sex = LabelEncoder()
embarked = LabelEncoder()

# Replace missing ages with 0 and swap the two string columns for their
# integer codes, all in one pass.
data = data.assign(
    Age=data['Age'].fillna(0),
    Sex=sex.fit_transform(data['Sex']),
    Embarked=embarked.fit_transform(data['Embarked']),
)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [3]:
from sklearn.feature_selection import SelectKBest,VarianceThreshold,chi2,f_classif,f_regression,mutual_info_classif
# 1. Variance-based feature selection

# Candidate feature matrix and target vector.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[feature_columns]
y = data['Survived']

# Fit with the default threshold only to inspect the per-feature values;
# the transformed matrix is not kept.
var_filter = VarianceThreshold()
var_filter.fit_transform(X)

# Observation: the categorical variables naturally have small variances, and
# for those low-variance features the value counts are not heavily skewed.
# NOTE(review): the Age (80.0) and Fare (512.33) entries look like value ranges
# rather than variances -- scikit-learn appears to report
# min(variance, peak-to-peak) when the threshold is 0; confirm against the
# VarianceThreshold documentation before interpreting them.
var_filter.variances_

array([6.98230591e-01, 2.28218083e-01, 8.00000000e+01, 1.21467827e+00,
       6.48999031e-01, 5.12329200e+02, 6.30094435e-01])

In [4]:
# Recursive feature elimination (RFE) with a fixed number of features -- no cross-validation in this cell
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier

# Recursive feature elimination: fit a gradient-boosting classifier and remove
# one feature per iteration (step=1) until 5 features remain.
# NOTE(review): the classifier is built without a random_state, so the
# selection may not be exactly repeatable between runs.
estimator = GradientBoostingClassifier()
selector = RFE(estimator, n_features_to_select=5, step=1).fit(X, y)

# Print the boolean mask of retained features, then the feature ranking
# (retained features show rank 1 in the output).
for fitted_attr in (selector.support_, selector.ranking_):
    print(fitted_attr)



[ True  True  True  True False  True False]
[1 1 1 1 2 1 3]


In [5]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR

# Same elimination procedure, but RFECV chooses how many features to keep
# via 5-fold cross-validation instead of a fixed target count.
# NOTE(review): no random_state is set on the classifier, so results may vary
# slightly between runs.
estimator = GradientBoostingClassifier()
selector = RFECV(estimator, step=1, cv=5).fit(X, y)

# Boolean mask of retained features.
print(selector.support_)
# Feature ranking (retained features show rank 1 in the output).
print(selector.ranking_)


[ True  True  True False False  True False]
[1 1 1 2 3 1 4]


In [6]:
# RFECV and RFE give broadly similar results: both flag Embarked and Parch as
# weak features, so both are left out of the model below.

# Re-select the working columns (drops anything possibly irrelevant).
data_1 = data[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age',
               'SibSp', 'Parch', 'Fare', 'Embarked']]

# Model features: everything except the target, the row id and the two
# features rejected by the selection step.
excluded_columns = ['Survived', 'PassengerId', 'Embarked', 'Parch']
X = data_1[[col for col in data_1.columns if col not in excluded_columns]]
y = data_1['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

xgb_reg = xgb.XGBClassifier()
xgb_param = xgb_reg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)

# 5-fold CV over 10 boosting rounds, tracking the classification error.
# Per-round progress is printed through xgb.cv's own verbose_eval/show_stdv
# options instead of the legacy xgb.callback.print_evaluation helper, which
# newer XGBoost releases no longer provide.
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=10, nfold=5,
                  metrics={'error'}, seed=0,
                  verbose_eval=True, show_stdv=False)

# Set the classifier's tree count to the best boosting round. Early stopping
# is not enabled, so cvresult always has num_boost_round rows and its length
# says nothing about which round was best; pick the round with the lowest
# mean CV error instead (+1 converts the 0-based round index to a count).
best_n_estimators = int(cvresult['test-error-mean'].values.argmin()) + 1
xgb_reg.set_params(n_estimators=best_n_estimators)

# No eval_set is supplied, so an eval_metric argument to fit() would have
# nothing to evaluate; it is omitted (and is deprecated in newer XGBoost).
xgb_reg.fit(X_train, y_train)

# Hard class predictions and class probabilities on the held-out split.
y_pred = xgb_reg.predict(X_val)
y_score = xgb_reg.predict_proba(X_val)

[0]	train-error:0.15495	test-error:0.19763
[1]	train-error:0.14146	test-error:0.20064
[2]	train-error:0.13324	test-error:0.19914
[3]	train-error:0.12762	test-error:0.19762
[4]	train-error:0.12874	test-error:0.18864
[5]	train-error:0.12201	test-error:0.19162
[6]	train-error:0.11302	test-error:0.18716
[7]	train-error:0.11228	test-error:0.18863
[8]	train-error:0.11041	test-error:0.19313
[9]	train-error:0.10741	test-error:0.18564




In [7]:
# Report accuracy, precision, recall and ROC AUC on the held-out split.
# AUC uses the second probability column of y_score (class 1).
val_metrics = [
    ('测试集准确率:', accuracy_score(y_val, y_pred)),
    ('测试集精度:', precision_score(y_val, y_pred)),
    ('测试集召回率:', recall_score(y_val, y_pred)),
    ('auc:', roc_auc_score(y_val, y_score[:, 1])),
]
for metric_label, metric_value in val_metrics:
    print(metric_label, metric_value)

测试集准确率: 0.8430493273542601
测试集精度: 0.7887323943661971
测试集召回率: 0.7368421052631579
auc: 0.8844432509846043


In [8]:
# Generate the submission: score test.csv with the fitted classifier.
data_test = pd.read_csv(r'D:\书籍资料整理\kaggle\titanic\test.csv')
data_test = data_test[['PassengerId', 'Pclass', 'Sex', 'Age',
                       'SibSp', 'Parch', 'Fare', 'Embarked']]  # drop possibly irrelevant columns

# Apply the same preprocessing as the training data: missing ages become 0
# and the categorical columns go through the encoders fitted on the training set.
data_test['Age'] = data_test['Age'].fillna(0)
data_test['Sex'] = sex.transform(data_test['Sex'])
data_test['Embarked'] = embarked.transform(data_test['Embarked'])

# Feed the model exactly the columns it was trained on, in the same order.
# Excluding only PassengerId and Embarked would leave Parch in the matrix,
# which the classifier never saw during training.
X_test = data_test[list(X_train.columns)]
y_test_pre = xgb_reg.predict(X_test)
data_test['Survived'] = y_test_pre

# The submission file holds only the passenger id and the predicted label.
data_test = data_test[['PassengerId', 'Survived']]
data_test.to_csv(r'D:\书籍资料整理\kaggle\titanic\output.csv', index=False)