In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#Age、Embarked、Cabin都有为空值
data_train = pd.read_csv("./Kaggle_Titanic/train.csv")
#test.csv有一个Fare为空值
data_test = pd.read_csv("./Kaggle_Titanic/test.csv")

#合并两组数据以便同时进行特征工程
data_train['isTrain'] = 1
data_test['isTrain'] = 0
data = pd.concat([data_train, data_test], axis=0)

#将Cabin的内容转换为Cabin的有无
data.loc[(data.Cabin.notnull()), 'Cabin'] = "Yes"
data.loc[(data.Cabin.isnull()), 'Cabin'] = "No"

#填充唯一缺失的Fare值
data.loc[(data.Fare.isnull()), 'Fare'] = data.Fare.mean()

#随机森林预测，填充缺失值
age_df = data[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
known_age = age_df[age_df.Age.notnull()].as_matrix()
unknown_age = age_df[age_df.Age.isnull()].as_matrix()
y = known_age[:, 0]
X = known_age[:, 1:]
pipe = Pipeline([("rfr",RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1))])
pipe.fit(X, y)
predictedAges = pipe.predict(unknown_age[:, 1::])
data.loc[(data.Age.isnull()), 'Age'] = predictedAges

#类别特征的One-Hot编码
dummies_Cabin = pd.get_dummies(data['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data['Pclass'], prefix= 'Pclass')
data = pd.concat([data, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
data.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

#特征值放缩，若对整体进行StandardScaler，会对原先的布尔值产生影响，故不与后续步骤合并为一个Pipeline。
pipe = Pipeline([("scaler",StandardScaler(copy= False))])
pipe.fit_transform(data['Age'].values.reshape(-1, 1))
pipe.fit_transform(data['Fare'].values.reshape(-1, 1))

#选择用于训练与预测的数值列
data = data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|isTrain')

#训练集
train_data = data[data.isTrain == 1]
train_data = train_data.drop(['isTrain'], axis = 1)

#测试集
test_data = data[data.isTrain == 0]
test_data = test_data.drop(['isTrain', 'Survived'], axis = 1)

#训练集与验证集的切分，索引会被混排
X = train_data.drop(['Survived'], axis = 1)
y = train_data.Survived
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([("lr", LogisticRegression())])
param_grid = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C' : [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
}
grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(X_train, y_train)

#bagging模型融合：预测值不是二维的，这里的做法是按0.5以上或0.5以下进行划分
bagging = BaggingRegressor(grid_search, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True,\
                           bootstrap_features=False, n_jobs=-1)
bagging.fit(X, y)
y_pred = bagging.predict(X_test)
y_pred[y_pred >= 0.5] = 1.
y_pred[y_pred < 0.5] = 0.
print('Accuracy: %.5f' % accuracy_score(y_test, y_pred))

Accuracy: 0.82090


In [19]:
t_result = bagging.predict(test_data)
t_result[t_result >= 0.5] = 1.
t_result[t_result < 0.5] = 0.
test_data['Survived'] = t_result
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived':t_result.astype(np.int32)})
result.to_csv("./logistic_regression_predictions.csv", index=False)