In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Show which data files are present under "./input" by shelling out to `ls`
# (run the cell with the Run button or Shift+Enter).

from subprocess import check_output

input_listing = check_output(["ls", "./input"])  # raw bytes printed by `ls`
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

# 数据预览

In [None]:
# Load the training and test splits from the local input directory.
data_train = pd.read_csv("./input/train.csv")
data_test = pd.read_csv("./input/test.csv")

# Print the basic information of the training data first, then the test data,
# to get an overall picture of the data size, each feature's dtype and which
# features have missing values.
for split in (data_train, data_test):
    split.info()

# data_train.describe()  # summary statistics for the numeric columns
# 总生存率

# 数据分析

# 特征选择

In [None]:
"""
from sklearn import feature_selection

# 通过交叉验证，筛选前%的特征。chi2（卡方检验）/f_classif
percentiles = range(1, 100, 2)
for percent in percentiles:
    fs = feature_selection.SelectPercentitle(feature_selection.chi2, percentile=percent)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv = 5)
    results = np.append(results, scores.mean())
print(results)
# 找到最佳性能的特征筛选百分比
opt = np.where(results == results.max())[0]
print("Optimal number of features {}".format(percentiles[opt]))

import pylab as pl
pl.plot(percentiles, results)
pl.xlabel("percentiles of features")
pl.ylabel("accuracy")
pl.show()
"""

In [None]:
# Columns used as model inputs.
selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']

# Take explicit copies: later cells fill missing values in these frames, and
# working on an independent copy (rather than a selection of the raw frame)
# avoids SettingWithCopyWarning and keeps data_train / data_test untouched.
X_train = data_train[selected_features].copy()
X_test = data_test[selected_features].copy()

# Target labels, read from the training data only.
y_train = data_train['Survived']

# 填补缺失值

In [None]:
# Embarked: show how often each category occurs in both splits.
print(X_train['Embarked'].value_counts())
print(X_test['Embarked'].value_counts())

# For a categorical feature like this one, missing entries are filled with the
# most frequent value; 'S' is used as that value (compare the counts above).
# The filled frame is rebound to the same name instead of calling
# X['Embarked'].fillna(..., inplace=True): that chained in-place form acts on a
# temporary Series, emits warnings on pandas 2.x and does not update the frame
# when Copy-on-Write is enabled.
X_train = X_train.fillna({'Embarked': 'S'})
X_test = X_test.fillna({'Embarked': 'S'})

In [None]:
# Age and Fare: numeric features, so missing entries are filled with the
# column mean (the median would be the usual alternative).
# Each split is imputed with its own mean; Fare is only imputed for the test
# split.
# NOTE(review): consider imputing the test split with training-set statistics.
# The filled frames are rebound rather than mutated through a chained
# X['col'].fillna(..., inplace=True), which does not reliably write back to
# the frame (warnings on pandas 2.x, no effect under Copy-on-Write).
X_train = X_train.fillna({'Age': X_train['Age'].mean()})
X_test = X_test.fillna({
    'Age': X_test['Age'].mean(),
    'Fare': X_test['Fare'].mean(),
})

In [None]:
# Re-check both frames after imputation.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
X_train.info()
X_test.info()

# 特征向量化

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Convert every row to a {column: value} dict and vectorize it. sparse=False
# makes the vectorizer return a dense array.
# orient="records" is the full option name; the abbreviation "record" was
# deprecated and is rejected by pandas 2.0 and later.
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient="records"))
print(dict_vec.feature_names_)

# Apply the vectorizer fitted on the training rows to the test rows, so both
# matrices share the same feature columns.
X_test = dict_vec.transform(X_test.to_dict(orient="records"))

## 随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier
# cross_val_score is provided by sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score

# Random forest initialised with the default configuration.
rfc = RandomForestClassifier()

# Evaluate with 5-fold cross-validation and keep the mean of the fold scores.
rfc_score = cross_val_score(rfc, X_train, y_train, cv=5).mean()
print(rfc_score)

# Fit on the full training set, then predict the test set.
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)

# Save the submission; index=False keeps the DataFrame's row index out of the CSV.
rfc_submission = pd.DataFrame({'PassengerId': data_test['PassengerId'], 'Survived': rfc_y_predict})
rfc_submission.to_csv("./output/rfc_submission.csv", index=False)

# xgboost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted classifier initialised with the default configuration.
xgbc = XGBClassifier()

# Evaluate with 5-fold cross-validation and keep the mean of the fold scores
# (cross_val_score is imported in the random-forest cell).
xgbc_fold_scores = cross_val_score(xgbc, X_train, y_train, cv=5)
xgbc_score = xgbc_fold_scores.mean()
print(xgbc_score)

# Fit on the full training set, then predict the test set.
xgbc.fit(X_train, y_train)
xgbc_y_predict = xgbc.predict(X_test)

# Assemble the submission table and write it without the row index.
xgbc_submission = pd.DataFrame({
    'PassengerId': data_test['PassengerId'],
    'Survived': xgbc_y_predict,
})
xgbc_submission.to_csv("./output/xgbc_submission.csv", index=False)

# 使用并行网格搜索寻找超参数组合

In [None]:
# GridSearchCV is provided by sklearn.model_selection; the former
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid. range() is lazy in Python 3, so each range is turned
# into an explicit list of candidate values.
params = {'max_depth': list(range(2, 7)),
          'n_estimators': list(range(100, 1100, 200)),
          'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]}
xgbc_best = XGBClassifier()

# n_jobs=-1 runs the search on all available CPU cores; cv=5 scores every
# parameter combination with 5-fold cross-validation.
gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)
gs.fit(X_train, y_train)

# Best mean cross-validation score and the parameter combination behind it.
print(gs.best_score_)
print(gs.best_params_)

# Predict the test set through the fitted search object.
xgbc_best_y_predict = gs.predict(X_test)

# Save the submission; index=False keeps the DataFrame's row index out of the CSV.
xgbc_best_submission = pd.DataFrame({'PassengerId': data_test['PassengerId'], 'Survived': xgbc_best_y_predict})
xgbc_best_submission.to_csv("./output/xgbc_best_submission.csv", index=False)