In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 预处理数据

In [None]:
# Load the data
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Stack both splits (train rows first) so the preprocessing below is applied to them together.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
features = pd.concat([train_data, test_data], ignore_index=True)

# NOTE(review): test.csv is expected to carry no 'Survived' column (Kaggle Titanic layout),
# in which case the label is NaN for every row that came from test_data — confirm.
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)

# Preview the first 5 rows
X_train.head()

In [None]:
# Report the dimensions of the feature frame and of the label series.
shape_template = "X_train.shape={}, y_train.shape={}"
print(shape_template.format(X_train.shape, y_train.shape))

In [None]:
# Print a summary of X_train (column names, non-null counts and dtypes).
X_train.info()

In [None]:
# Columns whose missing values still have to be handled:
#   X_train.Age, X_train.Cabin, X_train.Embarked, X_train.Fare
#
# Total number of missing values per column
X_train.isna().sum()

In [None]:
import seaborn as sns

# First look at how the known (non-missing) Age values are distributed
known_ages = X_train['Age'].dropna()
sns.histplot(known_ages, kde=True)

In [None]:
# Fill the missing Age values with the median of the known ages
# (Series.median() skips NaN, matching np.nanmedian).
# The result is assigned back to the column: calling `.replace(..., inplace=True)`
# on the `X_train['Age']` selection is chained assignment, which no longer updates
# the parent frame under pandas Copy-on-Write.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())

sns.histplot(X_train['Age'], kde=True)

In [None]:
# Cabin has too many missing values; the author's judgement is that removing it
# from the DataFrame will not hurt the prediction.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell re-runnable:
# a second execution is a no-op rather than a KeyError.
X_train = X_train.drop(columns="Cabin", errors="ignore")

In [None]:
# Which ports did the passengers board at?
#   S = Southampton, England
#   C = Cherbourg-Octeville, France
#   Q = Queenstown, Ireland
X_train['Embarked'].value_counts()

In [None]:
# Bar chart of the number of passengers per port of embarkation
sns.countplot(data=X_train, x='Embarked')

In [None]:
# Fill the missing ports of embarkation with 'S'.
# Assign the result back to the column: an in-place replace on the
# `X_train['Embarked']` selection is chained assignment and does not reach the
# parent frame under pandas Copy-on-Write.
X_train['Embarked'] = X_train['Embarked'].fillna('S')

In [None]:
# Fare has a missing entry: locate the affected row(s) so a median can be filled in afterwards
X_train[X_train["Fare"].isna()]

In [None]:
# Fares of the 3rd-class passengers who boarded at Southampton ('S')
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']

# Series.median() ignores NaN, so the missing fare simply does not take part.
# (Replacing NaN with 0 first would add an artificial zero fare to the sample
# and could pull the median down.)
median_fare = pclass3_fares.median()

# Write the median into the record of passenger 1044
# NOTE(review): presumably the passenger whose Fare is missing — confirm against the lookup cell.
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = median_fare
# Show that passenger
X_train.loc[X_train['PassengerId'] == 1044]

In [None]:
# Encode Sex numerically: male -> 1, female -> 0.
# The result is assigned back to the column (an in-place replace on the
# `X_train['Sex']` selection is chained assignment and is lost under pandas
# Copy-on-Write). The explicit astype(int) pins an integer dtype instead of relying
# on replace() downcasting, and fails loudly if an unexpected label is left over.
# replace() leaves already-encoded 0/1 values untouched, so the cell can be re-run.
X_train['Sex'] = X_train['Sex'].replace({'male': 1, 'female': 0}).astype(int)

In [None]:
# Re-count the missing values per column after the fills above
X_train.isna().sum()

# 数据清洗与分割

In [None]:
# Report the dimensions of the feature frame and of the label series.
shape_template = "X_train.shape={}, y_train.shape={}"
print(shape_template.format(X_train.shape, y_train.shape))

In [None]:
# One-hot encode the remaining string-valued column(s).
# Name and Ticket (free-text columns in the Kaggle Titanic data) are dropped first:
# get_dummies would otherwise create one indicator column per distinct name/ticket,
# i.e. thousands of near-unique features that only let a model memorise training rows.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
# dtype=int keeps the indicators as 0/1 integers; the bool default of pandas >= 2
# would turn `.values` of this mixed frame into an object array.
X_train = pd.get_dummies(X_train.drop(columns=['Name', 'Ticket'], errors='ignore'), dtype=int)

# Preview the first 5 rows after one-hot encoding
X_train.head()

In [None]:
# Report the dimensions of the feature frame and of the label series.
shape_template = "X_train.shape={}, y_train.shape={}"
print(shape_template.format(X_train.shape, y_train.shape))

In [None]:
# Undo the earlier stacking: the rows of train_data come first, the rows of test_data follow.
# The boundary is taken from train_data itself instead of the hard-coded 891.
n_train = len(train_data)
train_X = X_train.iloc[:n_train, :]
test_X = X_train.iloc[n_train:, :]
train_y = y_train.iloc[:n_train]
test_y = y_train.iloc[n_train:]

In [None]:
# Report the dimensions of both splits.
train_shapes = "train_X.shape={}, train_y.shape={}"
test_shapes = "test_X.shape={}, test_y.shape={}"
print(train_shapes.format(train_X.shape, train_y.shape))
print(test_shapes.format(test_X.shape, test_y.shape))

# 基于决策树模型预测

In [None]:
# 使用决策树预测模型
from sklearn.tree import DecisionTreeClassifier

# Build the decision-tree model
def createDecisionTreeClassifier(random_state=42):
    """Fit a decision tree on the training split and predict the test split.

    Reads the notebook-level ``train_X``, ``train_y`` and ``test_X``.

    Args:
        random_state: Seed passed to ``DecisionTreeClassifier`` so that repeated
            runs build the same tree. Pass ``None`` for the previous unseeded behaviour.

    Returns:
        The predicted labels for ``test_X``, cast to ``int``.
    """
    model = DecisionTreeClassifier(random_state=random_state)

    # Train the model
    model.fit(train_X, train_y)

    # Predict
    test_pred = model.predict(test_X)
    return test_pred.astype(int)

# 基于逻辑回归模型预测

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression prediction model
def createLogisticRegressionModel():
    """Fit a logistic regression on the training split and predict the test split.

    Reads the notebook-level ``train_X``, ``train_y`` and ``test_X``.

    Returns:
        The predicted labels for ``test_X``, cast to ``int``.
    """
    # LogisticRegression raised
    #   AttributeError: ‘str’ object has no attribute ‘decode’
    # here; selecting the liblinear solver is the workaround.
    classifier = LogisticRegression(solver='liblinear').fit(train_X, train_y)

    # Predict
    predicted = classifier.predict(test_X)
    return predicted.astype(int)

# 基于梯度提升分类器模型预测

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

def createGradientBoostingClassifierModel(random_state=42):
    """Fit a 500-stage gradient-boosting classifier and predict the test split.

    Reads the notebook-level ``train_X``, ``train_y`` and ``test_X``.

    Args:
        random_state: Seed passed to ``GradientBoostingClassifier`` so that the
            submission is reproducible. Pass ``None`` for the previous unseeded behaviour.

    Returns:
        The predicted labels for ``test_X``, cast to ``int``.
    """
    model = GradientBoostingClassifier(n_estimators=500, random_state=random_state)
    model.fit(train_X, train_y)

    # Predict
    test_pred = model.predict(test_X)
    return test_pred.astype(int)

# 基于神经网络模型预测

In [None]:
from sklearn.neural_network import MLPClassifier

# Build the multi-layer-perceptron prediction model
def createMLPClassifierModel(random_state=42):
    """Fit a one-hidden-layer MLP (128 units) and predict the test split.

    Reads the notebook-level ``train_X``, ``train_y`` and ``test_X``.

    Args:
        random_state: Seed passed to ``MLPClassifier`` so that repeated runs
            produce the same model. Pass ``None`` for the previous unseeded behaviour.

    Returns:
        The predicted labels for ``test_X``, cast to ``int``.
    """
    model = MLPClassifier(
        hidden_layer_sizes=(128,),
        batch_size=64,
        max_iter=1000,
        solver="adam",
        random_state=random_state,
    )
    model.fit(train_X, train_y)

    # Predict
    test_pred = model.predict(test_X)
    return test_pred.astype(int)

# 基于Keras的神经网络模型预测

In [None]:
# Keras的神经网络模型来预测
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import utils as np_utils

def createKerasModel(X, y):
    """Build and train a small feed-forward classifier with two output units.

    Architecture: Dense(128) -> ReLU -> Dropout(0.2) -> Dense(32) -> ReLU
    -> Dense(2) -> softmax.

    Args:
        X: pandas DataFrame of features; its ``.values`` are cast to float32.
        y: Binary labels (0/1), one per row of ``X``.

    Returns:
        The fitted ``Sequential`` model. ``predict`` returns two scores per row
        (one per class), so callers take ``argmax`` over the last axis to get the label.
    """
    # Create the model
    model = Sequential()

    # The first layer's kernel is initialised from a truncated normal distribution
    initializers = keras.initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)

    # The input dimension is X.shape[1]
    model.add(Dense(input_dim=X.shape[1], units=128, kernel_initializer=initializers, bias_initializer='zeros'))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))

    model.add(Dense(32))
    model.add(Activation("relu"))

    model.add(Dense(2))

    # The labels are one-hot encoded over two mutually exclusive classes, so the matching
    # pair is softmax + categorical cross-entropy. The previous sigmoid + binary
    # cross-entropy treated the two units as independent labels and made the reported
    # 'accuracy' an element-wise binary accuracy.
    model.add(Activation("softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # One-hot encode the training labels; num_classes is fixed so the target always has
    # two columns, matching Dense(2).
    y_train_categorical = np_utils.to_categorical(y, num_classes=2)

    # Cast to float32 so a frame with mixed column dtypes (int/float/bool) reaches
    # Keras as a plain numeric array instead of an object array.
    feature_matrix = X.values.astype("float32")

    # Train for 150 epochs; verbose=1 logs the progress of every epoch
    model.fit(feature_matrix, y_train_categorical, epochs=150, verbose=1)

    return model
   
# Train the Keras network on the training split; training runs whenever this cell executes.
keras_model = createKerasModel(train_X, train_y)


In [None]:
# Predicted class = index of the larger of the two output scores.
# The model was fitted on `X.values`, so predict on the array form as well; the float32
# cast keeps a frame with mixed column dtypes from reaching Keras as an object array.
predictions_classes = np.argmax(keras_model.predict(test_X.values.astype("float32")), axis=-1)

In [None]:
def output(predictions, filename='my_submission.csv'):
    """Write a submission CSV with the columns PassengerId and Survived.

    Args:
        predictions: One predicted label per row of the notebook-level ``test_X``,
            in the same row order.
        filename: Destination path of the CSV; defaults to the name used so far.
    """
    # Plain arrays on both sides: values are paired by position, so a pandas Series
    # passed as `predictions` cannot be silently misaligned by its index.
    submission = pd.DataFrame({
        'PassengerId': test_X.PassengerId.to_numpy(),
        'Survived': np.asarray(predictions).astype(int),
    })
    submission.to_csv(filename, index=False)
    print("Your submission was successfully saved!")

# 预测乘客生还情况

In [None]:
# Choose the model that produces the submission by leaving exactly one output(...) call
# uncommented. Every call writes to the same file, my_submission.csv, so a later call
# overwrites the result of an earlier one.

# Decision Tree for testing data
# output(createDecisionTreeClassifier())

# Logistic Regression for testing data
# output(createLogisticRegressionModel())

# Gradient Boosting for testing data (currently selected)
output(createGradientBoostingClassifierModel())

# Neural Network classifier for testing data
# output(createMLPClassifierModel())

# Keras Model for testing data (uses the predictions_classes computed in an earlier cell)
# output(predictions_classes)