In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the read-only Kaggle input directory, then echo them.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)
# Report the pandas version this run uses.
print(pd.__version__)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import sklearn
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
# Record the scikit-learn and matplotlib versions used for this run, one per line.
for library in (sklearn, matplotlib):
    print(library.__version__)

In [None]:
# Locations of the competition CSV files.
train_file, test_file = (
    '/kaggle/input/spaceship-titanic/train.csv',
    '/kaggle/input/spaceship-titanic/test.csv',
)
# Load both splits into DataFrames.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# Preview the first rows of the training split.
print(train_data.head())

# 数据分析
需要分析数据的分布和与结果的相关性，以及对空值的处理

## 数据信息展示

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train_data.info()

## 数据空值分析

In [None]:
# Count missing values per column once, then keep only the columns that have any.
all_null_counts = train_data.isnull().sum()
null_count_cols = all_null_counts[all_null_counts > 0]
null_cols = null_count_cols.index.tolist()

print("missing values in train data:")
print(null_cols)
# Bar chart: one bar per column, height = number of missing values.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(null_cols, null_count_cols)
plt.show()

## 数值数据分析

### 数值分布分析

In [None]:
# 选取数值类型数据
num_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("numeric columns:", num_cols)

# 画出数值数据的分布图
for col in num_cols:
    plt.figure(figsize=(15,7))
    plt.title(col)
    plt.hist(train_data[col], bins=100)
    plt.show()
    

从上述结果来看，'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'均有大量0值，因此对于空值替换，应该用0值替换，而Age分布较为均匀，因此替换时可以使用平均值替换

### 相关性分析

In [None]:
# Spearman rank correlation of every numeric feature with the target.
# Restrict to numeric/bool columns first: with pandas >= 2.0 DataFrame.corr no
# longer drops string columns silently and raises when it meets one.
numeric_train = train_data.select_dtypes(include=['number', 'bool'])
corr_mat = numeric_train.corr(method='spearman')
# Keep only the target's column, remove its self-correlation, strongest positive first.
corr = corr_mat['Transported'].drop('Transported').sort_values(ascending=False)
print(corr)

可以看出数值数据相关性都不强

# 数据处理

首先PassengerId 中可以得到有效信息：group，我们可以认为，同一个group的乘客，HomePlanet 、 Destination 和 Cabin相同。这对处理空数据有很大帮助。

In [None]:
def fill_null_according_PassengerId(data:pd.DataFrame):
    """Fill missing HomePlanet / Destination / Cabin values using travel groups.

    A ``Group`` column is added to ``data``, taken from the part of
    ``PassengerId`` before the first underscore. For each of the three columns,
    a missing value is replaced by the most frequent known value within the
    same group; when the group has no known value, the most frequent value of
    the whole column is used instead.

    Only missing cells are written. Values that are already present are never
    overwritten, even when they differ from the rest of their group.

    Args:
        data: Frame containing ``PassengerId``, ``HomePlanet``, ``Destination``
            and ``Cabin`` columns. Modified in place.

    Returns:
        None.
    """
    data['Group'] = data['PassengerId'].str.split('_').str[0]
    fill_cols = ['HomePlanet', 'Destination', 'Cabin']
    for col in fill_cols:
        # Column-wide fallback, computed before any filling so that imputed
        # values do not influence it.
        overall_mode = data[col].mode()
        if overall_mode.empty:
            # The column has no known value at all; there is nothing to fill with.
            continue
        # Most frequent known value per group. Groups without any known value
        # are simply absent from this Series and map to NaN below.
        known = data.loc[data[col].notna(), ['Group', col]]
        group_mode = known.groupby('Group')[col].agg(
            lambda values: values.value_counts().idxmax()
        )
        filled = data[col].fillna(data['Group'].map(group_mode))
        data[col] = filled.fillna(overall_mode[0])

'HomePlanet', 'Destination', 'Cabin' 的空值都被填充

其次 'Cabin' 也具有层次信息，deck和side对预测有帮助，可以新建列将数据分割

In [None]:
def split_cabin_info(data:pd.DataFrame):
    """Add ``Deck`` and ``Side`` columns taken from the ``Cabin`` string.

    ``Cabin`` is split on '/'; the first piece becomes ``Deck`` and the third
    piece becomes ``Side``. A missing ``Cabin`` yields missing ``Deck`` and
    ``Side`` instead of raising, so the function can also be used before
    imputation.

    Args:
        data: Frame with a string ``Cabin`` column. Modified in place.

    Returns:
        None.
    """
    # Split once and reuse the pieces for both derived columns.
    cabin_parts = data['Cabin'].str.split('/')
    data['Deck'] = cabin_parts.str[0]
    data['Side'] = cabin_parts.str[2]

之后填充其他数据的空值
* 'Age' 数据用平均值填充
* 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' 使用0填充
* 'VIP' 中False压倒性的多，因此用False填充
* 'CryoSleep' 也是用占大多数的False填充

In [None]:
def fill_other_null(data:pd.DataFrame):
    """Impute the remaining missing values in place.

    * ``Age`` is filled with the column mean.
    * ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` and ``VRDeck``
      are filled with 0.
    * ``CryoSleep`` and ``VIP`` are filled with False and stored with a real
      ``bool`` dtype.

    Args:
        data: Frame containing the columns listed above. Modified in place.

    Returns:
        None.
    """
    fill_mean_cols = ['Age']
    for col in fill_mean_cols:
        data[col] = data[col].fillna(data[col].mean())
    fill_zero_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in fill_zero_cols:
        data[col] = data[col].fillna(0)
    fill_false_cols = ['CryoSleep', 'VIP']
    for col in fill_false_cols:
        # Go through the nullable boolean dtype so the result is always plain
        # bool. A bare fillna(False) on an object column depends on pandas'
        # implicit downcasting, which is deprecated and version dependent.
        data[col] = data[col].astype('boolean').fillna(False).astype(bool)

'Name' 数据和 'PassengerId' 、'Group'在预测中没有用，因此直接删除该列，同时Cabin的有效信息也被分离，同样可以删除

In [None]:
def del_cols(data:pd.DataFrame):
    """Remove the PassengerId, Name, Group and Cabin columns from ``data`` in place.

    Raises:
        KeyError: if any of the four columns is absent from ``data``.
    """
    columns_to_remove = ['PassengerId', 'Name', 'Group', 'Cabin']
    data.drop(columns=columns_to_remove, inplace=True)

除了上述处理外，string 和 bool 类型的数据还要转化为数字类型

In [None]:
def preprocess(data:pd.DataFrame, encoders:dict=None):
    """Return a cleaned, fully numeric copy of ``data``.

    The input frame is not modified. The copy goes through group-based
    imputation, Cabin splitting, imputation of the remaining columns and
    removal of the unused columns. Every boolean or non-numeric column is then
    label-encoded to integers.

    Args:
        data: Raw frame as read from the competition CSV.
        encoders: Optional dict mapping column name -> fitted ``LabelEncoder``.
            When a column already has an entry, that encoder is reused, so the
            same category receives the same integer in every frame processed
            with the same dict. Columns without an entry get a newly fitted
            encoder, which is stored in the dict. With the default ``None``
            each call fits its own throw-away encoders.

    Returns:
        The processed copy.

    Raises:
        ValueError: from ``LabelEncoder.transform`` when a reused encoder meets
            a label it was not fitted on.
    """
    new_data = data.copy()
    fill_null_according_PassengerId(new_data)
    split_cabin_info(new_data)
    fill_other_null(new_data)
    del_cols(new_data)
    for col in new_data.columns:
        column = new_data[col]
        # Encode booleans and anything that is not numeric (object or string dtypes).
        needs_encoding = (
            pd.api.types.is_bool_dtype(column)
            or not pd.api.types.is_numeric_dtype(column)
        )
        if not needs_encoding:
            continue
        if encoders is not None and col in encoders:
            encoder = encoders[col]
        else:
            encoder = LabelEncoder()
            encoder.fit(column)
            if encoders is not None:
                encoders[col] = encoder
        new_data[col] = encoder.transform(column)
    return new_data

# Share one set of encoders between the two splits so that a category is mapped
# to the same integer in train and test.
label_encoders = {}
processed_train = preprocess(train_data, label_encoders)
print(processed_train.isnull().sum())
processed_test = preprocess(test_data, label_encoders)

In [None]:
# Spearman rank correlation between every pair of processed columns.
corr_mat = processed_train.corr(method='spearman')
# Correlation of each feature with the target, strongest positive first.
target_corr = corr_mat.loc[:, 'Transported']
corr = target_corr.drop(index='Transported').sort_values(ascending=False)
print(corr)

# 学习

## 基础决策树

In [None]:
# Baseline: a depth-5 decision tree. The fixed seed makes the fitted tree
# identical from one run to the next.
clf = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
# Preview the processed frame instead of printing all of it.
print(processed_train.head())
# NOTE: `train_data` is rebound here from the raw CSV frame to the feature
# matrix (every processed column except the target); the later model cells
# read the features under this name.
train_data = processed_train.drop('Transported', axis=1)
clf.fit(train_data, processed_train['Transported'])
import graphviz
# Export the fitted tree with real column names so the rendered nodes are readable.
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=list(train_data.columns))
graph = graphviz.Source(dot_data)
graph.render("spaceship")

In [None]:
# Predict the test set with the decision tree and write the submission file.
result = clf.predict(processed_test)
id_col = test_data['PassengerId']
# Build the two submission columns directly. The 0/1 predictions are cast to
# bool, which to_csv writes as True/False, so no frame-wide replace() that
# would also scan the id column is needed.
result_df = pd.DataFrame({
    'PassengerId': id_col.values,
    'Transported': result.astype(bool),
})
result_df.to_csv('/kaggle/working/dicision_tree_result.csv', index=False)

## SVM

In [None]:
# RBF-kernel support vector classifier trained on the processed features.
SVC_clf = SVC(kernel='rbf', C=1, gamma=0.1)
SVC_clf.fit(train_data, processed_train['Transported'])
result = SVC_clf.predict(processed_test)
id_col = test_data['PassengerId']
# Build the two submission columns directly. The 0/1 predictions are cast to
# bool, which to_csv writes as True/False, so no frame-wide replace() that
# would also scan the id column is needed.
result_df = pd.DataFrame({
    'PassengerId': id_col.values,
    'Transported': result.astype(bool),
})
result_df.to_csv('/kaggle/working/SVM_result.csv', index=False)

## Adaboost

In [None]:
# AdaBoost with 100 boosting rounds; the fixed seed keeps reruns reproducible.
Ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
Ada_clf.fit(train_data, processed_train['Transported'])
result = Ada_clf.predict(processed_test)
id_col = test_data['PassengerId']
# Build the two submission columns directly. The 0/1 predictions are cast to
# bool, which to_csv writes as True/False, so no frame-wide replace() that
# would also scan the id column is needed.
result_df = pd.DataFrame({
    'PassengerId': id_col.values,
    'Transported': result.astype(bool),
})
result_df.to_csv('/kaggle/working/Ada_result.csv', index=False)

## 神经网络

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class NN(torch.nn.Module):
    """Three-layer fully connected classifier (in_features -> 20 -> 40 -> num_classes).

    ReLU is applied after the first two linear layers; the last layer returns
    raw, unnormalised class scores.

    Args:
        in_features: Number of input features per sample. Defaults to 12, the
            width this notebook originally hard-coded.
        num_classes: Number of output scores per sample. Defaults to 2.
    """
    def __init__(self, in_features=12, num_classes=2):
        super(NN, self).__init__()
        self.layer1 = nn.Linear(in_features, 20)
        self.layer2 = nn.Linear(20, 40)
        self.layer3 = nn.Linear(40, num_classes)

    def forward(self, input_tensor):
        """Return the class scores for ``input_tensor`` (last dimension = in_features)."""
        out = F.relu(self.layer1(input_tensor))
        out = F.relu(self.layer2(out))
        return self.layer3(out)

In [None]:
class MyDataSet(Dataset):
    """Torch dataset over a row range of a processed frame.

    Features are every column except ``Transported``; the label is the
    ``Transported`` column. Both are stored as float tensors.

    By default the first 7000 rows are used. The previous default was the rows
    from position 7000 onward, which are exactly the rows the notebook's
    evaluation cell scores, so the model was being evaluated on its own
    training data while most of the frame went unused.

    Args:
        data: Processed frame containing a ``Transported`` column.
        rows: Positional row slice to expose. Defaults to ``slice(None, 7000)``.
    """
    def __init__(self, data:pd.DataFrame, rows:slice=slice(None, 7000)):
        self.data = data
        features = self.data.drop('Transported', axis=1)
        self.train_data = torch.from_numpy(features.iloc[rows].values).float()
        label = self.data['Transported']
        self.label = torch.from_numpy(label.iloc[rows].values).float()

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index``."""
        return self.train_data[index], self.label[index]

    def __len__(self):
        """Return the number of rows exposed by this dataset."""
        return len(self.train_data)

In [None]:
def train(data:pd.DataFrame, epochs):
    """Train a fresh ``NN`` on ``data`` and return it.

    ``data`` is wrapped in ``MyDataSet`` and served in shuffled mini-batches of
    32. The model is optimised with Adam (lr=0.01) against cross-entropy loss
    for ``epochs`` passes; the loss is printed every 100th batch of each epoch.

    Args:
        data: Processed frame accepted by ``MyDataSet``.
        epochs: Number of passes over the dataset.

    Returns:
        The trained model, placed on ``device``.
    """
    loader = DataLoader(MyDataSet(data), batch_size=32, shuffle=True)
    model = NN().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(epochs):
        for step, (features, target) in enumerate(loader):
            features = features.to(device)
            # CrossEntropyLoss expects integer class indices, not floats.
            target = target.long().to(device)
            optimizer.zero_grad()
            loss = criterion(model(features), target)
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print('epoch:', epoch, 'loss:', loss.item())
    return model

In [None]:
# Train the network for ten passes over the training rows.
model = train(processed_train, epochs=10)

In [None]:
# Accuracy of the network on the rows from position 7000 onward.
eval_data = train_data[7000:]
eval_label = processed_train['Transported'][7000:]
length = eval_data.shape[0]

# Score all rows in one batched forward pass. Gradients are not needed for
# evaluation, so autograd is switched off instead of building a graph per row.
model.eval()
with torch.no_grad():
    eval_features = torch.from_numpy(eval_data.values).float().to(device)
    predicted = torch.argmax(model(eval_features), dim=1).cpu().numpy()

# Count the rows whose predicted class matches the label.
right = int((predicted == eval_label.values).sum())
print('acc:', right/length)

In [None]:
# Score the whole test set in one forward pass; inference needs no gradients.
data = torch.from_numpy(processed_test.values).float().to(device)
model.eval()
with torch.no_grad():
    predict = model(data)
# Highest-scoring class per row, moved back to the CPU as a NumPy array.
predict = torch.argmax(predict, 1)
predict = predict.to('cpu')
predict = predict.numpy()

# Re-read the raw test file to obtain the passenger ids for the submission.
test_data = pd.read_csv(test_file)
id_col = test_data['PassengerId']
# Build the two submission columns directly. The 0/1 predictions are cast to
# bool, which to_csv writes as True/False, so no frame-wide replace() that
# would also scan the id column is needed.
result_df = pd.DataFrame({
    'PassengerId': id_col.values,
    'Transported': predict.astype(bool),
})
result_df.to_csv('/kaggle/working/nn_result.csv', index=False)