- 前処理

# Library

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

import warnings
warnings.filterwarnings('ignore')

In [3]:
# sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn import metrics

# Load data

In [25]:
base_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
base_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# Config

In [26]:
VERSION = 'v1'
USAGE = 'tree'
OUT_DIR = '../intermediate/'

# Preprocessing

In [27]:
base_df2 = base_df.drop('Survived', axis=1)
base_df2['flg'] = 'train'

test_df2 = test_df.copy()
test_df2['flg'] = 'test'

df = pd.concat([base_df2, test_df2])

- 'Pclass':
 - 何もしない
- 'Name':
 - 名前に入っている身分を特徴量として利用
- 'Sex':
 - 何もしない
- 'Age':
 - 欠損値は-1に変換
 - 10刻みで階級化
- 'SibSp':
 - 何もしない
- 'Parch':
 - 何もしない
- 'Ticket':
 - チケットの文字数を特徴量として利用
- 'Fare':
 - 何もしない→対数化
- 'Cabin':
 - 船室名の長さを特徴量として利用
 - 船室名の頭文字を特徴量として利用
- 'Embarked':
 - 何もしない
- 以下を追加
 - is_alone
  - SipSpとParchから、一人かどうかを判定

In [28]:
def extract_title_from_name(x):
    sep_name = x.split('.')
    return sep_name[0].split(' ')[-1]

def without_sep_length(x):
    if x == None:
        return 0
    else:
        without_sep_x = str(x).replace(' ', '').replace('.', '').replace(',', '')
        return len(without_sep_x)
    
def initial_x(x):
    return str(x)[0]

In [43]:
df['name_title'] = df['Name'].map(extract_title_from_name)
df['ticket_length'] = df['Ticket'].map(without_sep_length)
df['cabin_length'] = df['Cabin'].map(without_sep_length)
df['cabin_initial'] = df['Cabin'].map(initial_x)
df['fixed_age'] = df['Age'].fillna(-1)
df['fixed_fare'] = df['Fare'].map(lambda x: np.log(x) if x > 0 else 0)

In [44]:
age_bins = list(range(0, 111, 10))
age_bins.append(-10)
age_bins.sort()

df['age_rank'] = df['Age'].fillna(-1)
df['age_rank'] = pd.cut(df['age_rank'], bins=age_bins)
df['age_rank'] = df['age_rank'].astype('str')

In [45]:
df['is_alone'] = df.apply(lambda x: 1 if x['SibSp'] + x['Parch'] == 0 else 0, axis=1)

In [46]:
df.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,name_title,ticket_length,cabin_length,cabin_initial,fixed_age,fixed_fare,age_rank,is_alone
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,Mr,8,3,n,22.0,1.981001,"(20, 30]",0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,Mrs,7,3,C,38.0,4.266662,"(30, 40]",0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,Miss,14,3,n,26.0,2.070022,"(20, 30]",1


# Output

In [47]:
train_df = df.query('flg == "train"').drop('flg', axis=1).merge(base_df[['PassengerId', 'Survived']], on='PassengerId', how='left')
train_df.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,name_title,ticket_length,cabin_length,cabin_initial,fixed_age,fixed_fare,age_rank,is_alone,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,8,3,n,22.0,1.981001,"(20, 30]",0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,7,3,C,38.0,4.266662,"(30, 40]",0,1


In [48]:
test_df = df.query('flg == "train"').drop('flg', axis=1)

In [49]:
train_df.to_csv(OUT_DIR + 'train_for_' + USAGE + '_' + VERSION + '.csv')
test_df.to_csv(OUT_DIR + 'test_for_' + USAGE + '_' + VERSION + '.csv')