# Titanic: Machine Learning from Disaster
Titanic のデータでデータ分析に慣れます

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## データの読み込み

In [2]:
# Read the Titanic training and test CSVs from the local ./input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Summary statistics for the numeric columns of the training data.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Summary statistics for the numeric columns of the test data.
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


## データを眺める

In [7]:
# Stack train and test so that cleaning and feature engineering are applied to
# both splits identically.
# ignore_index=True gives every row a unique 0..N-1 label; without it the test
# rows reuse the labels that train already has, so label-based access such as
# df.loc[0] would match two rows.
# sort=True orders the columns alphabetically; test has no 'Survived' column,
# so those rows hold NaN there.
df = pd.concat([train, test], sort=True, ignore_index=True)
df.shape

(1309, 12)

In [8]:
# Check the dtype of each column in the combined frame.
df.dtypes

Age            float64
Cabin           object
Embarked        object
Fare           float64
Name            object
Parch            int64
PassengerId      int64
Pclass           int64
Sex             object
SibSp            int64
Survived       float64
Ticket          object
dtype: object

### 各 column の説明

| column | 説明 |
| --- | --- |
| Age | 年齢 |
| Cabin | 客室番号 |
| Embarked | 乗船した港 |
| Fare | 運賃 |
| Name | 名前 |
| Parch | 同乗した親/子供の数 |
| PassengerId | 乗客番号 |
| Pclass | 座席のクラス(1st class ~ 3rd class) |
| Sex | 性別 |
| SibSp | 同乗した兄弟/配偶者の数 |
| Survived | 生存者か否か(0, 1) |
| Ticket | チケット番号 |

ぱっと見不要そうな特徴
 - Cabin
 - PassengerId
 - Ticket


In [9]:
# Count missing values per column.
df.isna().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [10]:
# Fill missing values.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on df[col]: the in-place form mutates an intermediate object, which raises a
# FutureWarning in pandas 2.x and leaves df unchanged once Copy-on-Write is on.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
# Drop columns (not rows): Cabin and Ticket are not used from here on.
df.drop(columns=['Cabin', 'Ticket'], inplace=True)

In [11]:
# Frequency of each embarkation port after imputation.
df['Embarked'].value_counts()

S    916
C    270
Q    123
Name: Embarked, dtype: int64

In [12]:
# Encode the embarkation port as an integer code (S=0, C=1, Q=2).
# Plain dict indexing is used for the lookup, so an unexpected value raises
# KeyError.
emb = dict(S=0, C=1, Q=2)
df['Embarked'] = df['Embarked'].map(emb.__getitem__)

In [13]:
# Frequency of each value in the Sex column.
df['Sex'].value_counts()

male      843
female    466
Name: Sex, dtype: int64

In [14]:
# Binary indicator: 1 where Sex is 'male', 0 otherwise.
df['is_male'] = df['Sex'].eq('male').astype(int)
# Remove the original string column now that it is encoded.
del df['Sex']

In [15]:
# Pull the honorific (e.g. "Mr", "Mrs", "Miss") out of each name: the first run
# of ASCII letters that is immediately followed by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as written;
# in a normal string literal it is an invalid escape sequence, which Python
# reports with a DeprecationWarning/SyntaxWarning.
# expand=False returns a Series for the single capture group, which is the
# natural shape for assigning to one column.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [16]:
# Spot-check the extracted titles.
df['Title'].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object

In [17]:
# Character count of the full name, stored as a numeric feature.
df["NameLength"] = df["Name"].map(len)
# Remove the raw text column now that its length is stored.
del df["Name"]

In [18]:
# Frequency of each extracted title.
df['Title'].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Major         2
Ms            2
Mme           1
Countess      1
Lady          1
Sir           1
Jonkheer      1
Don           1
Capt          1
Dona          1
Name: Title, dtype: int64

In [19]:
def title_xfer(title):
    """Return the integer code for an honorific.

    'Mr' -> 1, 'Miss' -> 2, 'Mrs' -> 3, 'Master' -> 4; any other value -> 0.
    """
    known_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4}
    return known_codes.get(title, 0)

# Replace each title string with its integer code, element by element.
df['Title'] = df['Title'].apply(title_xfer)

In [23]:
# Sanity check: inspect the first rows after the transformations.
df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,SibSp,Survived,is_male,Title,NameLength
0,22.0,0,7.25,0,1,3,1,0.0,1,1,23
1,38.0,1,71.2833,0,2,1,1,1.0,0,3,51
2,26.0,0,7.925,0,3,3,0,1.0,0,2,22
3,35.0,0,53.1,0,4,1,1,1.0,0,3,44
4,35.0,0,8.05,0,5,3,0,0.0,1,1,24


In [24]:
# Confirm the dtype of every column after encoding.
df.dtypes

Age            float64
Embarked         int64
Fare           float64
Parch            int64
PassengerId      int64
Pclass           int64
SibSp            int64
Survived       float64
is_male          int64
Title            int64
NameLength       int64
dtype: object

数値に変換できている