In [None]:
import numpy as np
import pandas as pd # data frame
import matplotlib.pyplot as plt # visualization
import seaborn as sns # visualization

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and 3.8
# removed the old names, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Enlarge all seaborn text (titles, tick labels, legends) for the large figures below.
sns.set(font_scale=2.5)

import missingno as msno # visualizes null (missing) data

# ignore warnings
# NOTE(review): this filter silences every warning category, including any
# deprecation warnings raised by the plotting libraries used in this notebook.
import warnings
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the train and test CSV files into two DataFrames in one step.
df_train, df_test = map(
    pd.read_csv,
    ('../input/titanic/train.csv', '../input/titanic/test.csv'),
)

In [None]:
# Preview the first 10 training rows (head() returns 5 rows when no count is given).
df_train.head(10) # default = 5

In [None]:
# (number of rows, number of columns) of the training frame.
df_train.shape

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Same summary for the test frame, for comparison with the training frame.
df_test.describe()

In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# Print, for every training column, the percentage of rows that are NaN.
for col in df_train.columns:
    nan_count = df_train[col].isnull().sum()
    row_count = df_train[col].shape[0]
    msg = f'column : {col:>10}\t Percent of NaN value: {100*(nan_count/row_count):.2f}%'
    print(msg)

In [None]:
# Print, for every test column, the percentage of rows that are NaN.
# The loop variable stays named `col`: the cells that follow read its final value.
for col in df_test.columns:
    nan_count = df_test[col].isnull().sum()
    row_count = df_test[col].shape[0]
    msg = f'column : {col:>10}\t Percent of NaN value: {100*(nan_count/row_count):.2f}%'
    print(msg)

In [None]:
# Step-by-step breakdown of the NaN-percentage expression used in the loops above.
# NOTE(review): `col` is not set here; it is whatever value the preceding
# for-loop over df_test.columns left behind (its last column), so these cells
# only work after that loop has run.
df_train[col]

In [None]:
# Boolean Series: True where the value is missing.
df_train[col].isnull()

In [None]:
# Summing the booleans gives the number of missing values.
df_train[col].isnull().sum()

In [None]:
# Shape tuple of the column; element 0 is the row count.
df_train[col].shape

In [None]:
# Fraction of missing values = missing count / row count.
df_train[col].isnull().sum() / df_train[col].shape[0]

In [None]:
# Nullity matrix of the whole training frame: shows how missing values are distributed.
msno.matrix(df=df_train, figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [None]:
# Positional indexing: all rows of the column at position 1 (the second column).
df_train.iloc[:, 1]

In [None]:
# Nullity bar chart of the whole training frame: a rough view of the non-missing count per column.
msno.bar(df=df_train, figsize=(8, 8), color=(0.8, 0.5, 0.2))

In [None]:
# Left: share of each Survived value as a pie chart; right: row count per value.
f, ax = plt.subplots(1, 2, figsize=(18, 8))

df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')
# Pass the column as x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the old positional form collides with the data= keyword and raises.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

In [None]:
# Number of rows for each distinct Survived value.
df_train['Survived']. value_counts()

## 2.1 Pclass

In [None]:
# Number of non-null Survived entries per Pclass (count() counts rows, not survivors).
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).count()

### Pclass가 1일 때 136명의 생존자 / 2일 때 87명 / 3일 때 119명

In [None]:
# Sum of Survived per Pclass; presumably Survived is coded 0/1 (see the
# unique() cell below), which makes this the number of survivors per class.
df_train[['Pclass', 'Survived']].groupby(['Pclass']).sum()

In [None]:
# Distinct values that occur in the Survived column.
df_train['Survived'].unique()

In [None]:
# margins=True adds an "All" row and column holding the totals.
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True).style.background_gradient(cmap='Blues')

In [None]:
# as_index=True: Pclass becomes the index of the grouped result,
# so only the mean of Survived is drawn as bars.
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar()

In [None]:
# With as_index=False, Pclass stays a regular column, so both Pclass and
# Survived are drawn as bars.
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False).plot.bar()

In [None]:
# Left: passenger count per Pclass; right: the same counts split by Survived.
y_position = 1.02  # vertical offset that lifts the titles slightly above the axes
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')
# Pass the column as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()

## 2.2 Sex

In [None]:
# Left: mean of Survived per Sex; right: passenger counts per Sex split by Survived.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Pass the column as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [None]:
# Mean of Survived per Sex as a stand-alone bar chart.
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar()

In [None]:
# The same means as a table; as_index=False keeps Sex as a regular column.
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

In [None]:
# Sex x Survived contingency table with totals (margins=True), shaded by value.
pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True).style.background_gradient(cmap='summer_r')

## 2.3 Both Sex and Pclass

In [None]:
# Point plot of mean Survived per Pclass, one line per Sex.
# factorplot was renamed catplot (and `size` became `height`) in seaborn 0.9 and
# has since been removed; kind='point' reproduces factorplot's default plot type.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train, kind='point', height=6, aspect=1.5)

- Lady first,
- Money brings survival?

In [None]:
# Point plot of mean Survived per Sex, one line per Pclass.
# factorplot/size were replaced by catplot/height (seaborn 0.9); kind='point'
# matches factorplot's default. `saturation` is dropped: it is not a point-plot
# option, and recent seaborn rejects unknown keywords passed to a point plot.
sns.catplot(x='Sex', y='Survived', hue='Pclass', data=df_train, kind='point', height=9, aspect=1)

In [None]:
# Point plot of mean Survived per Sex, drawn in a separate facet for each Pclass.
# factorplot/size were replaced by catplot/height (seaborn 0.9); kind='point'
# matches factorplot's default. `saturation` is dropped: it is not a point-plot
# option, and recent seaborn rejects unknown keywords passed to a point plot.
sns.catplot(x='Sex', y='Survived', col='Pclass', data=df_train, kind='point', height=9, aspect=1)

## Age

In [None]:
# Oldest, youngest and mean passenger age, each printed to one decimal place.
age_col = df_train['Age']
print(f"제일 나이 많은 탑승객: {age_col.max():.1f} years.")
print(f"제일 어린 탑승객: {age_col.min():.1f} years.")
print(f"탑승객 평균 나이: {age_col.mean():.1f} years.")

### kdeplot : 데이터 분포를 그려주는 함수

In [None]:
# Overlay the Age density of Survived == 1 and Survived == 0 on a single axes.
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
for outcome in (1, 0):  # same drawing order as the legend entries below
    sns.kdeplot(df_train.loc[df_train['Survived'] == outcome, 'Age'], ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

### hist로 보면 아래와 같고 위의 파란선과 분포가 비슷하다.

In [None]:
# Histogram of Age restricted to the rows where Survived == 1.
df_train[df_train['Survived'] == 1]['Age'].hist()

### 다양한 indexing

In [None]:
# Positional indexing: every column of the first row.
df_train.iloc[0, :]

In [None]:
# iterrows() yields (index, row Series) pairs; breaking immediately leaves
# `row` bound to the pair for the first row, which is then displayed.
for row in df_train.iterrows():
    break
row

In [None]:
# Boolean-mask indexing: Age of the rows where Survived == 1.
df_train[df_train['Survived'] == 1]['Age']

## 다양하게 그래프 그리는 법

In [None]:
# Variant 1: create the figure, keep the handle in `f`, draw via the pyplot interface.
f = plt.figure(figsize=(5, 5))
a = np.arange(100)
b = np.sin(a)
plt.plot(b)

In [None]:
# Variant 2: create figure and axes together and draw on the axes object.
f, ax = plt.subplots(1, 1, figsize=(5, 5))
a = np.arange(100)
b = np.sin(a)
ax.plot(b)

In [None]:
# Variant 3: same as variant 1 but without keeping the figure handle.
plt.figure(figsize=(5, 5))
a = np.arange(100)
b = np.sin(a)
plt.plot(b)

In [None]:
# Age density for each passenger class, overlaid on one figure.
plt.figure(figsize=(15, 8))
for pclass in (1, 2, 3):  # drawing order matches the legend entries below
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='kde')

plt.xlabel('Age')
# Title typo fixed: 'classed' -> 'classes'.
plt.title('Age Distribution within classes')
plt.legend(['1st Class', '2nd Class', '3rd Class'])

### hist로 하면 안보인다..

In [None]:
# The same comparison with histograms: opaque bars drawn on top of each other
# hide one another, which is why the KDE version is preferred.
plt.figure(figsize=(15, 8))
for pclass in (1, 2, 3):  # drawing order matches the legend entries below
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='hist')

plt.xlabel('Age')
# Title typo fixed: 'classed' -> 'classes'.
plt.title('Age Distribution within classes')
plt.legend(['1st Class', '2nd Class', '3rd Class'])

### 생존확률을 보려면 아래와 같이 작성

In [None]:
# Age density of 1st-class passengers: Survived == 0 against Survived == 1.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
in_class = df_train['Pclass'] == 1
for outcome in (0, 1):  # same drawing order as the legend entries below
    sns.kdeplot(df_train.loc[(df_train['Survived'] == outcome) & in_class, 'Age'], ax=ax)
plt.title('1st class')
plt.legend(['Survived == 0', 'Survived == 1'])
plt.show()


In [None]:
# Age density of 2nd-class passengers: Survived == 0 against Survived == 1.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
in_class = df_train['Pclass'] == 2
for outcome in (0, 1):  # same drawing order as the legend entries below
    sns.kdeplot(df_train.loc[(df_train['Survived'] == outcome) & in_class, 'Age'], ax=ax)
plt.title('2nd class')
plt.legend(['Survived == 0', 'Survived == 1'])
plt.show()


In [None]:
# Age density of 3rd-class passengers: Survived == 0 against Survived == 1.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
in_class = df_train['Pclass'] == 3
for outcome in (0, 1):  # same drawing order as the legend entries below
    sns.kdeplot(df_train.loc[(df_train['Survived'] == outcome) & in_class, 'Age'], ax=ax)
plt.title('3rd class')
plt.legend(['Survived == 0', 'Survived == 1'])
plt.show()


In [None]:
# Survival rate among passengers younger than each age limit (limits 1..79).

age_limits = list(range(1, 80))
change_age_range_survival_ratio = []
for i in age_limits:
    # Survived flags of everyone strictly younger than i (NaN ages never match).
    survived_below = df_train.loc[df_train['Age'] < i, 'Survived']
    change_age_range_survival_ratio.append(survived_below.sum() / len(survived_below))

plt.figure(figsize=(7, 7))
# Plot against the age limits themselves: without explicit x values the curve
# is drawn at positions 0..78, so every point sits one year to the left of
# the limit it was computed for.
plt.plot(age_limits, change_age_range_survival_ratio)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

In [None]:
# With i = 10 this is the survival rate of passengers younger than 10.

i = 10
survived_under_i = df_train.loc[df_train['Age'] < i, 'Survived']
survived_under_i.sum() / len(survived_under_i)

In [None]:
# Number of passengers with Age below i (the denominator of the rate above);
# relies on `i` having been set by an earlier cell.
len(df_train[df_train['Age'] < i]['Survived'])

# violinplot

In [None]:
# Split violins of Age by Survived: grouped by Pclass (left) and by Sex (right).
f, ax = plt.subplots(1, 2, figsize =(18,8))
# x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
# NOTE(review): `scale` is kept for older seaborn releases; newer releases
# call this option `density_norm` - confirm against the installed version.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=df_train, scale="count", split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot(x="Sex", y="Age", hue="Survived", data=df_train, scale="count", split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))

plt.show()

# Embarked

In [None]:
# Mean of Survived per Embarked value, bars ordered from highest to lowest mean.
f, ax = plt.subplots(1, 1, figsize=(7,7))
df_train[["Embarked", "Survived"]].groupby(["Embarked"], as_index=True).mean().sort_values(by="Survived", ascending=False).plot.bar(ax=ax)

In [None]:
# The result can also be sorted by its index (the Embarked labels).
df_train[["Embarked", "Survived"]].groupby(["Embarked"], as_index=True).mean().sort_index()

In [None]:
# Sorting in ascending order of the mean is possible as well.
df_train[["Embarked", "Survived"]].groupby(["Embarked"], as_index=True).mean().sort_values(by="Survived", ascending=True)

In [None]:
# 2x2 grid of passenger counts per Embarked value: overall, then split by
# Sex, by Survived and by Pclass.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# The column is passed as x=... throughout: seaborn >= 0.12 accepts only
# `data` positionally.
sns.countplot(x="Embarked", data=df_train, ax=ax[0, 0])

# Title typo fixed: "Boared" -> "Boarded".
ax[0, 0].set_title("(1) No. Of Passengers Boarded")

sns.countplot(x="Embarked", hue="Sex", data=df_train, ax=ax[0, 1])
ax[0, 1].set_title("(2) Male-Female split for embarked")

sns.countplot(x="Embarked", hue="Survived", data=df_train, ax=ax[1, 0])
ax[1, 0].set_title("(3) Embarked vs Survived")

sns.countplot(x="Embarked", hue="Pclass", data=df_train, ax=ax[1, 1])
ax[1, 1].set_title("(4) Embarked vs Pclass")

plt.subplots_adjust(wspace=0.4, hspace=0.5) # spacing between the subplots
plt.show()

# Family = SibSp + Parch

In [None]:
# First input column of the FamilySize feature built below.
df_train["SibSp"]

In [None]:
# Second input column of the FamilySize feature built below.
df_train["Parch"]

In [None]:
# +1 so that the passenger themselves is included in the family size.
df_train["FamilySize"] = df_train["SibSp"] + df_train["Parch"] + 1
df_train["FamilySize"]

In [None]:
# Range of the new feature.
print("Maximum size of Family: ", df_train["FamilySize"].max())
print("Minimum size of Family: ", df_train["FamilySize"].min())

In [None]:
# Three panels on FamilySize: passenger counts, counts split by Survived,
# and the mean of Survived per family size.
f,ax=plt.subplots(1, 3, figsize=(40,10))
# The column is passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) No. Of Passengers Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilySize',  y=1.02)

# The bars follow the row order of the frame, i.e. descending mean of Survived
# (sort_values), so the FamilySize labels on the x axis are not in numeric
# order. Use sort_index() instead of sort_values() to order by family size.
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize',  y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

# Fare

In [None]:
# The larger the (positive) skewness, the more the mass of the distribution
# sits on the left with a long tail to the right. The reference value is 0.
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm it
# still exists in the installed version.

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness : {:.2f}'.format(df_train['Fare'].skew()), ax=ax)
g = g.legend(loc='best')

In [None]:
# Take the log of Fare to bring its skewness closer to 0.
# Feature engineering intended to improve model performance.
# Values that are not > 0 (including NaN) are mapped to 0 instead of being logged.
# NOTE(review): this overwrites Fare in place, so re-running the cell applies
# the log a second time; df_test is not transformed here.

df_train["Fare"] = df_train["Fare"].map(lambda i: np.log(i) if i>0 else 0)

In [None]:
# Same plot as before the transform, now on the log-transformed Fare.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness : {:.2f}'.format(df_train['Fare'].skew()), ax=ax)
g = g.legend(loc='best')

## Cabin은 Null 값이 80%이므로 중요한 정보를 얻을 수 없기 때문에 생략

# Ticket

In [None]:
# Ticket has many distinct values, so it is left for the feature-engineering stage.
# Shows how many rows share each Ticket value.
df_train["Ticket"].value_counts()