![nn](10.png)

- 어머니, 약혼자와 함께 1등실에 탑승한 17세 여성의 생존 확률은 얼마인가?
- 부모 형제 없이 혼자 3등실에 탑승한 19세 남성의 생존 확률은 얼마인가?

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Titanic passenger table from the Excel workbook and preview its first rows.
raw_data = pd.read_excel('data/titanic.xlsx')
raw_data.head()

In [None]:
# Show each column's dtype and non-null count to spot missing values.
raw_data.info()

In [None]:
# Summary statistics of the numeric columns.
raw_data.describe()

- PassengerId : 탑승자 번호
- Survived : 생존 여부
- Pclass : 선실 등급
- Age : 나이
- SibSp : 함께 탑승한 형제자매나 배우자의 수
- Parch : 함께 탑승한 부모나 자녀의 수
- Fare : 탑승 당시의 요금

In [None]:
# Survival distribution: share of each outcome as a pie chart (left) and raw
# passenger counts as bars (right).
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

raw_data['Survived'].value_counts().plot.pie(explode=[0, 0.1],
                                             autopct='%1.2f%%', ax=ax[0])
ax[0].set_title('Survived')
ax[0].set_ylabel('')

# The column is passed as x= because seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Survived', data=raw_data, ax=ax[1])
ax[1].set_title('Survived')
plt.show()

In [None]:
# Age distribution in 20 bins; the trailing semicolon suppresses the notebook's text output.
raw_data['Age'].hist(bins=20,figsize=(18,8),grid=False);

In [None]:
# Per-class averages of the numeric columns. numeric_only=True skips text
# columns, which pandas >= 2.0 would otherwise refuse to average.
raw_data.groupby('Pclass').mean(numeric_only=True)

In [None]:
# Pairwise correlation of the numeric columns. Text columns are excluded
# explicitly because pandas >= 2.0 no longer drops them automatically.
raw_data.select_dtypes(include='number').corr()


In [None]:
# Heatmap of the correlations between the numeric columns. Text columns are
# excluded explicitly because pandas >= 2.0 no longer drops them in corr().
plt.figure(figsize=(10, 10))
sns.heatmap(raw_data.select_dtypes(include='number').corr(),
            linewidths=0.01, square=True,
            annot=True, cmap=plt.cm.viridis, linecolor="white")
plt.title('Correlation between features')
plt.show()

- 나이별 및 성별 생존율

In [None]:
# Bucket ages into six labelled brackets; include_lowest keeps age 0 in 'baby'.
age_bins = [0, 3, 7, 15, 30, 60, 100]
age_labels = ['baby', 'children', 'teenage', 'young', 'adult', 'old']
raw_data['age_cat'] = pd.cut(raw_data['Age'],
                             bins=age_bins,
                             labels=age_labels,
                             include_lowest=True)
raw_data.head()

In [None]:
# Per-bracket averages of the numeric columns. numeric_only=True skips text
# columns (required on pandas >= 2.0); observed=False keeps every age bracket
# in the result, including empty ones.
raw_data.groupby('age_cat', observed=False).mean(numeric_only=True)

In [None]:
# Mean survival rate by cabin class, age bracket, and sex, side by side.
# Columns are passed as x=/y= because seaborn >= 0.12 rejects positional ones.
plt.figure(figsize=[14, 4])
plt.subplot(1, 3, 1)
sns.barplot(x='Pclass', y='Survived', data=raw_data)
plt.subplot(1, 3, 2)
sns.barplot(x='age_cat', y='Survived', data=raw_data)
plt.subplot(1, 3, 3)
sns.barplot(x='Sex', y='Survived', data=raw_data)
plt.subplots_adjust(top=1, bottom=0.1, left=0.10, right=1, hspace=0.5, wspace=0.5)
plt.show()

In [None]:
# Passenger counts by sex (left) and the same counts split by survival (right).
# The column is passed as x= because seaborn >= 0.12 rejects a positional one.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
sns.countplot(x='Sex', data=raw_data, ax=ax[0])
ax[0].set_title('Count of Passengers by Sex')

sns.countplot(x='Sex', hue='Survived', data=raw_data, ax=ax[1])
ax[1].set_title('Sex:Survived vs Dead')
plt.show()

- 귀족들의 생존율

In [None]:
# Look at one raw name to see how it is laid out before parsing it.
raw_data['Name'][0]

In [None]:
# Step 1: take the segment that follows the first comma.
raw_data['Name'][0].split(',')[1]

In [None]:
# Step 2: keep only the text before the first period in that segment.
raw_data['Name'][0].split(',')[1].split('.')[0]

In [None]:
# Step 3: strip surrounding whitespace to get the bare title.
raw_data['Name'][0].split(',')[1].split('.')[0].strip()


# 람다함수

In [None]:
def test_func(v):
    return v+2

test_func(2)

In [None]:
# The same add-two function written as a lambda expression.
test = lambda x: x+2
test(2)

In [None]:
# Apply the lambda to 0..4 with map() and materialize the results as a list.
list(map(test, range(5)))

In [None]:
# Extract the title from each name: the text between the first comma and the
# following period, with surrounding whitespace removed.
# NOTE(review): assumes every name contains a comma — confirm against the data.
conversion_rare = lambda x: x.split(',')[1].split('.')[0].strip()
raw_data['title'] = raw_data['Name'].map(conversion_rare)

# Distinct titles found in the data.
titles = raw_data['title'].unique()
titles

In [None]:
# Count how often each title occurs for each sex.
pd.crosstab(raw_data['title'], raw_data['Sex'])

In [None]:
# Fold alternate spellings into the common 'Miss' / 'Mrs' titles.
raw_data['title'] = raw_data['title'].replace(
    {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Titles that get lumped together under the single label 'Rare'.
Rare = ['Lady','the Countess','Countess','Capt', 'Master',
        'Col','Don','Dr','Major','Rev','Sir','Jonkheer', 'Dona']
raw_data['title'] = raw_data['title'].replace(Rare, 'Rare')

raw_data['title'].unique()

In [None]:
# Mean survival rate for each (normalized) title.
print (raw_data[['title', 'Survived']].groupby(['title'], as_index=False).mean())

# 인공지능 학습

In [None]:
# Reload the workbook so modelling starts from the original columns, without
# the 'age_cat' and 'title' columns added during exploration.
raw_data = pd.read_excel('data/titanic.xlsx')
raw_data.head()

In [None]:
# Encode sex numerically: 'female' -> 0, 'male' -> 1, anything else -> NaN.
tmp = [
    0 if each == 'female' else 1 if each == 'male' else np.nan
    for each in raw_data['Sex']
]

In [None]:
# Replace the text 'Sex' column with its numeric encoding and preview the result.
raw_data['Sex'] = tmp
raw_data.head()

In [None]:
# Summary statistics again, now that 'Sex' is numeric.
raw_data.describe()

In [None]:
# Check dtypes and non-null counts before dropping incomplete rows.
raw_data.info()

In [None]:
# Keep only passengers with a known age. The explicit copy makes raw_data an
# independent frame, so the column assignments made to it afterwards do not
# trigger pandas' SettingWithCopyWarning.
raw_data = raw_data[raw_data["Age"].notna()].copy()
raw_data.info()

In [None]:
# Cast the label and the listed feature columns to float.
for column in ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']:
    raw_data[column] = raw_data[column].astype('float')
raw_data.head()

In [None]:
# Drop every row that is missing one of these features, one column at a time.
raw_data = raw_data[raw_data['Age'].notnull()]
raw_data = raw_data[raw_data['SibSp'].notnull()]
raw_data = raw_data[raw_data['Parch'].notnull()]
raw_data = raw_data[raw_data['Fare'].notnull()]
raw_data.info()

# 생존자 예측을 위한 모델 수립

In [None]:
# Feature matrix: the six columns the model is trained on, in this order.
train_pre = raw_data[['Pclass','Sex','Age','SibSp','Parch','Fare']]
train_pre.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
target = raw_data[['Survived']]
X_train, X_test, y_train, y_test = train_test_split(
    train_pre, target, test_size=0.1, random_state=13)

In [None]:
# Size and dtypes of the training features.
X_train.info()

In [None]:
# Size and dtypes of the test features.
X_test.info()

In [None]:
# Size and dtype of the training labels.
y_train.info()

In [None]:
# Size and dtype of the test labels.
y_test.info()

In [None]:
# Preview the training features; the row labels still come from raw_data.
X_train.head()

In [None]:
# Display the training labels.
y_train

In [None]:
# Renumber every split from 0, discarding the row labels inherited from raw_data.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree with a fixed seed so the fit is reproducible.
tree_clf = DecisionTreeClassifier(random_state=13, max_depth=3)
tree_clf.fit(X_train, y_train)

# Accuracy on the training split itself, not on held-out data.
print(f'Score: {tree_clf.score(X_train, y_train)}')

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to titanic.dot in Graphviz format. feature_names
# follows the training column order.
export_graphviz(
        tree_clf,
        out_file="titanic.dot",
        feature_names=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'],
        class_names=['Unsurvived','Survived'],
        rounded=True,
        filled=True
    )

# Read the .dot file back, render it as decision_trees/titanic_tree.png, and
# display the graph inline as the cell output.
import graphviz
with open("titanic.dot") as f:
    dot_graph = f.read()
dot = graphviz.Source(dot_graph)
dot.format = 'png'
dot.render(filename='titanic_tree', directory='decision_trees', cleanup=True)
dot

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate on the held-out split and report the accuracy as a percentage.
y_pred = tree_clf.predict(X_test)
print("Test Accuracy is ", accuracy_score(y_test, y_pred)*100)

# 생존자 예측

In [None]:
# Two hypothetical passengers, written in the training column order:
# pclass, sex, age, sibsp, parch, fare
# Sex uses the numeric encoding applied earlier: 0 = female, 1 = male.
dicaprio = [3., 1., 19., 0., 0., 5.]
winslet = [1., 0., 17., 1., 1., 100.]

In [None]:
# Class probabilities for Winslet; the passenger is wrapped in a list to form a one-row batch.
tree_clf.predict_proba([winslet])

In [None]:
# Class probabilities for Dicaprio, again as a one-row batch.
tree_clf.predict_proba([dicaprio])

In [None]:
def isSurvived(name, person):
    """Print the survival prediction of ``tree_clf`` for one passenger.

    Args:
        name: Label printed in front of the verdict.
        person: Feature values for a single passenger, in training column order.

    The printed line ends with the largest class probability the model
    assigns to this passenger.
    """
    sample = [person]
    if tree_clf.predict(sample)[0] == 0:
        verdict = 'not survived'
    else:
        verdict = 'survived'
    confidence = max(tree_clf.predict_proba(sample)[0])
    print(name, ' is ', verdict, ' --> ', confidence)


isSurvived('Dicaprio', dicaprio)
isSurvived('Winslet', winslet)