## 决策树的直接调用与Titanic数据集的探索 

In [2]:
# 必要的引入
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### 读取数据并打印基本信息

In [1]:
# Build the path to the Titanic CSV first, then parse it into a DataFrame.
titanic_csv = os.path.join("../input/titanic-subset", "titanic.csv")
data = pd.read_csv(titanic_csv, sep=",")

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
data.info()

In [None]:
# Show the first three rows as a quick look at the raw records.
data.head(3)

#### 预测目标的基本分布

In [None]:
# normalize=True returns the proportion of each 'survived' value rather than raw counts.
data['survived'].value_counts(normalize=True)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, not `x`, so the positional form no longer plots this column.
sns.countplot(x='survived', data=data)

#### 舱位与预测目标的关系

In [None]:
# Count passengers per cabin class, split by survival. Keyword arguments are
# used because recent seaborn releases no longer accept `x` positionally.
sns.countplot(x='pclass', hue='survived', data=data)

#### 名字的信息

In [None]:
# Peek at the first few raw values of the 'name' column.
data['name'].head()

#### 对名字属性进行变换  
- 取名字的title

In [None]:
def _extract_title(full_name):
    """Return the first word after the first comma, or the first word of the name when it has no comma."""
    pieces = full_name.split(',')
    remainder = pieces[1] if len(pieces) > 1 else pieces[0]
    return remainder.split()[0]

data['name_title'] = data['name'].apply(_extract_title)

In [None]:
# Count how many passengers carry each extracted title.
data['name_title'].value_counts()

#### 名字的title与存活与否的关系

In [None]:
# Mean of 'survived' within each title group.
data.groupby('name_title')['survived'].mean()

#### 取名字的长度

In [None]:
# Character count of each passenger's name.
data['name_len'] = data['name'].map(len)

#### 名字长度与存活与否的关系

In [None]:
# Split name lengths into five quantile bins, then average 'survived' per bin.
name_len_bins = pd.qcut(data['name_len'], 5)
data.groupby(name_len_bins)['survived'].mean()

#### 性别的分布与最后幸存的关系

In [None]:
# normalize=True returns the proportion of each 'sex' value rather than raw counts.
data['sex'].value_counts(normalize=True)

In [None]:
# Mean of 'survived' for each value of 'sex'.
data.groupby('sex')['survived'].mean()

#### 年龄与幸存的关系  
- 缺失数据的处理  
  1. 实值：用中位数或平均数填补  
  2. 类别：用众数（major class）填补

In [None]:
# Split ages into five quantile bins, then average 'survived' per bin.
age_bins = pd.qcut(data['age'], 5)
data.groupby(age_bins)['survived'].mean()

#### 登船的地点与幸存的关系

In [None]:
# Count passengers per embarkation port.
data['embarked'].value_counts()

In [None]:
# Mean of 'survived' for each embarkation port.
data.groupby('embarked')['survived'].mean()

In [None]:
# Count passengers per embarkation port, split by cabin class. Keyword arguments
# are used because recent seaborn releases no longer accept `x` positionally.
sns.countplot(x='embarked', hue='pclass', data=data)

#### 目的地与幸存的关系

In [None]:
# Group by the text after the last comma of 'home.dest'
# (str() turns missing values into the string 'nan'), then average 'survived'.
dest_suffix = data['home.dest'].map(lambda dest: str(dest).split(',')[-1])
data.groupby(dest_suffix)['survived'].mean()

#### room, ticket, boat缺失数据太多，舍弃不用

### 新来了一个小鲜肉，基本信息如下
![alt text](https://i.pinimg.com/originals/0b/d0/17/0bd017358bd52c945606a719615b8775.jpg)
[jack_info](http://jamescameronstitanic.wikia.com/wiki/Jack_Dawson)


#### Feature Transform

In [None]:
def name(data):
    """Replace the raw 'name' column with two derived features.

    Adds 'name_len' (character count of the name) and 'name_title' (the first
    whitespace-separated word after the first comma, or the first word of the
    whole name when there is no comma), then removes 'name'.

    The frame is modified in place and also returned.
    """
    def _title(full_name):
        parts = full_name.split(',')
        tail = parts[1] if len(parts) > 1 else parts[0]
        return tail.split()[0]

    data['name_len'] = data['name'].map(len)
    data['name_title'] = data['name'].map(_title)
    del data['name']
    return data

def age(data):
    """Flag and impute missing ages.

    Adds 'age_flag' (1 where 'age' is missing, otherwise 0) and fills missing
    ages with the mean age of the row's ('name_title', 'pclass') group. When a
    group has no known age at all, the mean age of the whole frame is used.

    The frame is modified in place and also returned.
    """
    data['age_flag'] = data['age'].map(lambda value: int(pd.isnull(value)))

    # Computed before 'age' is overwritten, so it reflects the un-imputed column.
    overall_mean = data['age'].mean()

    def _fill_group(ages):
        group_mean = ages.mean()
        replacement = overall_mean if pd.isnull(group_mean) else group_mean
        return ages.fillna(replacement)

    data['age'] = data.groupby(['name_title', 'pclass'])['age'].transform(_fill_group)
    return data

def embark(data):
    """Replace missing 'embarked' values with 'Southampton'.

    The frame is modified in place and also returned.
    """
    port = data['embarked']
    data['embarked'] = port.where(port.notnull(), 'Southampton')
    return data


def dummies(data, columns=['pclass','name_title','embarked', 'sex']):
    """One-hot encode the given columns.

    Each listed column is converted to strings, expanded into indicator
    columns named '<column>_<value>' (ordered by first appearance of the value),
    appended on the right of the frame, and then removed.

    Returns the frame with the indicator columns in place of the originals.
    """
    for col in columns:
        data[col] = data[col].map(str)
        # Order the indicator columns by first appearance rather than get_dummies' sorted order.
        ordered = ['{}_{}'.format(col, level) for level in data[col].unique()]
        indicators = pd.get_dummies(data[col], prefix=col)
        data = pd.concat([data, indicators[ordered]], axis=1)
        data = data.drop(columns=[col])
    return data

#### 预处理输入数据  
- 去掉不需要的特征  
- 对某些特征进行变换

In [None]:
# Columns that are not used as model features.
drop_columns = ['row.names', 'home.dest', 'room', 'ticket', 'boat']
data = data.drop(columns=drop_columns)
data.head()

In [None]:
# Run the feature transforms in order: name features, age imputation,
# embarked imputation, then one-hot encoding.
data = (
    data
    .pipe(name)
    .pipe(age)
    .pipe(embark)
    .pipe(dummies)
)
data.head()

####  调用决策树模型并预测结果

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree

# Select the target by label rather than by position, so the split does not
# depend on 'survived' being the first column after preprocessing.
features = data.drop(columns=['survived'])
target = data['survived']
trainX, testX, trainY, testY = train_test_split(features, target, test_size=0.2, random_state=33)

# Fix random_state so that ties between equally good splits are broken the same
# way on every run, keeping the tree and its feature importances reproducible.
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5, random_state=33)
model.fit(trainX, trainY)

In [None]:
from sklearn import metrics
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier.

    Parameters:
        X: feature matrix passed to ``clf.predict``.
        y: true labels compared against the predictions.
        clf: fitted estimator exposing ``predict``.
        show_accuracy: print the accuracy score when true.
        show_classification_report: print sklearn's classification report when true.
        show_confussion_matrix: print the confusion matrix when true
            (the spelling is part of the existing interface).

    Returns nothing; all results are printed.
    """
    predictions = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predictions)
        print("Accuracy:{0:.3f}".format(accuracy),"\n")

    # The remaining sections share the same "heading, then result" layout.
    sections = [
        (show_classification_report, "Classification report", metrics.classification_report),
        (show_confussion_matrix, "Confusion matrix", metrics.confusion_matrix),
    ]
    for enabled, heading, scorer in sections:
        if enabled:
            print(heading)
            print(scorer(y, predictions), "\n")

In [None]:
# Print accuracy, classification report and confusion matrix for the tree on the test split.
measure_performance(testX, testY, model)

#### 不做特征分析直接调用决策树模型

In [None]:
# Baseline: train on age and the two sex indicator columns only.
sub_columns = ['age', 'sex_male','sex_female']
sub_trainX = trainX[sub_columns]
sub_testX = testX[sub_columns]
# 'sex_male' and 'sex_female' come from one-hot encoding the same column, so a
# split on either can score identically; random_state makes the tie-breaking
# between such splits reproducible.
sub_model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5, random_state=33)
sub_model.fit(sub_trainX, trainY)

In [None]:
# Print the same metrics for the reduced-feature tree on the matching test columns.
measure_performance(sub_testX, testY, sub_model)

#### 可视化决策树

In [None]:
import graphviz

# out_file=None makes export_graphviz return the DOT source as a string,
# which is then wrapped in a graphviz Source object for rendering.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=trainX.columns,
)
graph = graphviz.Source(dot_data)

#### display graph inline

In [None]:
# Bare expression as the last line of the cell so the notebook displays the graph inline.
graph

#### 展示特征的重要性

In [None]:
# Pair every training column with its importance. The model was fitted on all
# of trainX, so the names must come from trainX.columns; skipping the first
# column would shift every name by one against feature_importances_.
pd.DataFrame({
    'variable': trainX.columns,
    'importance': model.feature_importances_,
}).sort_values(by='importance', ascending=False).head(20)