# Titanic数据集特征工程
一个机器学习比赛（监督学习，典型代表：风控比赛）是什么样子的？
https://www.kaggle.com/c/titanic

In [1]:
# 前两行画图
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
seaborn.set()
# 数据处理
import numpy as np
import pandas as pd
# 系统库
import os, sys
# 自带数据
datalib_path = os.path.join(os.path.abspath('.'), '../')
sys.path.append(datalib_path)
import dataset
# model
import catboost
# 特征工程库
import category_encoders

In [2]:
# 利用泄漏出来的答案来模拟kaggle的评分
# 这样就不用每次上传了
def mock_kaggle_score(submission):
    from sklearn.metrics import accuracy_score
    answer = pd.read_csv(os.path.join(dataset.titanic_path, 'test_answer.csv')).set_index('PassengerId')
    return accuracy_score(y_true=answer.Survived, y_pred=submission.Survived)

In [3]:
# 第一步 理解题目
# 业务背景
# 数据规模
# 评价标准

In [4]:
# 第二步 数据获取
# 拿到数据，成功读入数据
# 如果数据规模很大，没法完全读到内存当中，怎么办？
# 粗暴解决：找更好的机器
# 技术解决：读取数据samples，做好preprocessing和model方案，用流水线pipeline分批处理
train = pd.read_csv(os.path.join(dataset.titanic_path, 'train.csv')).set_index('PassengerId')
test = pd.read_csv(os.path.join(dataset.titanic_path, 'test.csv')).set_index('PassengerId')
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# 第三步 数据探索（EDA），数据清洗
# 省略

In [6]:
# 第四步 生成基线性能（Baseline model）
# Baseline作为最基本的评估标准，你的任何手段（特征工程，算法，调参数）都是为了生成比Baseline更好的方案
# Baseline的要求，不要求多复杂，需要快速做好baseline，即最简单手段

# 这里用catboost，好处，不需要做特征工程就可以快速生成一个model
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin']
cat_features = ['Embarked', 'Cabin', 'Sex']
num_features = [col for col in features if col not in cat_features]

X_train = train[features].copy()
y_train = train['Survived']
X_test = test[features].copy()

# 简单类型转换，分类型特征全部变成string，避免一个列当中同时有nan和string的情况
X_train[cat_features] = X_train[cat_features].astype(str)
X_test[cat_features] = X_test[cat_features].astype(str)
# float
X_train[num_features] = X_train[num_features].astype(float)
X_test[num_features] = X_test[num_features].astype(float)

# https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier
model = catboost.CatBoostClassifier(cat_features = cat_features, random_state = 1988)
model.fit(X_train, y_train, verbose=200)
y_pred = model.predict(X_test)

# 准备submission
submission = pd.read_csv(os.path.join(dataset.titanic_path, 'gender_submission.csv')).set_index('PassengerId')
submission['Survived'] = y_pred

# 提交kaggle评估分数
baseline_score = mock_kaggle_score(submission)
print('Baseline score', baseline_score)

Learning rate set to 0.009807
0:	learn: 0.6889992	total: 57.9ms	remaining: 57.8s
200:	learn: 0.3949931	total: 519ms	remaining: 2.06s
400:	learn: 0.3603771	total: 883ms	remaining: 1.32s
600:	learn: 0.3397886	total: 1.28s	remaining: 850ms
800:	learn: 0.3207523	total: 1.68s	remaining: 417ms
999:	learn: 0.3003761	total: 2.11s	remaining: 0us
Baseline score 0.7607655502392344


In [7]:
# 第五步 特征工程 or 算法
# 该步骤是竞赛中的主要环节
# 可能需要反复调试，每次调试我们称作一个实验experiment
# 可能需要记下来每一次实验的结果来决定 实验方向

# 首先，现在我们有Baseline了 我们需要的是更好的方法来提高分布（public board）
# 这个时候，有很多种实验方向
# 1. 特征工程方向（做新特征或者对现有特征做transform）
# 2. 算法方向（选择新算法或者调参）

# 一般来说对于风控比赛来说，特征更重要一些，建议首先尝试

In [8]:
# 算法方向 示例
# 拆分训练集验证集+Early Stopping+多次建模Voting

from sklearn.model_selection import ShuffleSplit

sample = pd.read_csv(os.path.join(dataset.titanic_path, 'gender_submission.csv')).set_index('PassengerId')

predictions = sample[[]].copy()

for i, (train_index, test_index) in enumerate(ShuffleSplit(n_splits=100, test_size=0.1,random_state=1988).split(X_train)):
    X_t, y_t = X_train.iloc[train_index], y_train.iloc[train_index]
    X_v, y_v = X_train.iloc[test_index], y_train.iloc[test_index]
    
    model = catboost.CatBoostClassifier(n_estimators=1000, random_state = 1988)
    model.fit(X_t, y_t, cat_features=cat_features, verbose = False, early_stopping_rounds=20, eval_set=[(X_v, y_v)])
    
    y_pred = model.predict(X_test)
    y_pred = pd.Series(y_pred,index=X_test.index)
    
    predictions[f'Round_{i}'] = y_pred.loc[predictions.index]

predictions.head()

Unnamed: 0_level_0,Round_0,Round_1,Round_2,Round_3,Round_4,Round_5,Round_6,Round_7,Round_8,Round_9,...,Round_90,Round_91,Round_92,Round_93,Round_94,Round_95,Round_96,Round_97,Round_98,Round_99
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
893,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
896,1,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [9]:
## 接上边 做很多次预测然后来做一个Voting Classifier
pred = (predictions.sum(axis=1) >= 0.5*predictions.shape[1]).astype(int)
my_submission = sample.copy()
my_submission['Survived'] = pred

score = mock_kaggle_score(my_submission)
# 用算法方案 score 提升
print('current', score, 'baseline', baseline_score, 'percentage', (score - baseline_score)/baseline_score*100,'%')

current 0.7703349282296651 baseline 0.7607655502392344 percentage 1.2578616352201324 %


In [10]:
# 特征工程方向
# 一般方法
# 数值型特征：缩放处理（但是对于tree系列model来说，意义不大）
# 分类型特征：OneHotEncode，LabelEncode，CountEncode

# demo
train['is_train'] = 1
test['is_train'] = 0
df = pd.concat([train, test], axis=0, sort=True)
# all in one
print(train.shape, test.shape)
print(df.shape)


Xy = df[[]].copy()
Xy['y'] = df.Survived

# features
Xy['num_fe_Pclass_cnt'] = category_encoders.count.CountEncoder().fit_transform(df['Pclass'])
Xy['num_fe_Pclass'] = df['Pclass'].astype(float)
Xy['cat_fe_Sex'] = df['Sex'].copy()
Xy['num_fe_Sex'] = category_encoders.count.CountEncoder().fit_transform(df['Sex'])

def cvt_children(v):
    if v == 0:
        return 0
    elif v == 1:
        return 10
    elif v == 2:
        return 20
    else:
        return 30
    
Xy['num_fe_SibSp'] = df.SibSp.apply(cvt_children)
Xy['num_fe_SibSp_cnt'] = category_encoders.count.CountEncoder().fit_transform(Xy['num_fe_SibSp'])
Xy['cat_fe_SibSp'] = Xy['num_fe_SibSp'].astype(str)
Xy['num_fe_Parch'] = df.Parch.apply(cvt_children)
Xy['num_fe_Parch_cnt'] = category_encoders.count.CountEncoder().fit_transform(Xy['num_fe_Parch'])
Xy['cat_fe_Parch'] = Xy['num_fe_Parch'].astype(str)
Xy['num_fe_Fare'] = df['Fare']
Xy['cat_fe_Embarked'] = df['Embarked'].astype(str)
Xy['num_fe_Embarked'] = category_encoders.count.CountEncoder().fit_transform(Xy['cat_fe_Embarked'])
Xy['num_fe_Age'] = df['Age'].astype('float')

feature_columns = [col for col in Xy.columns if col.startswith('num_') or col.startswith('cat_')]
cat_columns = [col for col in feature_columns if col.startswith('cat_')]
num_columns = [col for col in feature_columns if col.startswith('num_')]

X_train, y_train = Xy[df.is_train==1][feature_columns], Xy[df.is_train==1]['y'].astype(int)
X_test = Xy[df.is_train==0][feature_columns]

# https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier
model = catboost.CatBoostClassifier(cat_features = cat_columns, random_state = 1988)
model.fit(X_train, y_train, verbose=200)
y_pred = model.predict(X_test)

# 准备submission
submission = pd.read_csv(os.path.join(dataset.titanic_path, 'gender_submission.csv')).set_index('PassengerId')
submission['Survived'] = y_pred

# 提交kaggle评估分数
score = mock_kaggle_score(submission)
# 特征方案提升 1.2%
print('current', score, 'baseline', baseline_score, 'percentage', (score - baseline_score)/baseline_score*100,'%')

# 再用算法方案
sample = pd.read_csv(os.path.join(dataset.titanic_path, 'gender_submission.csv')).set_index('PassengerId')
predictions = sample[[]].copy()
for i, (train_index, test_index) in enumerate(ShuffleSplit(n_splits=100, test_size=0.1,random_state=12384).split(X_train)):
    X_t, y_t = X_train.iloc[train_index], y_train.iloc[train_index]
    X_v, y_v = X_train.iloc[test_index], y_train.iloc[test_index]
    model = catboost.CatBoostClassifier(n_estimators=1000, random_state = 1988)
    model.fit(X_t, y_t, cat_features=cat_columns, verbose = False, early_stopping_rounds=20, eval_set=[(X_v, y_v)])
    y_pred = model.predict(X_test)
    y_pred = pd.Series(y_pred,index=X_test.index)
    predictions[f'Round_{i}'] = y_pred.loc[predictions.index]

## 接上边 做很多次预测然后来做一个Voting Classifier
pred = (predictions.sum(axis=1) >= 0.5*predictions.shape[1]).astype(int)
my_submission = sample.copy()
my_submission['Survived'] = pred
score1 = mock_kaggle_score(my_submission)
# 用算法方案 score 提升有限
print('current', score1, 'last', score, 'percentage', (score1 - score)/score*100,'%')


(891, 12) (418, 11)
(1309, 12)
Learning rate set to 0.009807
0:	learn: 0.6869937	total: 2.47ms	remaining: 2.46s
200:	learn: 0.3877593	total: 486ms	remaining: 1.93s
400:	learn: 0.3535887	total: 956ms	remaining: 1.43s
600:	learn: 0.3340879	total: 1.44s	remaining: 954ms
800:	learn: 0.3175586	total: 1.92s	remaining: 478ms
999:	learn: 0.2976561	total: 2.4s	remaining: 0us
current 0.7679425837320574 baseline 0.7607655502392344 percentage 0.943396226415092 %


In [None]:
# 以上特征工程的做法，每个竞赛的人都能想到，这个时候就需要定制特征
# 你要怎么做呢

# 特征上的思路

# 利用Text的意义
print(df['Name'])

# 名字可能有很多意思

import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str(big_string).find(substring) != -1:
            return substring
    return np.nan

def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

def create_title(df):     
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                        'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                        'Don', 'Jonkheer']
    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))

    df['Title']=df.apply(replace_titles, axis=1)
    return df['Title']
    
Xy['cat_fe_title'] = create_title(df.copy())

def create_adult(df):
    df['Adult'] = np.where((df['cat_fe_title'] == 'Mr') | (df['cat_fe_title'] =='Mrs'), 1, 0)
    return df['Adult']

Xy['cat_fe_audit'] = create_adult(Xy.copy())


cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']

def create_cabin(df):
    df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
    return df['Deck'].astype(str)
                                
deck = create_cabin(df.copy())
Xy['cat_fe_deck'] = deck

def create_family(df):
    df['Family'] = df['SibSp'] + df['Parch'] + 1
    df.drop(columns=[ "SibSp","Parch"], inplace=True)
    return df['Family']
    
Xy['num_fe_family'] = create_family(df.copy())

# 超级特征
# https://upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Olympic_%26_Titanic_cutaway_diagram.png/402px-Olympic_%26_Titanic_cutaway_diagram.png
deck_normalized = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8}


def encode_cabin(df):
    df.replace({"Deck": deck_normalized}, inplace=True)
    df['Deck'].fillna(value=9, inplace=True)
    return df['Deck']

df['Deck'] = deck
cabin = encode_cabin(df.copy())
Xy['num_fe_cabin'] = cabin.astype(float)

feature_columns = [col for col in Xy.columns if col.startswith('num_') or col.startswith('cat_')]
cat_columns = [col for col in feature_columns if col.startswith('cat_')]
num_columns = [col for col in feature_columns if col.startswith('num_')]

X_train, y_train = Xy[df.is_train==1][feature_columns], Xy[df.is_train==1]['y'].astype(int)
X_test = Xy[df.is_train==0][feature_columns]

# https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier
model = catboost.CatBoostClassifier(cat_features = cat_columns, random_state = 1988)
model.fit(X_train, y_train, verbose=200)
y_pred = model.predict(X_test)

# 准备submission
submission = pd.read_csv(os.path.join(dataset.titanic_path, 'gender_submission.csv')).set_index('PassengerId')
submission['Survived'] = y_pred

# 提交kaggle评估分数
score2 = mock_kaggle_score(submission)
# 特征方案提升 1.2%
print('current', score2, 'last', score, 'percentage', (score2 - score)/score*100,'%')

# 再用算法方案
sample = pd.read_csv(os.path.join(dataset.titanic_path, 'gender_submission.csv')).set_index('PassengerId')
predictions = sample[[]].copy()
for i, (train_index, test_index) in enumerate(ShuffleSplit(n_splits=100, test_size=0.3,random_state=12384).split(X_train)):
    X_t, y_t = X_train.iloc[train_index], y_train.iloc[train_index]
    X_v, y_v = X_train.iloc[test_index], y_train.iloc[test_index]
    model = catboost.CatBoostClassifier(random_state = 1988)
    model.fit(X_t, y_t, cat_features=cat_columns, verbose = False, early_stopping_rounds=20, eval_set=[(X_v, y_v)])
    y_pred = model.predict(X_test)
    y_pred = pd.Series(y_pred,index=X_test.index)
    predictions[f'Round_{i}'] = y_pred.loc[predictions.index]

## 接上边 做很多次预测然后来做一个Voting Classifier
pred = (predictions.sum(axis=1) >= 0.5*predictions.shape[1]).astype(int)
my_submission = sample.copy()
my_submission['Survived'] = pred
score3 = mock_kaggle_score(my_submission)
# 用算法方案 score 提升有限
print('current', score3, 'last', score2, 'percentage', (score3 - score2)/score2*100,'%')




PassengerId
1                                 Braund, Mr. Owen Harris
2       Cumings, Mrs. John Bradley (Florence Briggs Th...
3                                  Heikkinen, Miss. Laina
4            Futrelle, Mrs. Jacques Heath (Lily May Peel)
5                                Allen, Mr. William Henry
                              ...                        
1305                                   Spector, Mr. Woolf
1306                         Oliva y Ocana, Dona. Fermina
1307                         Saether, Mr. Simon Sivertsen
1308                                  Ware, Mr. Frederick
1309                             Peter, Master. Michael J
Name: Name, Length: 1309, dtype: object
Learning rate set to 0.009807
0:	learn: 0.6864671	total: 4.17ms	remaining: 4.16s
200:	learn: 0.3762544	total: 457ms	remaining: 1.82s
400:	learn: 0.3430383	total: 917ms	remaining: 1.37s
600:	learn: 0.3222002	total: 1.36s	remaining: 902ms
800:	learn: 0.3055121	total: 1.8s	remaining: 448ms
999:	learn: 0.2888439	