本次任务的思路是，先实现一个完整的开发过程，然后再逐步改进，提升精度

# 第一步导入数据包

In [None]:

import pandas as pd
# Load the Kaggle Titanic training and test splits into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Summarise the training rows for male passengers younger than 16.
is_under_16 = train_df["Age"] < 16
is_male = train_df["Sex"] == 'male'
train_df.loc[is_under_16 & is_male].info()

#  第二步观察数据

In [None]:

# Preview the first rows of the training data.
train_df.head()

In [None]:
# Inspect the columns.
#print(train_df.columns.values)
#print('-'*20)
# Print the DataFrame summary, similar to a data report.
train_df.info()
print('-'*20)
# Descriptive statistics (numeric columns).
train_df.describe()


In [None]:
# Relationship between each discrete feature and survival: print the mean of
# 'Survived' per category, highest first, with a dashed line between tables.
for position, column in enumerate(['Pclass', 'Sex', 'SibSp', 'Parch']):
    if position:
        print('-'*20)
    survival_by_group = train_df[[column, 'Survived']].groupby([column], as_index=False).mean()
    print(survival_by_group.sort_values(by='Survived', ascending=False))
# The equivalent tables for 'Age' and 'Embarked' are intentionally not printed.
#连续数值型数据比较多，通过图形查看
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:

# One 20-bin histogram of 'Age' per value of 'Survived', laid out as columns.
g = sns.FacetGrid(train_df, col='Survived', height=8.2, aspect=1.6)
g.map(plt.hist, 'Age', bins=20)

# 第三步 执行一个简单的建模与评估

In [None]:
# Baseline: fit and evaluate a simple model on a handful of complete columns.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score 
from sklearn.naive_bayes import GaussianNB

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# Alternatives that were tried:
# model =  LogisticRegression()
# model =  GaussianNB()

y = train_df["Survived"]
# Id, title, ticket number and cabin are left out; "Age", "Embarked" and "Fare"
# have missing values, so they cannot be used for modelling yet.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_df[features])


model.fit(X, y)
# Mean accuracy over 5 cross-validation folds.
scores = cross_val_score(model, X, y, cv=5)
print(scores.mean())
# model.score() on a classifier is mean accuracy on the given data (here the
# training data), not ROC AUC, so the label says what is actually printed.
print('Training accuracy: %0.3f' % model.score(X, y))



在完成一个简单的流程之后（不包括提交），接下来的任务是提高分类的准确率。需要注意的是，本地评估得到的准确率应用到测试数据（即提交后的得分）时，通常会下降2-3个百分点。
提高准确率的步骤包括：缺失数据处理、连续数据转换成离散数据、数据转换。

In [None]:
# Display the raw 'Sex' column of the training data.
train_df['Sex']

In [None]:
 
import plotly.graph_objs as go 
import plotly.offline as py
#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

# Split the training rows by outcome for the distribution plots:
# S holds the rows with Survived == 1, D the rows with Survived == 0.
survived_flag = train_df['Survived']
S = train_df[survived_flag == 1]
D = train_df[survived_flag == 0]

def plot_distribution(var_select, title):
    """Show a stacked histogram of one column, split by survival outcome.

    Reads the module-level frames ``S`` (survivors) and ``D`` (non-survivors)
    and renders the figure inline through plotly's offline ``iplot``.

    Args:
        var_select: Name of the column to plot from ``S`` and ``D``.
        title: Title displayed on the figure.
    """
    outcome_styles = (
        ('Survived', S, 'lightblue'),
        ('Died', D, 'pink'),
    )
    traces = [
        go.Histogram(
            x=frame[var_select],
            opacity=1,
            name=label,
            marker={'color': colour},
        )
        for label, frame, colour in outcome_styles
    ]

    # A secondary y-axis is configured in the layout, although neither trace
    # above is assigned to it.
    secondary_axis = {
        'range': [0, 75],
        'overlaying': 'y',
        'anchor': 'x',
        'side': 'right',
        'zeroline': False,
        'showgrid': False,
        'title': '% Died',
    }
    layout = go.Layout(
        barmode='stack',
        title=title,
        autosize=True,
        height=500,
        width=800,
        xaxis={},
        yaxis={'title': 'Count'},
        yaxis2=secondary_axis,
        legend={'x': -.1, 'y': 1.5},
        margin=go.layout.Margin(b=0),
    )

    py.iplot(go.Figure(data=traces, layout=layout), filename='Density plot')

In [None]:
# Age distribution of survivors vs. non-survivors. The title now names the
# column that is actually plotted (it previously said 'Name_Length').
plot_distribution('Age', 'Age vs Survived')
 
 

# 第四步  特征工程

In [None]:
# Handle missing values (Age, Embarked, Fare) on the combined data set.
# Tag each split so the rows can be separated again after feature engineering.
test_df['Type'] = 2   # marks rows that come from the test set
train_df['Type'] = 1  # marks rows that come from the training set
combine = [train_df, test_df]
# ignore_index=True gives every row a unique index label. Without it both
# splits keep their own 0..n labels, and label-based operations such as
# DataFrame.drop(label) would hit a training row and a test row at once.
df = pd.concat(combine, ignore_index=True)

# Fare: fill missing values with the median fare of 3rd-class passengers.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is not guaranteed to modify df (pandas copy-on-write).
fa = df[df["Pclass"]==3]
df['Fare'] = df['Fare'].fillna(fa['Fare'].median())
# Embarked: fill missing values with 'S' (per the original note, the most
# frequent port of embarkation).
df.loc[(df.Embarked.isnull()) ,'Embarked'] = 'S'
# --------------------------- Derive a Title feature from Name
# The title is the word that directly precedes a '.' in the name. A raw string
# keeps '\.' a regex escape rather than an invalid Python string escape.
df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Replacing rare titles 
mapping = {'Mlle': 'Miss', 
           'Ms': 'Miss', 
           'Mme': 'Mrs',
           'Major': 'Other', 
           'Col': 'Other', 
           'Dr' : 'Other', 
           'Rev' : 'Other',
           'Capt': 'Other', 
           'Jonkheer': 'Royal',
           'Sir': 'Royal', 
           'Lady': 'Royal', 
           'Don': 'Royal',
           'Countess': 'Royal', 
           'Dona': 'Royal'}
df.replace({'Title': mapping}, inplace=True)
titles = ['Miss', 'Mr', 'Mrs', 'Royal', 'Other', 'Master']
# Fill each missing age with the median age of passengers sharing that title.
# The medians are indexed by title label (sorted alphabetically by groupby), so
# they must be looked up by label. Looking them up by the position of the title
# in `titles` pairs most titles with another title's median.
median_age_by_title = df.groupby('Title')['Age'].median()
for title in titles:
    age_to_impute = median_age_by_title[title]
    df.loc[(df['Age'].isnull()) & (df['Title'] == title), 'Age'] = age_to_impute
# #船票
# df['Ticket1'] = df.Ticket.str.extract('(\S+\s)', expand=False)
# df['Ticket2'] = df.Ticket.str.extract('(\s\d+)', expand=False)

# Family size: parents/children + siblings/spouses + the passenger.
df['Family_Size'] = df['Parch'] + df['SibSp'] + 1
# Size band: 'Big' above 4 members, 'Small' for 2-4, otherwise 'Alone'.
df['FsizeD'] = df['Family_Size'].map(
    lambda size: 'Big' if size > 4 else ('Small' if size > 1 else 'Alone')
)
# Child flag: 1 unless the age is known to be 18 or more.
df['Child'] = (~(df['Age'] >= 18)).astype(int)

# Family / travel-group survival feature.
# For every passenger, look at the known outcomes of the *other* members of the
# same group: 1 if any of them survived, 0 if none survived but at least one is
# known to have died, and the 0.5 default when nothing is known.
df['Last_Name'] = df['Name'].apply(lambda x: str.split(x, ",")[0])

DEFAULT_SURVIVAL_VALUE = 0.5
df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

# Pass 1: a "family" is a set of passengers sharing last name and fare.
for grp, grp_df in df[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            passID = row['PassengerId']
            # Exclude the current passenger by PassengerId rather than by index
            # label: the combined frame's index labels are not guaranteed to be
            # unique, so dropping by label could remove other members as well.
            # (Assumes PassengerId is unique per row, as the updates below do.)
            others = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
            smax = others.max()
            smin = others.min()
            if (smax == 1.0):
                df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      df.loc[df['Family_Survival']!=0.5].shape[0])

# Pass 2: passengers sharing a ticket. Only rows not already marked 1 are
# revisited, so a positive result from pass 1 is never overwritten.
for _, grp_df in df.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                passID = row['PassengerId']
                others = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
                smax = others.max()
                smin = others.min()
                if (smax == 1.0):
                    df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(df[df['Family_Survival']!=0.5].shape[0]))

# Flag names of 32 or more characters (per the original note, this gave a
# small improvement after submission).
df['Name_Length'] = df['Name'].map(len) >= 32

增加船舱信息
![船舱位置示意图](https://vignette.wikia.nocookie.net/titanic/images/f/f9/Titanic_side_plan.png/revision/latest?cb=20180322183733)

In [None]:
# Deck = first character of the cabin code; passengers without a cabin are
# labelled 'Unknown' first, so their deck becomes 'U'.
cabin_filled = df['Cabin'].fillna('Unknown')
df['Cabin'] = cabin_filled
df['Deck'] = cabin_filled.str[0]
sns.barplot(x="Deck", y="Survived", data=df, palette='Set3')

In [None]:
import numpy as np

# Fare bands. Per the original note, binning fare and class has a small
# effect on the score, about 0.002.
fare_edges = [-1, 7.91, 14.454, 31, 99, 250, np.inf]
df['Fare_Bin'] = pd.cut(df['Fare'], fare_edges, labels=list('abcdef'))
# Age bands.
age_edges = [0, 15, 29, 45, 90]
df['Age_Grp'] = pd.cut(df['Age'], age_edges, labels=list('abcd'))
sns.barplot(x="Age_Grp", y="Survived", data=df, palette='Set3')
# Passenger class re-coded as a letter so it is treated as categorical.
dict_class = {1: 'a', 2: 'b', 3: 'c'}
df['Pclass2'] = df['Pclass'].map(dict_class)

In [None]:
# Extra engineered features; per the original note these two change the
# result by roughly 0.01.
df['Fare_Per_Person'] = df['Fare'] / df['Family_Size']
df['Age_Class'] = df['Age'] * df['Pclass']
# Number of passengers sharing each ticket (original note: not found to be
# valuable).
Ticket_Count = dict(df['Ticket'].value_counts())
df['TicketGroup'] = df['Ticket'].map(Ticket_Count)
def Ticket_Label(s):
    """Bucket a ticket's passenger count into a coarse group label.

    Args:
        s: Number of passengers travelling on the same ticket.

    Returns:
        2 for counts from 2 to 4, 1 for a count of exactly 1 or a count above
        4 and up to 8, 0 for counts above 8, and None for any other value.
    """
    if 2 <= s <= 4:
        return 2
    if s == 1 or 4 < s <= 8:
        return 1
    if s > 8:
        return 0
    return None
# Turn the shared-ticket count into a categorical label.
df['TicketGroup'] = df['TicketGroup'].apply(Ticket_Label)
# Ticket_Label returns 0, 1 or 2. The class mapping {1, 2, 3} has no entry for
# 0, so the largest groups would silently become NaN; use a dedicated mapping
# that covers every label.
ticket_group_names = {0: 'c', 1: 'a', 2: 'b'}
df['TicketGroup'] = df['TicketGroup'].map(ticket_group_names)
sns.barplot(x='TicketGroup', y='Survived', data=df, palette='Set3')

# Group passengers by surname and rewrite part of the TEST rows so that members
# of certain families carry attributes associated with dying / surviving.
df['Surname']=df['Name'].apply(lambda x:x.split(',')[0].strip())
Surname_Count = dict(df['Surname'].value_counts())
df['FamilyGroup'] = df['Surname'].apply(lambda x:Surname_Count[x])
in_family = df['FamilyGroup'] >= 2
Female_Child_Group = df.loc[in_family & ((df['Age']<=12) | (df['Sex']=='female'))]
Male_Adult_Group = df.loc[in_family & (df['Age']>12) & (df['Sex']=='male')]
# Surnames whose women/children have a mean 'Survived' of exactly 0.
Female_Child_Group = Female_Child_Group.groupby('Surname')['Survived'].mean()
Dead_List = set(Female_Child_Group[Female_Child_Group == 0].index)
# Surnames whose adult men have a mean 'Survived' of exactly 1.
Male_Adult_List = Male_Adult_Group.groupby('Surname')['Survived'].mean()
Survived_List = set(Male_Adult_List[Male_Adult_List == 1].index)

# The masks depend only on Surname and Type, so compute each one once.
is_test_row = df['Type'] == 2
in_dead_family = df['Surname'].isin(Dead_List) & is_test_row
in_survived_family = df['Surname'].isin(Survived_List) & is_test_row

# NOTE(review): Child, Age_Grp and Age_Class are derived from Age earlier in
# the notebook, and Age itself is dropped before modelling, so the Age
# overrides below appear to have no effect on the model. Only Sex and Title
# reach the features. Confirm whether the Age-based features should be
# recomputed after this step.
df.loc[in_dead_family, 'Sex'] = 'male'
df.loc[in_dead_family, 'Age'] = 60
df.loc[in_dead_family, 'Title'] = 'Mr'
# Applied second, so a surname present in both sets ends up with these values.
df.loc[in_survived_family, 'Sex'] = 'female'
df.loc[in_survived_family, 'Age'] = 5
df.loc[in_survived_family, 'Title'] = 'Miss'

# df.head()

# 选取特征字段，拆分训练数据和测试数据

In [None]:
# Print the summary of the combined, feature-engineered frame.
df.info()

In [None]:
# Drop the raw columns that were replaced by engineered features, then one-hot
# encode the remaining categorical columns.
# 'Surname' is dropped together with 'Last_Name': both hold the family name,
# and leaving either in would make get_dummies create one indicator column per
# distinct surname.
df_new = df.drop(['SibSp','Parch','Name','Ticket','Cabin','Last_Name','Surname',
                  'Fare','Age','Pclass','Family_Size'],axis=1)
trans_df = pd.get_dummies(df_new)


# 使用sklearn 的特征选择类，效果有提升，但是不如人工判断选择字段效果好，二者非常接近

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Separate the encoded frame back into its training and test rows.
train_df = trans_df[trans_df['Type'] == 1]
test_df = trans_df[trans_df['Type'] == 2]
y = train_df["Survived"]
X = train_df.drop(columns=['Survived', 'PassengerId', 'Type'])

# Keep the 16 columns with the highest chi-squared score against the target.
X_new = SelectKBest(chi2, k=16).fit(X, y)
cols = X_new.get_support(indices=True)

X = X.iloc[:, cols]
X.head()

 

In [None]:
# Show the names of the columns currently in X.
print(X.columns.values)

In [None]:
# #去掉id、title、票号、仓位号；  ,"Embarked","Fare","Title"Age 
# features = ["PassengerId","Type","Survived","Pclass", "Sex","FsizeD","Family_Survival","Child","Title","Name_Length","Deck"]
# trans_df = pd.get_dummies(df[features])
# train_df = trans_df.loc[ trans_df['Type'] == 1, :] 
# test_df = trans_df.loc[ trans_df['Type'] == 2, :] 


# 训练模型，预测测试数据集，并输出结果

In [None]:
# Split the encoded frame back into training and test rows.
train_df = trans_df.loc[ trans_df['Type'] == 1, :] 
test_df = trans_df.loc[ trans_df['Type'] == 2, :] 

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression

# X and y are reused from the feature-selection step; uncomment to train on
# every engineered column instead.
# y = train_df["Survived"]
# X = train_df.drop(['Survived','PassengerId','Type'],axis=1)

model2 = RandomForestClassifier(n_estimators=200,min_samples_leaf=5)
# model2 = LogisticRegression()
model2.fit(X, y)
# Mean accuracy over 5 cross-validation folds.
scores = cross_val_score(model2, X, y, cv=5)
print(scores.mean())
# model.score() on a classifier is mean accuracy on the given data (here the
# training data), not ROC AUC, so the label says what is actually printed.
print('Training accuracy: %0.3f' % model2.score(X, y))
# Predict the test rows using exactly the columns the model was trained on,
# then save the submission file.
X_test = test_df[X.columns]
predictions = model2.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
# Cast the predicted labels to integers for the submission file.
output['Survived'] = output['Survived'].astype(int)
output.to_csv('my_submission5.csv', index=False)
print("Your submission was successfully saved!")

# 加入多分类器的处理方式

In [None]:
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Two base learners combined by a VotingClassifier (default voting mode).
log = LogisticRegression(penalty='l2', solver='liblinear', C=0.25)
xgb_settings = {
    'learning_rate': 0.01,
    'n_estimators': 860,
    'max_depth': 3,
    'subsample': 1,
    'colsample_bytree': 1,
    'gamma': 6,
    'reg_alpha': 14,
    'reg_lambda': 3,
}
xgb = XGBClassifier(**xgb_settings)
classifier = VotingClassifier(estimators=[('XGB', xgb), ('log', log)])
classifier.fit(X, y)

# Mean accuracy over 5 cross-validation folds.
accuracies = cross_val_score(classifier, X, y, cv=5)
print("5 fold cross validation accuracies {}".format(accuracies.mean()))

# Predict the test rows on the same columns as X and write the submission.
feature_columns = X.columns.values
X_test = test_df.drop(columns=['Survived', 'PassengerId', 'Type'])[feature_columns]
predictions = classifier.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output['Survived'] = output['Survived'].astype(int)
output.to_csv('my_submission3.csv', index=False)

# 让分类器具备采用不同特征的能力，即引入pipeline。效果是进一步提升了0.5

In [None]:
import sklearn.pipeline as pip

# Each branch selects its own features before classifying, so the two models
# in the ensemble can work from different column subsets.
xgb_steps = [
    ('feature_select', SelectKBest(chi2, k=18)),
    ('classifier', XGBClassifier(
        learning_rate=0.01,
        n_estimators=700,
        max_depth=3,
        subsample=1,
        colsample_bytree=1,
        gamma=6,
        reg_alpha=14,
        reg_lambda=3,
    )),
]
pipeline_xgb = pip.Pipeline(steps=xgb_steps)
#--------------------------
log_steps = [
    ('feature_select', SelectKBest(chi2, k=24)),
    ('classifier', LogisticRegression(penalty='l2', solver='liblinear', C=0.25)),
]
pipeline_log = pip.Pipeline(steps=log_steps)
classifier2 = VotingClassifier(estimators=[('XGB', pipeline_xgb), ('log', pipeline_log)])

# Train on every engineered column; the pipelines do the feature selection.
y = train_df["Survived"]
X = train_df.drop(columns=['Survived', 'PassengerId', 'Type'])
classifier2.fit(X, y)

# Mean accuracy over 5 cross-validation folds.
accuracies = cross_val_score(classifier2, X, y, cv=5)
print("5 fold cross validation accuracies {}".format(accuracies.mean()))

# Predict the test rows and write the submission file.
X_test = test_df.drop(columns=['Survived', 'PassengerId', 'Type'])
predictions = classifier2.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output['Survived'] = output['Survived'].astype(int)
output.to_csv('my_submission2.csv', index=False)

搜索最佳参数

In [None]:
from sklearn.model_selection import GridSearchCV

# NOTE(review): `pipe` and `param_test` are defined here but not used by the
# search below. `param_test` targets a step named 'classifier' with a `C`
# parameter, which matches pipeline_log rather than `pipe` (whose steps are
# 'select' and 'classify'). They are kept in case other cells rely on them.
pipe=pip.Pipeline([('select',SelectKBest(k=20)), 
               ('classify', RandomForestClassifier(random_state = 10, max_features = 'sqrt'))])
param_test = {
              'classifier__C':[0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6]
             }

# Grid for the XGBoost pipeline. Other grids tried one at a time:
#   'classifier__max_depth': list(range(2,12,1))
#   'classifier__n_estimators': list(range(100,500,50))
#   'classifier__colsample_bytree': list(range(1,5,1))
#   'classifier__gamma': list(range(3,9,1))
param_test_xgb = {
               'classifier__colsample_bytree':[0.8,0.9,1] 
             } 
gsearch = GridSearchCV(estimator = pipeline_xgb, param_grid = param_test_xgb, scoring='accuracy', cv=5, verbose=10)
gsearch.fit(X, y)
# The searched estimator is the XGBoost pipeline, so label the result as such
# (it was previously printed as 'randomforest').
print('xgboost',gsearch.best_params_, gsearch.best_score_)