In [1]:
"""
PassengerId：用户id
survival：是否生还，0-否，1-是
pclass：舱位，1-头等舱，2-二等，3-三等
name：姓名
sex：性别
Age：年龄
sibsp：在船上的兄弟/配偶数
parch：在船上父母/孩子数
ticket：票号
fare：票价
cabin：Cabin number；客舱号
embarked：登船地点
"""

'\nPassengerId：用户id\nsurvival：是否生还，0-否，1-是\npclass：舱位，1-头等舱，2-二等，3-三等\nname：姓名\nsex：性别\nAge：年龄\nsibsp：在船上的兄弟/配偶数\nparch：在船上父母/孩子数\nticket：票号\nfare：票价\ncabin：Cabin number；客舱号\nembarked：登船地点\n'

In [2]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

In [3]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
train_data['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
full_data = pd.concat([train_data, test_data], ignore_index=True)

In [8]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [9]:
# 1.填充缺失值
# Embarked登船地点
print(full_data[full_data['Embarked'].isnull()])  # 查看缺失值对应的数据

     PassengerId  Survived  Pclass                                       Name  \
61            62       1.0       1                        Icard, Miss. Amelie   
829          830       1.0       1  Stone, Mrs. George Nelson (Martha Evelyn)   

        Sex   Age  SibSp  Parch  Ticket  Fare Cabin Embarked  
61   female  38.0      0      0  113572  80.0   B28      NaN  
829  female  62.0      0      0  113572  80.0   B28      NaN  


In [10]:
full_data['Embarked'] = full_data.Embarked.fillna('C')  # Pclass=1，Fare=80，Embarked=C

In [11]:
# Age年龄
full_data['Age'] = full_data.Age.fillna(full_data.Age.mean())  # 用平均值填充
# Fare票价
full_data['Fare'] = full_data.Fare.fillna(full_data.Fare.mean())  # 用平均值填充

In [12]:
# 2.生成新特征
# 生成Title特征
full_data['Title'] = full_data.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [13]:
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty"
}

In [14]:
full_data['Title'] = full_data['Title'].map(Title_Dictionary)

In [15]:
print(full_data['Title'].value_counts())

Title
Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: count, dtype: int64


In [16]:
# 生成FamilySize特征
full_data['familyNum'] = full_data['Parch'] + full_data['SibSp'] + 1

In [17]:
# 我们按照家庭成员人数多少，将家庭规模分为“小、中、大”三类：
def family_size(family_num):
    if family_num == 1:
        return 0
    elif (family_num >= 2) & (family_num <= 4):
        return 1
    else:
        return 2

In [18]:
full_data['familySize'] = full_data['familyNum'].map(family_size)

In [19]:
print(full_data['familySize'].value_counts())

familySize
0    790
1    437
2     82
Name: count, dtype: int64


In [20]:
# 取Cabin首字符作为相关特征
full_data['Cabin'] = full_data.Cabin.fillna('U')
full_data['Cabin'] = full_data['Cabin'].map(lambda c: c[0])

In [21]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        1309 non-null   object 
 11  Embarked     1309 non-null   object 
 12  Title        1309 non-null   object 
 13  familyNum    1309 non-null   int64  
 14  familySize   1309 non-null   int64  
dtypes: float64(3), int64(6), object(6)
memory usage: 153.5+ KB


In [22]:
full_data.to_csv("/kaggle/working/prepare_data.csv")

In [23]:
full_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,familyNum,familySize
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,U,S,Mr,2,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,Mrs,2,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,U,S,Miss,1,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,Mrs,2,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,U,S,Mr,1,0


In [24]:
# 3.删除无用特征
test_new_data_with_id = full_data.iloc[891:, :].copy()
full_data = full_data.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'familyNum'], axis=1)

In [25]:
full_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,familySize
0,0.0,3,male,22.0,7.25,U,S,Mr,1
1,1.0,1,female,38.0,71.2833,C,C,Mrs,1
2,1.0,3,female,26.0,7.925,U,S,Miss,0
3,1.0,1,female,35.0,53.1,C,S,Mrs,1
4,0.0,3,male,35.0,8.05,U,S,Mr,0


In [26]:
# 划分训练集和测试集
train_new_data = full_data.iloc[:891, :]
test_new_data = full_data.iloc[891:, :]
train_x = train_new_data.drop(['Survived'], axis=1)
train_y = train_new_data['Survived']
test_x = test_new_data.drop(['Survived'], axis=1)

In [27]:
# 创建OrdinalEncoder实例
encoder = preprocessing.OrdinalEncoder()
# 创建StandardScaler实例
scaler = preprocessing.StandardScaler()
# 定义需要进行编码的列
columns_to_encode = ['Sex', 'Embarked', 'Cabin', 'Title']

In [28]:
# 创建ColumnTransformer实例，用于对指定列进行转换
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', encoder, columns_to_encode)
    ],
    remainder='passthrough'  # 保留未指定的列
)

In [29]:
train_x_encoded = column_transformer.fit_transform(train_x)
test_x_encoded = column_transformer.transform(test_x)

In [30]:
train_x_encoded_std = scaler.fit_transform(train_x_encoded)
test_x_encoded_std = scaler.transform(test_x_encoded)

In [31]:
train_x_encoded_std[0]

array([ 0.73769513,  0.58937471,  0.52206745,  0.11966255,  0.82737724,
       -0.59525389, -0.50244517,  0.8557739 ])

In [32]:
test_x_encoded_std[0]

array([ 0.73769513, -0.66993673,  0.52206745,  0.11966255,  0.82737724,
        0.36666048, -0.49078316, -0.74947777])

In [33]:
# 4.模型训练
# 设置kfold，交叉采样法拆分数据集
kfold = StratifiedKFold(n_splits=10)

In [34]:
# 汇总不同模型算法
classifiers = [SVC(), DecisionTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(),
               GradientBoostingClassifier(), KNeighborsClassifier(), LogisticRegression(), LinearDiscriminantAnalysis()]

In [35]:
# 不同机器学习交叉验证结果汇总
cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, train_x_encoded_std, train_y,
                                      scoring='accuracy', cv=kfold, n_jobs=-1))

In [36]:
# 求出模型得分的均值和标准差
cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

In [37]:
# 汇总数据
cvResDf = pd.DataFrame({'cv_mean': cv_means,
                        'cv_std': cv_std,
                        'algorithm': ['SVC', 'DecisionTreeCla', 'RandomForestCla', 'ExtraTreesCla',
                                      'GradientBoostingCla', 'KNN', 'LR', 'LinearDiscrimiAna']})
print(cvResDf)

    cv_mean    cv_std            algorithm
0  0.836142  0.037652                  SVC
1  0.793583  0.057081      DecisionTreeCla
2  0.824969  0.037009      RandomForestCla
3  0.808127  0.040376        ExtraTreesCla
4  0.842921  0.043580  GradientBoostingCla
5  0.823820  0.035747                  KNN
6  0.786729  0.017690                   LR
7  0.796829  0.028353    LinearDiscrimiAna


In [38]:
GBC = GradientBoostingClassifier()
gb_param_grid = {'loss': ['exponential', 'log_loss'],
                 'n_estimators': [100, 200, 300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]
                 }
model_GBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold,
                         scoring="accuracy")
model_GBC.fit(train_x_encoded_std, train_y)
print(f"GradientBoostingClassifier模型得分：{model_GBC.best_score_}")
print(f"GradientBoostingClassifier最优参数：{model_GBC.best_params_}")

GradientBoostingClassifier模型得分：0.826067415730337
GradientBoostingClassifier最优参数：{'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 8, 'max_features': 0.3, 'min_samples_leaf': 100, 'n_estimators': 300}


In [39]:
best_model = model_GBC.best_estimator_
pred_test_y = best_model.predict(test_x_encoded_std)

In [40]:
output = pd.DataFrame(
    {"PassengerId": test_new_data_with_id["PassengerId"], "Survived": pred_test_y.astype("int64")}
)

In [41]:
output.to_csv("/kaggle/working/titanic_gbc_submission.csv", index=False)