In [246]:
import pandas as pd
import numpy as np
import re

import warnings
warnings.filterwarnings('ignore')

In [271]:
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

## Analyze the Data

可以看到cabin的na很多，ticket的意义不大，这两个feature直接丢掉。

In [272]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [273]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [274]:
data.drop(['Ticket', 'Cabin'], axis=1, inplace=True)

In [275]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


name 里面有个title和家族名值得提取，可能会有有用的信息。

In [276]:
data['Titles'] = [re.search(r'\b([A-Za-z]+)\.', name)[0] for name in data['Name']]
data['Family'] = [re.search(r'\b([A-Za-z]+)\,', name)[0] for name in data['Name']]

In [277]:
#data.drop(['Name'], axis=1, inplace=True)
data.drop(['Family', 'Name'], axis=1, inplace=True)

In [278]:
data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Titles
count,891.0,891.0,891.0,891,714.0,891.0,891.0,891.0,889,891
unique,,,,2,,,,,3,17
top,,,,male,,,,,S,Mr.
freq,,,,577,,,,,644,517
mean,446.0,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,
std,257.353842,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,
min,1.0,0.0,1.0,,0.42,0.0,0.0,0.0,,
25%,223.5,0.0,2.0,,20.125,0.0,0.0,7.9104,,
50%,446.0,0.0,3.0,,28.0,0.0,0.0,14.4542,,
75%,668.5,1.0,3.0,,38.0,1.0,0.0,31.0,,


我们的特征已经处理好了，里面还有些缺失值，先不管
编码一下字符特征

In [281]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for str_feature in ['Titles', 'Sex', 'Embarked']:
    data[str_feature] = le.fit_transform(data[str_feature])

In [282]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Titles
0,1,0,3,1,22.0,1,0,7.25,2,12
1,2,1,1,0,38.0,1,0,71.2833,0,13
2,3,1,3,0,26.0,0,0,7.925,2,9
3,4,1,1,0,35.0,1,0,53.1,2,13
4,5,0,3,1,35.0,0,0,8.05,2,12


In [283]:
def data_preprocessing(data):
    data.drop(['Ticket', 'Cabin'], axis=1, inplace=True)
    data['Titles'] = [re.search(r'\b([A-Za-z]+)\.', name)[0] for name in data['Name']]
    data.drop(['Name'], axis=1, inplace=True)
    
    for str_feature in ['Titles', 'Sex', 'Embarked']:
        data[str_feature] = le.fit_transform(data[str_feature])
    
    return data

In [284]:
test = data_preprocessing(test)

# Train test split

In [285]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(data, test_size=0.2, shuffle=True, random_state=22)

x = data.drop(['Survived'], axis=1)
y = data['Survived']

x_train, x_test = x.loc[data_train.index], x.loc[data_test.index]
y_train, y_test = y.loc[data_train.index], y.loc[data_test.index]

# Model Training

## LGBM

In [312]:
from lightgbm import LGBMClassifier
params = cv.best_params_
lgbm_clf = LGBMClassifier(**params, random_state=3333)
lgbm_clf.fit(x_train, y_train)

LGBMClassifier(colsample_bytree=0.7, learning_rate=0.01, min_child_samples=10,
               random_state=3333, reg_alpha=0, reg_lambda=1.0, subsample=0.7)

In [313]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = lgbm_clf.predict(x_test)
accuracy_score(y_train, lgbm_clf.predict(x_train)), accuracy_score(y_test, y_pred)

(0.8820224719101124, 0.8100558659217877)

In [314]:
roc_auc_score(y_train, lgbm_clf.predict(x_train)), roc_auc_score(y_test, y_pred)

(0.8551569918312515, 0.7806324110671937)

In [293]:
## grid search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.01, 0.1],
    'min_child_samples': [10, 30],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9],
    'reg_alpha': [0, 1.0],
    'reg_lambda': [0, 1.0],
}

# 创建GridSearchCV对象
cv = GridSearchCV(
    estimator=lgbm_clf,
    param_grid=param_grid,
    scoring='roc_auc',  # 选择适当的评估指标
    cv=3,  # 选择适当的交叉验证折数
    n_jobs=3,
    verbose=1,
)

# 在训练集上拟合GridSearchCV对象
cv.fit(x_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


GridSearchCV(cv=3, estimator=LGBMClassifier(), n_jobs=3,
             param_grid={'colsample_bytree': [0.7, 0.9],
                         'learning_rate': [0.01, 0.1],
                         'min_child_samples': [10, 30], 'reg_alpha': [0, 1.0],
                         'reg_lambda': [0, 1.0], 'subsample': [0.7, 0.9]},
             scoring='roc_auc', verbose=1)

# Ensemble

In [None]:
lgbm_clf = LGBMClassifier(**params)
lgbm_clf.fit(x_train, y_train)

In [50]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lgbm', lgbm_clf), ('xgb', xgb_clf)], voting='soft'
)
voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lgbm', LGBMClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None, max_bin=No

In [51]:
accuracy_score(y_train, voting_clf.predict(x_train)), accuracy_score(y_test, voting_clf.predict(x_test))

(1.0, 0.8212290502793296)

# OUTPUT

In [315]:
test_result = test.copy()
test_result['Survived'] = lgbm_clf.predict(test)
test_result = test_result[['PassengerId', 'Survived']]
test_result.to_csv('test_result.csv', index=False)