In [17]:
import os
import pickle

import numpy as np
import pandas as pd

In [16]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [2]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and also returned. Only plain NumPy
    signed-integer and floating-point columns are touched; every other dtype
    (object/string, bool, unsigned, datetime, categorical, pandas extension
    dtypes) is left unchanged. Columns are never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    use_float16 : bool, default True
        When True, float columns may be narrowed to ``float16``. The dtype is
        chosen by value *range* only, so precision can be lost, and reductions
        computed directly on a float16 column can overflow. Pass False to stop
        narrowing at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If a column's values fit none of the candidate dtypes (only possible
        for floats wider than float64).
    """
    start_mem = df.memory_usage().sum() / 2 ** 20
    print(f"Memory usage of initial dataframe is {start_mem:.2f} MiB.")

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes are not np.dtype instances and have no usable `.kind` here.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind == "i":
            candidates, limits = int_types, np.iinfo
            values = df[col]
        elif kind == "f":
            candidates, limits = float_types, np.finfo
            # NaN and +/-inf exist in every float width, so they must not drive the choice.
            values = df[col][np.isfinite(df[col])]
        else:
            continue

        if len(values) == 0:
            # Nothing finite to preserve: the narrowest candidate is always safe.
            target = candidates[0]
        else:
            col_min, col_max = values.min(), values.max()
            # Inclusive bounds: a value equal to a dtype's limit still fits that dtype.
            target = next(
                (t for t in candidates if limits(t).min <= col_min and col_max <= limits(t).max),
                None,
            )
            if target is None:
                raise ValueError(f"too large: column {col!r} does not fit any supported dtype")

        # Only ever narrow; e.g. an existing float16 column stays float16 when use_float16=False.
        if np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 2 ** 20
    print(f"Memory usage of the dataframe after optimization is {end_mem:.2f} MiB.")
    d_percent = (start_mem - end_mem) / start_mem * 100 if start_mem else 0.0
    print(f"Decreased by {d_percent:.1f}%.")
    return df

In [3]:
# Columns removed from both the training and the test frame right after loading.
drop_columns = ["Name", "Ticket", "Cabin"]

In [9]:
# Load the training CSV, discard the columns listed in drop_columns, then
# downcast the numeric columns; preview the first rows as the cell output.
df_train = (
    pd.read_csv("data/train.csv")
    .drop(columns=drop_columns)
    .pipe(reduce_mem_usage)
)
df_train.head()

Memory usage of initial dataframe is 0.06 MiB.
Memory usage of the dataframe after optimization is 0.02 MiB.
Decreased by 63.8%.


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.3125,C
2,3,1,3,female,26.0,0,0,7.925781,S
3,4,1,1,female,35.0,1,0,53.09375,S
4,5,0,3,male,35.0,0,0,8.046875,S


In [10]:
# Inspect the dtypes after downcasting and the non-null count of every column.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int16  
 1   Survived     891 non-null    int8   
 2   Pclass       891 non-null    int8   
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float16
 5   SibSp        891 non-null    int8   
 6   Parch        891 non-null    int8   
 7   Fare         891 non-null    float16
 8   Embarked     889 non-null    object 
dtypes: float16(2), int16(1), int8(4), object(2)
memory usage: 22.7+ KB


In [11]:
# Summary statistics of the numeric columns.
# NOTE(review): a non-finite std on a float16 column is presumably an overflow
# artifact of half precision rather than a property of the data — confirm by
# recomputing on a float64 copy.
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.6875,0.523008,0.381594,32.1875
std,257.353842,0.486592,0.836071,14.523438,1.102743,0.806057,inf
min,1.0,0.0,1.0,0.419922,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.910156
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.453125
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.5


# make dataset

In [12]:
# Split the training frame into identifier, label and feature parts.
id_train = df_train.loc[:, ["PassengerId"]]  # kept as a one-column DataFrame
y_train = df_train.loc[:, "Survived"]
x_train = df_train.drop(["Survived", "PassengerId"], axis=1)

In [13]:
# Mean of the label column, i.e. the share of rows with Survived == 1 for a 0/1 label.
print(y_train.mean())

0.3838383838383838


In [30]:
# One-hot encode the categorical columns. The input is rebuilt from df_train
# instead of re-using x_train, so re-running this cell gives the same result
# rather than failing because "Embarked"/"Sex" were already consumed.
x_train = pd.get_dummies(
    df_train.drop(columns=["Survived", "PassengerId"]),
    columns=["Embarked", "Sex"],
)

In [20]:
# categorical_features = ["Embarked", "Pclass", "Sex"]

# train

In [43]:
def train_lgb(input_x, input_y, input_id, params, list_n_fold, n_splits):
    """Train one LightGBM model per requested CV fold and pickle each model.

    Parameters
    ----------
    input_x : pandas.DataFrame
        Feature matrix.
    input_y : array-like
        Labels, one per row of ``input_x``; also used to stratify the folds.
    input_id : pandas.DataFrame
        Row identifiers. Accepted for interface compatibility; not used for training.
    params : dict
        Parameters forwarded to ``lgb.train``.
    list_n_fold : iterable of int
        Fold numbers to train, each in ``range(n_splits)``.
    n_splits : int
        Number of stratified folds (shuffled with a fixed seed of 123).

    Side effects
    ------------
    Writes ``model/model_lgb_fold{n_fold}.pickle`` for every trained fold,
    creating the ``model`` directory if needed.
    """
    # Split the data that was passed in, not whatever x_train / y_train happen
    # to be in the notebook namespace.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
    cv = list(splitter.split(input_x, input_y))
    labels = np.asarray(input_y)

    os.makedirs("model", exist_ok=True)

    for n_fold in list_n_fold:
        print("-" * 20, n_fold, "-" * 20)
        idx_tr, idx_va = cv[n_fold]
        # The splitter yields positional indices, so select by position
        # (.iloc / ndarray indexing); label-based .loc is only correct for a
        # default RangeIndex.
        lgb_tr = lgb.Dataset(input_x.iloc[idx_tr], labels[idx_tr])
        lgb_va = lgb.Dataset(input_x.iloc[idx_va], labels[idx_va], reference=lgb_tr)
        # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed
        # from lgb.train in LightGBM 4.0; on newer versions pass
        # callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)] instead.
        model = lgb.train(params, lgb_tr, valid_sets=[lgb_tr, lgb_va], verbose_eval=10, num_boost_round=1000, early_stopping_rounds=10)
        with open(f"model/model_lgb_fold{n_fold}.pickle", "wb") as f:
            pickle.dump(model, f, protocol=4)

In [44]:
# Training parameters handed to train_lgb, followed by the 5-fold training run.
# NOTE(review): "n_estimators" is presumably treated by LightGBM as an alias of
# the number of boosting rounds and may take precedence over the
# num_boost_round=1000 that train_lgb passes — confirm which limit applies.
# NOTE(review): "importance_type" looks like a scikit-learn-wrapper argument
# rather than a core training parameter — confirm that lgb.train uses it.
params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "n_estimators": 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# Trains folds 0-4 of a 5-fold split; each fold's model is pickled by train_lgb.
train_lgb(
    x_train, y_train, id_train, params, list_n_fold=[0, 1, 2, 3, 4], n_splits=5
)

-------------------- 0 --------------------
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 209
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's auc: 0.918229	valid_1's auc: 0.819829
Early stopping, best iteration is:
[2]	training's auc: 0.910478	valid_1's auc: 0.824242
-------------------- 1 --------------------
[LightGBM] [Info] Number of positive: 274, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points 



[10]	training's auc: 0.910164	valid_1's auc: 0.873596
Early stopping, best iteration is:
[3]	training's auc: 0.904685	valid_1's auc: 0.880949
-------------------- 2 --------------------
[LightGBM] [Info] Number of positive: 274, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
Training until validation scores don't improve for 10 rounds
[10]	training's auc: 0.913797	valid_1's auc: 0.876671




[20]	training's auc: 0.922622	valid_1's auc: 0.886698
[30]	training's auc: 0.929763	valid_1's auc: 0.891444
[40]	training's auc: 0.935591	valid_1's auc: 0.893783
[50]	training's auc: 0.942982	valid_1's auc: 0.893783
[60]	training's auc: 0.949886	valid_1's auc: 0.895722
Early stopping, best iteration is:
[57]	training's auc: 0.948531	valid_1's auc: 0.896658
-------------------- 3 --------------------
[LightGBM] [Info] Number of positive: 274, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 209
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
Training until validation scores don't improve for 10 rounds




[10]	training's auc: 0.919617	valid_1's auc: 0.864906
[20]	training's auc: 0.926563	valid_1's auc: 0.864639
Early stopping, best iteration is:
[10]	training's auc: 0.919617	valid_1's auc: 0.864906
-------------------- 4 --------------------
[LightGBM] [Info] Number of positive: 273, number of negative: 440
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382889 -> initscore=-0.477303
[LightGBM] [Info] Start training from score -0.477303
Training until validation scores don't improve for 10 rounds




[10]	training's auc: 0.918985	valid_1's auc: 0.844901
[20]	training's auc: 0.927377	valid_1's auc: 0.849289
[30]	training's auc: 0.934341	valid_1's auc: 0.861189
[40]	training's auc: 0.942012	valid_1's auc: 0.861322
Early stopping, best iteration is:
[37]	training's auc: 0.939889	valid_1's auc: 0.864513


# prediction

In [45]:
# Load the test CSV with the same preparation as the training data:
# drop the columns in drop_columns, downcast, then preview the first rows.
df_test = (
    pd.read_csv("data/test.csv")
    .drop(columns=drop_columns)
    .pipe(reduce_mem_usage)
)
df_test.head()

Memory usage of initial dataframe is 0.03 MiB.
Memory usage of the dataframe after optimization is 0.01 MiB.
Decreased by 60.6%.


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.828125,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.664062,S
4,896,3,female,22.0,1,1,12.289062,S


In [46]:
# Separate the row identifier from the test features.
id_test = df_test.loc[:, ["PassengerId"]]  # kept as a one-column DataFrame
x_test = df_test.drop(["PassengerId"], axis=1)

In [47]:
# One-hot encode the test features. The input is rebuilt from df_test so the
# cell can be re-run without failing on already-consumed columns.
x_test = pd.get_dummies(
    df_test.drop(columns=["PassengerId"]),
    columns=["Embarked", "Sex"],
)
# Give the models exactly the training feature set, in the training column
# order: categories missing from the test data become all-zero columns and
# categories unseen during training are dropped.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [61]:
def predict_lgb(input_x, input_id, list_n_fold):
    """Average the predictions of the pickled per-fold models.

    Parameters
    ----------
    input_x : pandas.DataFrame
        Feature matrix to score.
    input_id : pandas.DataFrame
        Identifier column(s), one row per row of ``input_x``.
    list_n_fold : sequence of int
        Fold numbers whose ``model/model_lgb_fold{n_fold}.pickle`` files are
        loaded. Any fold numbers are accepted, not only ``0..k-1``.

    Returns
    -------
    pandas.DataFrame
        ``input_id`` with an extra ``pred`` column holding the mean prediction
        across the requested folds.

    Raises
    ------
    ValueError
        If ``list_n_fold`` is empty.
    """
    if len(list_n_fold) == 0:
        raise ValueError("list_n_fold must contain at least one fold")

    pred = np.zeros((len(input_x), len(list_n_fold)))
    # Column position comes from enumerate, not from the fold number, so e.g.
    # list_n_fold=[2, 4] fills columns 0 and 1.
    for col, n_fold in enumerate(list_n_fold):
        print("-" * 20, n_fold, "-" * 20)
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files produced by this notebook's own training step.
        with open(f"model/model_lgb_fold{n_fold}.pickle", "rb") as f:
            model = pickle.load(f)
        pred[:, col] = model.predict(input_x)
    # Re-use input_id's index so the concat pairs rows one-to-one instead of
    # aligning against a fresh 0..n-1 index.
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": pred.mean(axis=1)}, index=input_id.index),
    ], axis=1)
    print("Done.")
    return pred

In [66]:
# Score the test set with all five fold models, then turn the averaged
# probability into a 0/1 label using a 0.4 cut-off.
test_pred = predict_lgb(x_test, id_test, [0, 1, 2, 3, 4])
df_submit = (
    test_pred
    .assign(Survived=lambda frame: frame["pred"].gt(0.4).astype(int))
    .drop(columns=["pred"])
)
print(df_submit.shape)
print(df_submit.head())

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.
(418, 2)
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         0


In [67]:
# Create the output folder on first use so the export works on a fresh checkout.
os.makedirs("result", exist_ok=True)
# index=False: write only the data columns, without a row-index column.
df_submit.to_csv("result/submission_baseline.csv", index=False)