## open file

[kaggle data link](https://www.kaggle.com/competitions/spaceship-titanic)

In [1]:
import pandas as pd
import warnings
import os
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the data paths used later live under this mount.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [3]:
# Locate the CSV files inside the mounted Drive folder.
folder_path = '/content/drive/MyDrive/'
data_path = os.path.join(folder_path, 'kaggle', 'spaceship-titanic', 'data')
train_path, test_path = (
    os.path.join(data_path, file_name)
    for file_name in ('finish_train.csv', 'finish_test.csv')
)

# Read both tables into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## models

In [4]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process, model_selection
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb

## train model

Hold out 20% of the training data for validation (train : validation = 8 : 2).

In [5]:
# Separate the feature matrix (every column except the target) from the
# binary target, which is cast to 0/1 integers.
X_train = train[train.columns.difference(['Transported'])].values
y_train = train['Transported'].astype(int).values

# Hold out 20% of the rows for validation. The seed is fixed so the split, and
# every metric computed from it, is the same after a kernel restart.
# Note: X_train / y_train are rebound here from the full data to the 80% part.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

In [6]:
# Baseline comparison: five classifiers created with their constructor defaults.
logit_model = linear_model.LogisticRegressionCV()
sgd_model = linear_model.SGDClassifier()
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
lgbm_model = lgb.LGBMClassifier()

baseline_models = (logit_model, sgd_model, rf_model, xgb_model, lgbm_model)

# Fit every model before reporting, so training logs are emitted ahead of the reports.
for estimator in baseline_models:
    estimator.fit(X_train, y_train)

# Print one classification report per model on the held-out rows,
# in the order the models are listed above.
for estimator in baseline_models:
    y_pred = estimator.predict(X_test)
    print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 3510, number of negative: 3444
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2168
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504745 -> initscore=0.018982
[LightGBM] [Info] Start training from score 0.018982
              precision    recall  f1-score   support

           0       0.86      0.64      0.73       871
           1       0.71      0.90      0.79       868

    accuracy                           0.77      1739
   macro avg       0.79      0.77      0.76      1739
weighted avg       0.79      0.77      0.76      1739

              precision    recall  f1-score   support

           0       0.91      0.16      0.27       871
           1       0.54      0.98      0.70       868

    accuracy                           0.57      1739
   mac

In [7]:
# Fresh, unfitted estimators to use as members of the voting ensembles.
logit_model = linear_model.LogisticRegressionCV()
sgd_model = linear_model.SGDClassifier()
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
lgbm_model = lgb.LGBMClassifier()

# Hard voting uses all five members; soft voting leaves SGD out because,
# per the original author's note, it cannot compute probabilities.
vote_configs = (
    ('hard', [('logit', logit_model), ('sgd', sgd_model), ('rf', rf_model),
              ('xgb', xgb_model), ('lgbm', lgbm_model)]),
    ('soft', [('logit', logit_model), ('rf', rf_model),
              ('xgb', xgb_model), ('lgbm', lgbm_model)]),
)

# Fit each ensemble on the training part and report on the held-out rows.
for voting_mode, members in vote_configs:
    models_vote = VotingClassifier(estimators=members, voting=voting_mode)
    models_vote.fit(X_train, y_train)
    y_pred = models_vote.predict(X_test)
    print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 3510, number of negative: 3444
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2168
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504745 -> initscore=0.018982
[LightGBM] [Info] Start training from score 0.018982
              precision    recall  f1-score   support

           0       0.83      0.75      0.79       871
           1       0.77      0.85      0.81       868

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739

[LightGBM] [Info] Number of positive: 3510, number of negative: 3444
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2168

## test model

In [None]:
# Rebuild the matrices for the final fit: all training rows for fitting and
# the competition test set for prediction.
# Note: this rebinds X_train / y_train / X_test from the validation split.
feature_columns = train.columns.difference(['Transported'])
X_train = train[feature_columns].values
y_train = train['Transported'].astype(int).values

# Select the test columns using the training feature list, so both matrices
# have the same columns in the same order. A column missing from `test`
# raises a KeyError here instead of producing a silently misaligned matrix,
# and any extra test-only column is ignored.
X_test = test[feature_columns].values

## top5 model

In [None]:
# Candidate estimators for the hyperparameter search. grid_param below is
# index-aligned with this list: the two are paired with zip() when searching.
MLA_top5 = [
    linear_model.LogisticRegressionCV(),
    linear_model.SGDClassifier(),
    ensemble.RandomForestClassifier(),
    xgb.XGBClassifier(),
    lgb.LGBMClassifier()
    ]

# One parameter grid per estimator, in the same order as MLA_top5. Each entry
# is a list holding a dict that maps a parameter name to the values to try.
grid_param = [
        [{
      #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
      'fit_intercept': [True, False], #default: True
      'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs
      'random_state': [0]
        }],
        [{
      #SGDClassifier
      'loss':['hinge', 'log_loss', 'modified_huber'],
      'random_state': [0]
        }],
        [{
      #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
      'n_estimators': [10, 50, 100, 300], #original note said default=10; NOTE(review): recent scikit-learn releases appear to default to 100 - confirm for the installed version
      'criterion': ['gini', 'entropy'], #default="gini"
      'max_depth': [2, 4, 6, 8, 10, None], #default=None
      'oob_score': [True], #original note said default=False. Legacy note dated 12/31/17 (presumably from another run/dataset - verify): best RandomForestClassifier parameters were {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
      'random_state': [0]
        }],
        [{
      #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
      'learning_rate': [.01, .03, .05, .1, .25], #default: .3
      'max_depth': [1,2,4,6,8,10], #original note said default 2; NOTE(review): the XGBoost parameter docs appear to list 6 - confirm
      'n_estimators': [10, 50, 100, 300],
      'seed': [0]
        }],

        # LGBMClassifier: empty grid, so no parameter values are varied for it.
        [{}]
        ]

In [None]:
# Five random 80/20 splits, seeded so every estimator is scored on the same folds.
cv_split = model_selection.ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Run one ROC-AUC grid search per (estimator, grid) pair and collect the
# winning parameter dict for each, in the same order as MLA_top5.
best_param_list = []
for estimator, grid in zip(MLA_top5, grid_param):
    search = model_selection.GridSearchCV(
        estimator=estimator,
        param_grid=grid,
        cv=cv_split,
        scoring='roc_auc',
    )
    search.fit(X_train, y_train)
    print(type(estimator).__name__)
    print(search.best_params_)
    best_param_list.append(search.best_params_)

In [None]:
# Create the folder that receives the per-model submission files.
# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the folder is already there. `os` is imported in the first cell.
os.makedirs('answer', exist_ok=True)

In [None]:
# Apply each estimator's best grid-search parameters in place, then collect
# the estimators in the same order as MLA_top5.
tuned_pairs = list(zip(MLA_top5, best_param_list))
for estimator, best_params in tuned_pairs:
    estimator.set_params(**best_params)
models = [estimator for estimator, _ in tuned_pairs]

In [None]:
# Refit every tuned model on the full training data and write one submission
# CSV per model, named after the estimator class, into the 'answer' folder.
for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Build the frame from plain arrays and cast only the predictions to bool.
    # A frame-wide replace({0: False, 1: True}) would also rewrite any
    # PassengerId equal to 0 or 1, and concat(axis=1) depends on both pieces
    # sharing the same index.
    answer = pd.DataFrame({
        'PassengerId': test['PassengerId'].to_numpy(),
        'Transported': y_pred.astype(bool),
    })
    answer.to_csv('answer/' + model_name + '.csv', index=False)
    print(model_name)

LogisticRegressionCV
SGDClassifier
RandomForestClassifier
XGBClassifier
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2186
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
LGBMClassifier


In [None]:
# Ensemble the tuned models with hard and soft voting and write one
# submission CSV per voting mode.
vote_output_dir = 'model'
# No other cell creates this folder, and to_csv fails when the target
# directory is missing, so create it here (safe to re-run).
os.makedirs(vote_output_dir, exist_ok=True)

tuned_estimators = [('logit', models[0]), ('sdg', models[1]), ('rfm', models[2]),
                    ('xgb', models[3]), ('lgbm', models[4])]
vote_setups = {
    'hard': tuned_estimators,
    # Soft voting leaves SGD out: per the original author's note, it cannot
    # compute probabilities.
    'soft': [(name, est) for name, est in tuned_estimators if name != 'sdg'],
}

for voting_mode, members in vote_setups.items():
    models_vote = VotingClassifier(estimators=members, voting=voting_mode)
    models_vote.fit(X_train, y_train)
    y_pred = models_vote.predict(X_test)
    # Cast only the prediction column to bool; a frame-wide
    # replace({0: False, 1: True}) would also touch PassengerId values.
    answer = pd.DataFrame({
        'PassengerId': test['PassengerId'].to_numpy(),
        'Transported': y_pred.astype(bool),
    })
    # Produces model/models_hard_vote.csv and model/models_soft_vote.csv.
    answer.to_csv(vote_output_dir + '/models_' + voting_mode + '_vote.csv', index=False)

[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2186
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2186
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
