## open file

[kaggle data link](https://www.kaggle.com/competitions/spaceship-titanic)

In [7]:
import pandas as pd
import warnings
import os
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings emitted while the models below are fitted — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive inside the Colab runtime; the data paths built below live under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Build the paths to the train/test CSVs stored on the mounted Drive.
folder_path = '/content/drive/MyDrive/'
data_path = os.path.join(folder_path, 'kaggle', 'spaceship-titanic', 'data')
train_path, test_path = (
    os.path.join(data_path, csv_name)
    for csv_name in ('finish_train.csv', 'finish_test.csv')
)

# Read both tables into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## models

In [4]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process, model_selection
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb

## train model

Hold out 20% of the labelled training data for evaluation (train : test split = 8 : 2).

In [5]:
# Feature matrix: every column except the target (Index.difference returns the
# column names in sorted order).
features_all = train[train.columns.difference(['Transported'])].values
# Target column cast to integers.
target_all = train['Transported'].astype(int).values

# Hold out 20% of the rows for evaluation. A fixed seed keeps the split (and
# therefore every score computed on it) reproducible across kernel restarts,
# and stratifying on the target keeps the class balance the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    target_all,
    test_size=0.2,
    random_state=42,
    stratify=target_all,
)

In [None]:
# Baseline comparison: fit each classifier with its default settings on the
# training split and report its metrics on the held-out split.
logit_model = linear_model.LogisticRegressionCV()
sgd_model = linear_model.SGDClassifier()
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
lgbm_model = lgb.LGBMClassifier()

baseline_models = {
    'logit': logit_model,
    'sgd': sgd_model,
    'rf': rf_model,
    'xgb': xgb_model,
    'lgbm': lgbm_model,
}

for model_name, model in baseline_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Label each report so the five outputs can be told apart.
    print(f'=== {model_name} ===')
    print(classification_report(y_test, y_pred))

In [None]:
# Fresh, unfitted base estimators for the two voting ensembles.
logit_model = linear_model.LogisticRegressionCV()
sgd_model = linear_model.SGDClassifier()
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
lgbm_model = lgb.LGBMClassifier()

base_estimators = [
    ('logit', logit_model),
    ('sgd', sgd_model),
    ('rf', rf_model),
    ('xgb', xgb_model),
    ('lgbm', lgbm_model),
]

# Hard voting uses all five models; soft voting leaves out SGD because it
# cannot compute probabilities.
voting_setups = [
    ('hard', base_estimators),
    ('soft', [pair for pair in base_estimators if pair[0] != 'sgd']),
]

# Fit each ensemble on the training split and report on the held-out split.
for voting_mode, estimators in voting_setups:
    models_vote = VotingClassifier(estimators=estimators, voting=voting_mode)
    models_vote.fit(X_train, y_train)
    y_pred = models_vote.predict(X_test)
    print(classification_report(y_test, y_pred))

## test model

In [None]:
# Build the matrices used for the final submissions: all labelled rows for
# fitting, and the competition test set for prediction.
# NOTE: this rebinds X_train / y_train / X_test from the hold-out split above,
# while y_test still belongs to that split — re-run the split cell before
# re-running any of the evaluation cells.
feature_cols = train.columns.difference(['Transported'])
X_train = train[feature_cols].values
y_train = train['Transported'].astype(int).values
# Select the test features by the training column list so both matrices have
# the same columns in the same order. A feature missing from `test` raises a
# KeyError here, and a column that exists only in `test` is left out instead of
# silently changing the feature count.
X_test = test[feature_cols].values

In [None]:
import os

# Output directory for the submission CSVs. `exist_ok=True` lets this cell be
# re-run without raising FileExistsError once the directory already exists.
os.makedirs('model', exist_ok=True)

In [None]:
# NOTE: `id` shadows the Python builtin, but the voting cell that follows reads
# this name, so it is kept unchanged.
id = test['PassengerId']


def save_submission(model, csv_path):
    """Fit `model` on the full training matrices and write its test predictions to a CSV.

    Reads the notebook globals `X_train`, `y_train`, `X_test` and `id`.

    Parameters
    ----------
    model : estimator with `fit` / `predict`
        Unfitted classifier; it is fitted in place.
    csv_path : str
        Destination of the submission file.

    Returns
    -------
    pandas.DataFrame
        The two-column frame (`PassengerId`, `Transported`) that was written.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Cast only the prediction column to bool. A frame-wide
    # `replace({0: False, 1: True})` would also rewrite any PassengerId value
    # equal to 0 or 1. Building the frame from raw values pairs rows by
    # position rather than by index label.
    answer = pd.DataFrame({
        'PassengerId': id.values,
        'Transported': y_pred.astype(bool),
    })
    answer.to_csv(csv_path, index=False)
    return answer


# Module-level names are kept because the voting cell reuses these estimators.
logit_model = linear_model.LogisticRegressionCV()
sgd_model = linear_model.SGDClassifier()
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
lgbm_model = lgb.LGBMClassifier()

submission_jobs = [
    (logit_model, 'model/logit_model.csv'),
    (sgd_model, 'model/sgd_model.csv'),
    (rf_model, 'model/random_forest.csv'),
    (xgb_model, 'model/xgb_model.csv'),
    (lgbm_model, 'model/lgbm_model.csv'),
]

for model, csv_path in submission_jobs:
    save_submission(model, csv_path)

In [None]:
# Write one submission per voting mode. Hard voting uses all five models; soft
# voting leaves out SGD because it cannot compute probabilities.
vote_setups = [
    (
        'hard',
        [('logit', logit_model), ('sgd', sgd_model), ('rf', rf_model), ('xgb', xgb_model), ('lgbm', lgbm_model)],
        'model/models_hard_vote.csv',
    ),
    (
        'soft',
        [('logit', logit_model), ('rf', rf_model), ('xgb', xgb_model), ('lgbm', lgbm_model)],
        'model/models_soft_vote.csv',
    ),
]

for voting_mode, estimators, csv_path in vote_setups:
    models_vote = VotingClassifier(estimators=estimators, voting=voting_mode)
    models_vote.fit(X_train, y_train)
    y_pred = models_vote.predict(X_test)
    # Cast only the prediction column to bool. A frame-wide
    # `replace({0: False, 1: True})` would also rewrite any PassengerId value
    # equal to 0 or 1. Building the frame from raw values pairs rows by
    # position rather than by index label.
    answer = pd.DataFrame({
        'PassengerId': id.values,
        'Transported': y_pred.astype(bool),
    })
    answer.to_csv(csv_path, index=False)