In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.1 Data loading

In [None]:
# Read the competition files by explicit path rather than os.chdir(): changing the
# working directory is process-wide state that every later relative path silently
# depends on, and the input directory is read-only.
input_dir = '/kaggle/input/spaceship-titanic'
space_data_tr = pd.read_csv(os.path.join(input_dir, 'train.csv'))
space_data_te = pd.read_csv(os.path.join(input_dir, 'test.csv'))

In [None]:
space_data_tr

In [None]:
space_data_te

# 1.2 Check missing values

In [None]:
print(space_data_tr.isnull().sum()) 
print(space_data_te.isnull().sum())

# 2.1 Feature split

In [None]:
def feature_split(df):
    """Derive cabin and passenger-group features from the raw string columns.

    Adds five columns to ``df`` in place and returns the same object:

    * ``Deck``, ``Num``, ``Side`` -- the three '/'-separated parts of ``Cabin``
      (all still strings; missing where ``Cabin`` is missing or has fewer parts).
    * ``GroupId`` -- the first four characters of ``PassengerId``.
    * ``MemberId`` -- everything from the sixth character of ``PassengerId`` on.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing string columns ``Cabin`` and ``PassengerId``.

    Returns
    -------
    pandas.DataFrame
        The input frame, mutated with the new columns.
    """
    # Split once instead of three times, and pad to exactly three columns so that
    # a Cabin column with fewer than three parts yields NaN instead of a KeyError.
    cabin_parts = df['Cabin'].str.split('/', expand = True).reindex(columns = range(3))
    df['Deck'] = cabin_parts[0]
    df['Num'] = cabin_parts[1]
    df['Side'] = cabin_parts[2]

    df['GroupId'] = df['PassengerId'].str[:4]
    df['MemberId'] = df['PassengerId'].str[5:]

    return df

space_data_tr = feature_split(space_data_tr)
space_data_te = feature_split(space_data_te)

# 2.2 Feature drop

In [None]:
def feature_drop(df):
    """Remove the raw identifier columns from ``df`` in place.

    Drops ``Cabin``, ``PassengerId`` and ``Name`` one after another and returns
    the same (mutated) frame. A missing column raises ``KeyError``.
    """
    obsolete_columns = ('Cabin', 'PassengerId', 'Name')
    for column in obsolete_columns:
        df.drop(columns = column, inplace = True)
    return df

space_data_tr = feature_drop(space_data_tr)
space_data_te = feature_drop(space_data_te)

In [None]:
space_data_tr

In [None]:
space_data_te

# 2.3 Fill missing HomePlanet values from Deck

In [None]:
print(space_data_tr['HomePlanet'].isnull().sum())
print(space_data_te['HomePlanet'].isnull().sum())

In [None]:
def fill_values(df):
    """Fill missing ``HomePlanet`` values using the passenger's deck.

    Decks A, B and C map to 'Europa' and deck G maps to 'Earth'. Only rows whose
    ``HomePlanet`` is missing are filled; recorded values are never overwritten.
    Rows on other decks, or with a missing ``Deck``, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``Deck`` and ``HomePlanet``.

    Returns
    -------
    pandas.DataFrame
        The input frame, mutated in place.
    """
    deck_to_planet = {'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'G': 'Earth'}

    # Planet implied by the first character of Deck; NaN where no rule applies.
    inferred = df['Deck'].str[:1].map(deck_to_planet)

    # Restrict the assignment to missing values so this step really is a fill
    # and cannot clobber an observed HomePlanet.
    fillable = df['HomePlanet'].isnull() & inferred.notnull()
    df.loc[fillable, 'HomePlanet'] = inferred[fillable]

    return df

space_data_tr = fill_values(space_data_tr)
space_data_te = fill_values(space_data_te)

In [None]:
print(space_data_tr['HomePlanet'].isnull().sum())
print(space_data_te['HomePlanet'].isnull().sum())

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

# 2.4 Fill the remaining NaN values

In [None]:
def fill_0(df):
    """Replace missing spending amounts with 0.

    Affects the columns ``RoomService``, ``FoodCourt``, ``ShoppingMall``,
    ``Spa`` and ``VRDeck``; all other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five spending columns.

    Returns
    -------
    pandas.DataFrame
        The input frame, mutated in place.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # Assign the filled columns back explicitly. `df[col].fillna(0, inplace=True)`
    # operates on a temporary Series; under pandas Copy-on-Write it leaves `df`
    # unchanged, and pandas 2.x already warns about it.
    df[spend_columns] = df[spend_columns].fillna(0)
    return df

space_data_tr = fill_0(space_data_tr)
space_data_te = fill_0(space_data_te)

In [None]:
# Total spend per passenger across the five amenity columns.
# skipna=False keeps the result NaN if any component is NaN, as plain `+` does.
bill_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
space_data_tr['TotalBills'] = space_data_tr[bill_columns].sum(axis = 1, skipna = False)
space_data_te['TotalBills'] = space_data_te[bill_columns].sum(axis = 1, skipna = False)

In [None]:
space_data_tr['VIP'].isnull().sum()

In [None]:
# Passengers who spent nothing are taken to be non-VIP, but only where VIP is
# actually missing: this is an imputation step and must not overwrite a recorded
# VIP value for a passenger who happens to have a zero bill.
space_data_tr.loc[(space_data_tr['TotalBills'] == 0) & space_data_tr['VIP'].isnull(), 'VIP'] = False
space_data_te.loc[(space_data_te['TotalBills'] == 0) & space_data_te['VIP'].isnull(), 'VIP'] = False

In [None]:
space_data_tr['VIP'].isnull().sum()

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
from sklearn.impute import SimpleImputer

# Impute the categorical columns with the most frequent value.
categorical_features = ['HomePlanet', 'Destination', 'Deck', 'Side']
imputer = SimpleImputer(strategy = 'most_frequent')

# Learn the fill values from the training set only and reuse them for the test
# set; refitting on the test set would impute the two sets with different
# statistics and leak test information into preprocessing.
result = imputer.fit_transform(space_data_tr[categorical_features])
result1 = imputer.transform(space_data_te[categorical_features])

space_data_tr[categorical_features] = result
space_data_te[categorical_features] = result1


In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
# Impute the numeric columns with the column mean.
# NOTE(review): 'Num' comes from a string split of Cabin; this relies on the
# imputer coercing numeric strings to float -- confirm.
numeric_features = ['Age', 'Num']
imputer = SimpleImputer(strategy = 'mean')

# Means are learned on the training set only and then applied to the test set,
# so both sets are filled with the same values.
result2 = imputer.fit_transform(space_data_tr[numeric_features])
result3 = imputer.transform(space_data_te[numeric_features])

space_data_tr[numeric_features] = result2
space_data_te[numeric_features] = result3

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
# Mark the still-missing CryoSleep / VIP flags with an explicit 'N' category.
# The result is assigned back because `frame[col].fillna(..., inplace=True)` acts
# on a temporary Series and leaves the frame unchanged under pandas Copy-on-Write.
flag_columns = ['CryoSleep', 'VIP']

space_data_tr[flag_columns] = space_data_tr[flag_columns].fillna('N')

space_data_te[flag_columns] = space_data_te[flag_columns].fillna('N')

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
space_data_tr.head()

# 2.5 One-Hot Encoding

In [None]:
def Label_dummies(df):
    """One-hot encode the categorical feature columns.

    Encodes ``HomePlanet``, ``CryoSleep``, ``Destination``, ``VIP``, ``Deck``,
    ``Side`` and ``MemberId`` with ``pd.get_dummies`` and returns a new frame:
    the untouched columns first, followed by the indicator columns in the
    order listed above.
    """
    categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP',
                           'Deck', 'Side', 'MemberId']
    return pd.get_dummies(df, columns = categorical_columns)

space_data_tr = Label_dummies(space_data_tr)
space_data_te = Label_dummies(space_data_te)

In [None]:
space_data_tr.head()

# 2.6 Feature, Label split

In [None]:
# Separate the target from the training features.
X_train = space_data_tr.drop(columns = 'Transported')
y_train = space_data_tr['Transported']

# Train and test were one-hot encoded independently, so a category present in
# only one of them produces a different set of indicator columns. Reindex the
# test frame to the training layout: indicators missing from test become 0 and
# indicators unknown to the model are dropped. This also makes X_test its own
# frame instead of an alias of space_data_te.
X_test = space_data_te.reindex(columns = X_train.columns, fill_value = 0)

In [None]:
# # feature importance using random forest
# from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_estimators=80, max_features='auto')
# rf.fit(X_train, y_train)
# print('Training done using Random Forest')

# ranking = np.argsort(-rf.feature_importances_)
# f, ax = plt.subplots(figsize=(11, 9))
# sns.barplot(x=rf.feature_importances_[ranking], y=X_train.columns.values[ranking], orient='h')
# ax.set_xlabel("feature importance")
# plt.tight_layout()
# plt.show()

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_space_data, y_space_data, test_size = 0.2, random_state = 0)

# 3. Algorithm Selection

In [None]:
pip install xgboost

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# GroupId is the leading four characters of PassengerId, still stored as a string.
# Fitting one LabelEncoder on train and another on test gave the two sets
# unrelated integer codes, so splits learned on the train codes were meaningless
# on the test codes. A direct numeric conversion keeps a single consistent scale
# (and is idempotent if the cell is re-run).
# NOTE(review): assumes those four characters are always digits; pd.to_numeric
# raises otherwise.
X_train['GroupId'] = pd.to_numeric(X_train['GroupId'])
X_test['GroupId'] = pd.to_numeric(X_test['GroupId'])


# Cross-validated LightGBM; the grid holds a single parameter combination, so
# GridSearchCV is used here only to obtain a cross-validated score.
lgbm = LGBMClassifier()
lgbm_cv = GridSearchCV(lgbm, param_grid = {'n_estimators' : [2000], 'max_depth' : [10], 'learning_rate' : [0.0001]}, verbose = 1)

lgbm_cv.fit(X_train, y_train)
# lgbm.fit(X_train, y_train)
# pred = lgbm.predict(X_train)
# print(accuracy_score(y_train, pred))


print(lgbm_cv.best_score_)
print(lgbm_cv.best_params_)

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.svm import SVC

# Kfold = StratifiedKFold(n_splits = 10)

# svcc = SVC(probability = True)
# svc_param_grid = {'kernel' : ['rbf'], 'gamma' : [0.001, 0.01, 0.1, 1], 'C' : [1, 10, 50, 100, 200, 300, 1000]}

# svcc_cv = GridSearchCV(svcc, param_grid = svc_param_grid, cv = Kfold, scoring = 'accuracy', n_jobs = 4, verbose = 1)

# svcc_cv.fit(X_train, y_train)

# svcc_best = svcc_cv.best_estimator_

# svcc_cv.best_estimator_

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified splitter (unshuffled, hence deterministic).
kfold = StratifiedKFold(n_splits = 5)

# Seed the forest so that Restart & Run All reproduces the same score and the
# same downstream predictions.
rf_clf = RandomForestClassifier(random_state = 0)

# Single-point grid: GridSearchCV serves as a cross-validated fit of these settings.
rf_param_grid = {"max_depth": [None],
              "max_features": [10],
              "min_samples_split": [10],
              "min_samples_leaf": [10],
              "bootstrap": [False],
              "n_estimators" :[300],
              "criterion": ["gini"]}


rf_clf_cv = GridSearchCV(rf_clf, param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

rf_clf_cv.fit(X_train,y_train)

# Estimator refit on the full training set with the best parameters.
rf_clf_best = rf_clf_cv.best_estimator_

print(rf_clf_cv.best_estimator_)
print(rf_clf_cv.best_score_)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded because max_features < 1.0 makes the feature sub-sampling random.
gbm = GradientBoostingClassifier(random_state = 0)

# The 'loss' entry is omitted on purpose: "deviance" was renamed to "log_loss" and
# the old spelling is rejected by newer scikit-learn releases. It is the default
# loss under either name, so leaving it out works on old and new versions alike.
gbm_param_grid = {'n_estimators' : [100,200,300],
              'learning_rate': [0.01],
              'max_depth': [8],
              'min_samples_leaf': [150],
              'max_features': [0.3]
              }

gbm_cv = GridSearchCV(gbm, param_grid = gbm_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gbm_cv.fit(X_train,y_train)

# Estimator refit on the full training set with the best n_estimators.
gbm_cv_best = gbm_cv.best_estimator_

print(gbm_cv.best_estimator_)
print(gbm_cv.best_score_)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Soft-voting ensemble of a logistic regression and the random forest instance.
# NOTE(review): this passes `rf_clf`, not the tuned `rf_clf_best` -- confirm that is intended.
lr_clf = LogisticRegression()
voting_members = [('LR', lr_clf), ('RF', rf_clf)]
vo_clf = VotingClassifier(estimators = voting_members, voting = 'soft')
vo_clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict

# Meta-features for the stacking model. Each base model predicts every training
# row from a clone fitted on the other folds (out-of-fold predictions). Predicting
# the training rows with models already fitted on those same rows would hand the
# meta-learner optimistic, leaked inputs. The fitted base models themselves are
# left untouched because cross_val_predict works on clones.
base_models = {'GBM' : gbm_cv.best_estimator_,
               'RF' : rf_clf_cv.best_estimator_,
               'LGBM' : lgbm_cv.best_estimator_,
               'VO' : vo_clf}
X_train2 = pd.DataFrame({name : cross_val_predict(model, X_train, y_train, cv = kfold)
                         for name, model in base_models.items()})
X_train2.head()

In [None]:
# Meta-learner: refit the logistic regression on the base-model predictions.
lr_clf.fit(X_train2, y_train)

# Test-set meta-features, built in the same column order as X_train2.
fitted_models = [('GBM', gbm_cv), ('RF', rf_clf_cv), ('LGBM', lgbm_cv), ('VO', vo_clf)]
X_test2 = pd.DataFrame({name : model.predict(X_test) for name, model in fitted_models})

# Final stacked predictions for the test set.
y_pred = lr_clf.predict(X_test2)

In [None]:
# from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression


# knn_clf = KNeighborsClassifier(n_neighbors = 8)
# ada_clf = AdaBoostClassifier(n_estimators = 100)
# dt_clf = DecisionTreeClassifier(random_state = 11)
# rf_clf = RandomForestClassifier(random_state = 11)
# lr_clf = LogisticRegression(random_state =11)

In [None]:
# knn_clf.fit(X_train, y_train)
# knn_pred = knn_clf.predict(X_train)
# rf_clf.fit(X_train, y_train)
# rf_pred = rf_clf.predict(X_train)
# ada_clf.fit(X_train, y_train)
# ada_pred = ada_clf.predict(X_train)
# dt_clf.fit(X_train, y_train)
# dt_pred = dt_clf.predict(X_train)

# vo_clf = VotingClassifier(estimators = [('LR', lr_clf), ('RF', rf_clf)], voting = 'soft')
# vo_clf.fit(X_train, y_train)
# vo_pred = vo_clf.predict(X_train)

# lr_final = LogisticRegression(C=10)

# knn_pred1 = knn_clf.predict(X_test)
# rf_pred1 = rf_clf.predict(X_test)
# ada_pred1 = ada_clf.predict(X_test)
# dt_pred1 = dt_clf.predict(X_test)
# vo_pred1 = vo_clf.predict(X_test)

In [None]:
# from sklearn.metrics import accuracy_score

# pred = np.array([vo_pred, rf_pred, knn_pred, ada_pred])
# pred1 = np.array([vo_pred1, rf_pred1, knn_pred1, ada_pred1])
# pred = np.transpose(pred)
# pred1 = np.transpose(pred1)

# lr_final.fit(pred, y_train)
# y_pred = lr_final.predict(pred1)
# print('최종 메타 모델의 예측 정확도: {0:.4}'.format(accuracy_score(y_test, final)))

In [None]:
# from sklearn.model_selection import KFold

# knn_train, knn_test = get_stacking_base_datasets(knn_clf, X_train, y_train, X_test, 8)
# rf_train, rf_test = get_stacking_base_datasets(rf_clf, X_train, y_train, X_test, 8)
# dt_train, dt_test = get_stacking_base_datasets(dt_clf, X_train, y_train, X_test, 8)
# ada_train, ada_test = get_stacking_base_datasets(ada_clf, X_train, y_train, X_test, 8)

# Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis = 1)
# Stack_final_X_test = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis = 1)
# lr_final.fit(Stack_final_X_train, y_train)
# stack_final = lr_final.predict(Stack_final_X_test)

In [None]:
# print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test, stack_final)))

# 4. Submission

In [None]:
os.chdir('/kaggle/input/spaceship-titanic')
print(os.getcwd())
submission = pd.read_csv('sample_submission.csv')

In [None]:
os.chdir('/kaggle/working')
print(os.getcwd())

In [None]:
# Overwrite the sample submission's 'Transported' column with the stacked predictions.
# NOTE(review): relies on sample_submission.csv rows being in the same order as
# test.csv, since PassengerId was dropped from the test frame -- confirm.
submission['Transported'] = y_pred
# Written relative to the current working directory, without the index column.
submission.to_csv('submission.csv', index = False)