In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input tree and echo the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.1 Data loading

In [None]:
# Move into the competition folder so both CSVs can be opened by bare file name,
# then read the train file followed by the test file.
os.chdir('/kaggle/input/spaceship-titanic')
space_data_tr, space_data_te = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
space_data_tr

In [None]:
space_data_te

# 1.2 Check missing values

In [None]:
print(space_data_tr.isnull().sum()) 
print(space_data_te.isnull().sum())

# 2.1 Feature split

In [None]:
def feature_split(df):
    """Derive cabin and passenger-group features from the raw identifier columns.

    Adds five columns to ``df`` in place and returns the same object:

    * ``Deck``, ``Num``, ``Side`` -- the first three ``/``-separated parts of ``Cabin``.
    * ``GroupId`` -- the first four characters of ``PassengerId``.
    * ``MemberId`` -- everything from the sixth character of ``PassengerId`` onwards.

    A missing ``Cabin``, or one with fewer than three parts, yields NaN for the
    absent parts. No numeric conversion is performed on any derived column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing string columns ``Cabin`` and ``PassengerId``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the five new columns.
    """
    # Split once instead of three times; reindex guarantees columns 0..2 exist even
    # when no cabin in this frame has all three parts (plain indexing would raise KeyError).
    cabin_parts = df['Cabin'].str.split('/', expand = True).reindex(columns = range(3))
    df['Deck'] = cabin_parts[0]
    df['Num'] = cabin_parts[1]
    df['Side'] = cabin_parts[2]

    passenger_id = df['PassengerId']
    df['GroupId'] = passenger_id.str[:4]
    df['MemberId'] = passenger_id.str[5:]

    return df

space_data_tr = feature_split(space_data_tr)
space_data_te = feature_split(space_data_te)

# 2.2 Feature drop

In [None]:
def feature_drop(df):
    """Remove the raw identifier columns from ``df`` in place and return it.

    The columns ``Cabin``, ``PassengerId`` and ``Name`` are deleted one after
    another; a ``KeyError`` is raised if one of them is not present.
    """
    for column in ('Cabin', 'PassengerId', 'Name'):
        del df[column]
    return df

space_data_tr = feature_drop(space_data_tr)
space_data_te = feature_drop(space_data_te)

In [None]:
space_data_tr

In [None]:
space_data_te

# 2.3 Fill missing values

In [None]:
print(space_data_tr['HomePlanet'].isnull().sum())
print(space_data_te['HomePlanet'].isnull().sum())

In [None]:
def fill_values(df):
    """Fill missing ``HomePlanet`` values using the passenger's deck.

    The first character of ``Deck`` is mapped to a planet (A, B, C -> ``'Europa'``,
    G -> ``'Earth'``). Only rows whose ``HomePlanet`` is missing are filled;
    values that are already present are left untouched. Rows on any other deck,
    or with a missing deck, keep their missing ``HomePlanet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Deck`` and ``HomePlanet`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``HomePlanet`` updated.
    """
    deck_to_planet = {'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'G': 'Earth'}

    # Decks without an entry in the mapping become NaN here, so fillna skips them.
    inferred_planet = df['Deck'].str[:1].map(deck_to_planet)

    # fillna only touches the gaps, so known home planets are never overwritten.
    df['HomePlanet'] = df['HomePlanet'].fillna(inferred_planet)

    return df

space_data_tr = fill_values(space_data_tr)
space_data_te = fill_values(space_data_te)

In [None]:
print(space_data_tr['HomePlanet'].isnull().sum())
print(space_data_te['HomePlanet'].isnull().sum())

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

# 2.4 Fill NaN

In [None]:
def fill_0(df):
    """Replace missing values in the five spending columns with 0.

    Affects ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` and
    ``VRDeck``; every other column is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five spending columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the spending columns filled.
    """
    features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # Assign the result back rather than calling fillna(inplace=True) on df[col]:
    # the in-place call acts on an intermediate Series, which pandas 2.x warns
    # about and which has no effect on df when Copy-on-Write is enabled.
    df[features] = df[features].fillna(0)
    return df

space_data_tr = fill_0(space_data_tr)
space_data_te = fill_0(space_data_te)

In [None]:
# Total spend per passenger: the five amenity columns added left to right,
# computed the same way for the train and the test frame.
for frame in (space_data_tr, space_data_te):
    running_total = frame['RoomService']
    for amenity in ('FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        running_total = running_total + frame[amenity]
    frame['TotalBills'] = running_total

In [None]:
space_data_tr['VIP'].isnull().sum()

In [None]:
# Passengers who spent nothing are assumed not to be VIPs. Only MISSING VIP flags
# are filled: a recorded VIP value is kept even when the passenger's total is 0.
for frame in (space_data_tr, space_data_te):
    unknown_vip_without_spend = (frame['TotalBills'] == 0) & frame['VIP'].isnull()
    frame.loc[unknown_vip_without_spend, 'VIP'] = False

In [None]:
space_data_tr['VIP'].isnull().sum()

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
from sklearn.impute import SimpleImputer

# Fill the remaining gaps in the categorical columns with the most frequent value.
categorical_columns = ['HomePlanet', 'Destination', 'Deck', 'Side']
imputer = SimpleImputer(strategy = 'most_frequent')

# Fit on the training frame only and reuse those statistics for the test frame,
# so both frames are filled with the same values and nothing is learned from test data.
result = imputer.fit_transform(space_data_tr[categorical_columns])
result1 = imputer.transform(space_data_te[categorical_columns])

space_data_tr[categorical_columns] = result
space_data_te[categorical_columns] = result1


In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
# Fill the remaining gaps in the numeric columns with the column mean.
# NOTE(review): 'Num' comes from a string split of 'Cabin'; this step relies on the
# imputer converting it to float -- confirm, or convert explicitly beforehand.
numeric_columns = ['Age', 'Num']
imputer = SimpleImputer(strategy = 'mean')

# Fit on the training frame only; the test frame is filled with the training means
# so both frames are imputed consistently.
result2 = imputer.fit_transform(space_data_tr[numeric_columns])
result3 = imputer.transform(space_data_te[numeric_columns])

space_data_tr[numeric_columns] = result2
space_data_te[numeric_columns] = result3

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
# Mark still-unknown CryoSleep / VIP flags with the placeholder category 'N'.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on
# frame[column] acts on an intermediate object, which pandas 2.x warns about and
# which leaves the frame unchanged when Copy-on-Write is enabled.
for frame in (space_data_tr, space_data_te):
    for column in ('CryoSleep', 'VIP'):
        frame[column] = frame[column].fillna('N')

In [None]:
print(space_data_tr.isnull().sum())
print(space_data_te.isnull().sum())

In [None]:
space_data_tr.head()

# 2.5 One-Hot Encoding

In [None]:
def Label_dummies(df):
    """Return a copy of ``df`` with its categorical columns one-hot encoded.

    ``HomePlanet``, ``CryoSleep``, ``Destination``, ``VIP``, ``Deck``, ``Side``
    and ``MemberId`` are replaced by indicator columns named ``<column>_<value>``,
    appended after the untouched columns in the order listed here. The input
    frame is not modified.

    NOTE(review): the indicator columns depend on the values present in the frame
    passed in, so two frames encoded separately can end up with different columns.
    """
    categorical = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'MemberId']
    return pd.get_dummies(df, columns = categorical)

space_data_tr = Label_dummies(space_data_tr)
space_data_te = Label_dummies(space_data_te)

In [None]:
space_data_tr.head()

# 2.6 Feature, Label split

In [None]:
# Separate the target column from the predictors; space_data_tr itself is left unchanged.
y_space_data = space_data_tr['Transported']
X_space_data = space_data_tr.drop(columns = 'Transported')

# 3. Algorithm Selection

In [None]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split

# Start (or connect to) the H2O cluster and switch off its progress bars.
h2o.init()
h2o.no_progress()

# Fixed seed so the data splits can be repeated across runs.
# NOTE(review): AutoML is limited by wall-clock time below, so the set of models it
# manages to train may still vary between runs even with a seed -- confirm in H2O docs.
SEED = 42

# Hold out 20% of the labelled data as a test set, then split the remainder once more
# into the part AutoML trains on and a validation part (train_test_split's default size).
train, test = train_test_split(space_data_tr, test_size = 0.2, shuffle = True, random_state = SEED)
train1, valid = train_test_split(train, random_state = SEED)

h2o_train = h2o.H2OFrame(train)
h2o_train1 = h2o.H2OFrame(train1)
h2o_valid = h2o.H2OFrame(valid)
h2o_test = h2o.H2OFrame(test)

# Convert the target to a factor so it is handled as a categorical response.
h2o_train1['Transported']= h2o_train1['Transported'].asfactor()
h2o_valid['Transported']= h2o_valid['Transported'].asfactor()
h2o_test['Transported']= h2o_test['Transported'].asfactor()
max_runtime_secs = 120

# Predictor columns: everything except the target.
features = [f for f in space_data_tr.columns if f not in ['Transported']]

aml = H2OAutoML(max_runtime_secs = max_runtime_secs, seed = SEED, exclude_algos = ['XGBoost', 'StackedEnsemble'])
aml.train(x = features, y = 'Transported', training_frame = h2o_train1, leaderboard_frame = h2o_valid)

leaderboard = aml.leaderboard
# Metrics of the leader on the validation frame (the same frame that ranks the leaderboard).
performance = aml.leader.model_performance(h2o_valid)

model_id = aml.leader.model_id  # name of the best model
accuracy = performance.accuracy()  # accuracy
precision = performance.precision()  # precision
recall = performance.recall()  # recall
F1 = performance.F1()  # f1
auc = performance.auc()  # auc
variable_importance = aml.leader.varimp()  # important input variables

print(model_id, accuracy, precision, recall, F1, auc, variable_importance)
print(performance)
print(aml.leader.varimp_plot())

# Accuracy on the held-out test split. Predictions and labels are pulled into pandas
# and compared row by row; comparing two Python lists with == would yield a single
# bool instead of a per-row result. Both sides are cast to str so the comparison does
# not depend on how the label column is parsed on the way back from H2O.
pred_val = aml.predict(h2o_test[features])['predict'].as_data_frame()['predict'].astype(str).values
true_val = h2o_test['Transported'].as_data_frame()['Transported'].astype(str).values
prediction_acc = np.mean(pred_val == true_val)
print('Prediction accuracy: ', prediction_acc)

# Predictions for the competition test set.
# NOTE(review): space_data_te was one-hot encoded separately from the training data,
# so its columns may not match `features` exactly -- verify before relying on `final`.
h2o_te = h2o.H2OFrame(space_data_te)
final = aml.predict(h2o_te)

# 4. Submission

In [None]:
# Switch back to the competition folder and load the submission template from it.
input_dir = '/kaggle/input/spaceship-titanic'
os.chdir(input_dir)
print(os.getcwd())
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Make /kaggle/working the current directory and print it as a check; files written
# by bare name after this point are created there.
os.chdir('/kaggle/working')
print(os.getcwd())

In [None]:
# Copy the predicted labels into the template and write it to the current directory.
predicted_labels = final['predict'].as_data_frame()
submission['Transported'] = predicted_labels.values
submission.to_csv('submission.csv', index = False)