In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train.csv and test.csv files of the competition into DataFrames.
X_raw= pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
X_raw_test= pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

# Display the first rows of the training frame.
X_raw.head()

In [None]:
def transform_col_cabin(X_raw):
    """Split the "Cabin" column into separate "deck", "num" and "side" columns.

    A cabin value is expected to consist of three parts separated by "/"
    (deck/num/side). Rows whose value does not split into exactly three
    parts -- which includes missing values -- get NaN in all three new
    columns.

    Parameters
    ----------
    X_raw : pandas.DataFrame
        Frame containing a "Cabin" column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``X_raw`` plus "deck", "num" and
        "side", sharing the index of ``X_raw``. "num" holds the middle part
        converted to ``int`` (NaN where the cabin could not be split).

    Raises
    ------
    ValueError
        If a three-part cabin value has a middle part that is not an integer.
    """
    missing = float("NaN")
    list_deck = []
    list_num = []
    list_side = []

    # Iterate over the values themselves instead of looking them up by the
    # labels 0..n-1, so frames with a non-default index work as well.
    for cabin in X_raw["Cabin"]:
        # A missing value stringifies to text without any "/" and is
        # therefore caught by the length check below.
        parts = str(cabin).split("/")
        if len(parts) != 3:
            list_deck.append(missing)
            list_num.append(missing)
            list_side.append(missing)
            continue

        deck, num, side = parts
        list_deck.append(deck)
        list_num.append(int(num))
        list_side.append(side)

    new_cabin_columns = pd.DataFrame(
        {"deck": list_deck, "num": list_num, "side": list_side},
        index=X_raw.index,
    )
    return pd.concat([X_raw, new_cabin_columns], axis=1)
    
# Apply the cabin split to both the training and the test frame.
X_raw_transformed= transform_col_cabin(X_raw)
X_raw_test_transformed = transform_col_cabin(X_raw_test)


In [None]:
def remove_specified_cols(X_raw_transformed, X_raw_test_transformed):
    """Separate the target from the features and drop the unused columns.

    The columns "Cabin" and "Name" are removed from both frames; the
    "Transported" column is additionally removed from the training frame
    and returned on its own.

    Parameters
    ----------
    X_raw_transformed : pandas.DataFrame
        Training frame; must contain "Cabin", "Name" and "Transported".
    X_raw_test_transformed : pandas.DataFrame
        Test frame; must contain "Cabin" and "Name".

    Returns
    -------
    tuple
        ``(y, X_prepared, X_test_prepared)``: the "Transported" column, the
        training features and the test features.
    """
    unwanted = ["Cabin", "Name"]

    target = X_raw_transformed["Transported"]
    train_features = X_raw_transformed.drop(columns=unwanted + ["Transported"])
    test_features = X_raw_test_transformed.drop(columns=unwanted)

    return (target, train_features, test_features)

# Split off the target and drop the unused columns from both frames.
y, X_prepared, X_test_prepared = remove_specified_cols(X_raw_transformed, X_raw_test_transformed)

In [None]:
# Determine which columns are numerical and which are categorical
def define_col_types(X):
    """Sort the columns of ``X`` into numerical and categorical ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are classified by dtype.

    Returns
    -------
    tuple
        ``(cols_num, cols_cat)``: the names of all numeric (integer or
        float) columns and the names of all ``object``-dtype columns, each
        in the column order of ``X``. Columns of any other dtype (e.g. bool)
        appear in neither list.
    """
    cols_cat = [cname for cname in X.columns if X[cname].dtype == "object"]
    # Select every numeric dtype rather than float64 alone, so that integer
    # or float32 columns are not silently left out of the feature set.
    cols_num = X.select_dtypes(include="number").columns.tolist()
    return (cols_num, cols_cat)

# Derive both column lists from the prepared training frame.
numerical_cols, categorical_cols = define_col_types(X_prepared)

# Print the two lists for inspection.
print(numerical_cols)
print(categorical_cols)

In [None]:
# Print pandas' summary (columns, non-null counts, dtypes) of the prepared training frame.
X_prepared.info()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Fit the imputers on the training data only and reuse them for the test
# data, so that both sets are filled with the same (training) statistics.
num_imputer = SimpleImputer()
cat_imputer = SimpleImputer(strategy="most_frequent")
# The test data no longer gets imputers of its own; the old names are kept
# as aliases in case another cell still refers to them.
num_imputer_test = num_imputer
cat_imputer_test = cat_imputer

# The imputers return plain arrays that carry neither column names nor the
# row index, so both are attached while the frames are built.
imputed_X_num = pd.DataFrame(
    num_imputer.fit_transform(X_prepared[numerical_cols]),
    columns=numerical_cols,
    index=X_prepared.index,
)
imputed_X_cat = pd.DataFrame(
    cat_imputer.fit_transform(X_prepared[categorical_cols]),
    columns=categorical_cols,
    index=X_prepared.index,
)
imputed_X_test_num = pd.DataFrame(
    num_imputer.transform(X_test_prepared[numerical_cols]),
    columns=numerical_cols,
    index=X_test_prepared.index,
)
imputed_X_test_cat = pd.DataFrame(
    cat_imputer.transform(X_test_prepared[categorical_cols]),
    columns=categorical_cols,
    index=X_test_prepared.index,
)

# Recombine the numerical and the categorical part of each data set.
X_full_ready = imputed_X_num.join(imputed_X_cat, how='outer')
X_test_ready = imputed_X_test_num.join(imputed_X_test_cat, how="outer")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; random_state=0 makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X_full_ready, y, test_size=0.25, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier



# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Train a model and print its accuracy on the validation data.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Features and target the accuracy is computed on.

    Returns
    -------
    The fitted model.
    """
    # Reuse the shared model definition so the printed score always
    # describes the same configuration that is used for the final model.
    model = build_production_model(X_train, y_train)
    preds = model.predict(X_valid)
    # accuracy_score expects the true labels first, then the predictions.
    print(accuracy_score(y_valid, preds))
    return model

def build_production_model(X_train_prod, y_train_prod):
    """Create an XGBoost classifier and fit it on the given data.

    Parameters
    ----------
    X_train_prod
        Feature matrix to fit on.
    y_train_prod
        Target values belonging to ``X_train_prod``.

    Returns
    -------
    XGBClassifier
        The fitted classifier.
    """
    hyperparams = {
        "booster": 'gbtree',
        "learning_rate": 0.02,
        "n_estimators": 200,
        "n_jobs": 4,
    }
    classifier = XGBClassifier(**hyperparams)
    classifier.fit(X_train_prod, y_train_prod)
    return classifier

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data.
# "PassengerId" is used as the submission key and is presumably unique per
# row (verify against the data); one-hot encoding it would only add
# identifier indicator columns, so it is left out of the encoding. Like every
# other categorical column it is still dropped from the numerical part below.
OH_input_cols = [cname for cname in categorical_cols if cname != "PassengerId"]

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only where the new one is unknown.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns plain arrays, so the row index is re-attached while
# the frames are built.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[OH_input_cols]), index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[OH_input_cols]), index=X_valid.index)
OH_test_cols = pd.DataFrame(OH_encoder.transform(X_test_ready[OH_input_cols]), index=X_test_ready.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(categorical_cols, axis=1)
num_X_valid = X_valid.drop(categorical_cols, axis=1)
num_X_test = X_test_ready.drop(categorical_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_test_cols], axis=1)

# The one-hot columns are labelled with integers while the numerical columns
# have string names; use strings throughout so the feature names are uniform.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

OH_X_test.info()

In [None]:
# score_dataset prints the validation accuracy itself and returns the fitted
# model, so the model is kept in a variable instead of printing its repr.
# The reported metric is accuracy, not a mean absolute error.
print("Accuracy from Approach 3 (One-Hot Encoding):")
validation_model = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)

In [None]:
# Fit the final model and predict on the one-hot encoded test set.
# NOTE(review): the model is fitted on the training split only (OH_X_train),
# not on the validation rows as well -- confirm this is intended.
preds = build_production_model(OH_X_train, y_train).predict(OH_X_test)

In [None]:
# Passenger ids of the test rows; paired with the predictions in the submission.
ids = X_test_prepared['PassengerId']

In [None]:
# Build the submission table in one step instead of appending row by row.
# Both inputs are taken as plain arrays so they are paired by position, and
# the predictions are cast to bool because, depending on the XGBoost version,
# `predict` may return 0/1 integers instead of True/False labels.
df = pd.DataFrame({
    "PassengerId": np.asarray(ids),
    "Transported": np.asarray(preds).astype(bool),
})

In [None]:
# Write the submission table to "Submission.csv" without the index column.
df.to_csv("Submission.csv",index=False)