In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv and test.csv into DataFrames (train -> `data`, test -> `test`).
data, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
    )
)

In [None]:
def transform_col_cabin(X):
    """Split the ``Cabin`` column into ``deck``, ``num`` and ``side`` columns.

    Each cabin value is converted to a string and split on "/". A value that
    yields exactly three parts is read as deck / num / side (``num`` is
    converted with ``int``). Any other value, including a missing cabin,
    produces NaN in all three new columns.

    Rows are read by position, so ``X`` may carry any index (for example a
    filtered or shuffled frame); the new columns are aligned to that index.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``X`` followed by ``deck``,
        ``num`` and ``side``. ``X`` itself is not modified.

    Raises
    ------
    KeyError
        If ``X`` has no ``Cabin`` column.
    ValueError
        If a three-part cabin value has a middle part that ``int`` rejects.
    """
    missing = float("NaN")
    list_deck = []
    list_num = []
    list_side = []

    # Iterate over the values themselves. Indexing with X["Cabin"][i] is a
    # label lookup, which fails (or silently picks the wrong row) whenever the
    # index is not the default 0..n-1 range.
    for cabin in X["Cabin"].tolist():
        # A missing cabin stringifies to a single part, so it takes the
        # "not exactly three parts" path below without a separate NaN check.
        parts = str(cabin).split("/")
        if len(parts) == 3:
            deck, num, side = parts
            list_deck.append(deck)
            list_num.append(int(num))
            list_side.append(side)
        else:
            list_deck.append(missing)
            list_num.append(missing)
            list_side.append(missing)

    new_cabin_columns = pd.DataFrame(
        {"deck": list_deck, "num": list_num, "side": list_side},
        index=X.index,
    )
    return pd.concat([X, new_cabin_columns], axis=1)
    
# Add the deck / num / side columns to both the training and the test frame.
data, test = [transform_col_cabin(frame) for frame in (data, test)]

In [None]:
# Columns removed from both frames before modelling.
cols_drop = ["Cabin", "Name"]

# Target column, taken from the training frame.
y = data["Transported"]

# Same frames without the dropped columns.
X = data.drop(columns=cols_drop)
test = test.drop(columns=cols_drop)

In [None]:
# Separate the columns into numerical and categorical groups by dtype
def define_col_types(X):
    """Group the column names of ``X`` by dtype.

    Returns a tuple ``(cols_num, cols_cat)``: the names of the ``float64``
    columns and the names of the ``object`` columns, each in column order.
    Columns of any other dtype appear in neither list.
    """
    numeric_names = []
    categorical_names = []
    for column_name in X.columns:
        column_dtype = X[column_name].dtype
        if column_dtype == "object":
            categorical_names.append(column_name)
        elif column_dtype == "float64":
            numeric_names.append(column_name)
    return (numeric_names, categorical_names)

# Column groups for the training features and, separately, for the test frame.
numerical_cols, categorical_cols = define_col_types(X)
numerical_cols_test, categorical_cols_test = define_col_types(test)

# Show the training column groups, one list per line.
print(numerical_cols, categorical_cols, sep="\n")

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns are filled with the column mean (SimpleImputer's default
# strategy); categorical columns with the most frequent value.
num_imputer = SimpleImputer()
cat_imputer = SimpleImputer(strategy="most_frequent")

# Fit the imputers on the training data only. The test set is then filled
# with those same training statistics (`transform`, not `fit_transform`), so
# both sets go through an identical preprocessing step.
# The imputers return bare arrays, so column names and the row index are
# restored when the frames are rebuilt.
imputed_X_num = pd.DataFrame(
    num_imputer.fit_transform(data[numerical_cols]),
    columns=numerical_cols,
    index=data.index,
)
imputed_X_cat = pd.DataFrame(
    cat_imputer.fit_transform(data[categorical_cols]),
    columns=categorical_cols,
    index=data.index,
)
imputed_test_num = pd.DataFrame(
    num_imputer.transform(test[numerical_cols]),
    columns=numerical_cols,
    index=test.index,
)
imputed_test_cat = pd.DataFrame(
    cat_imputer.transform(test[categorical_cols]),
    columns=categorical_cols,
    index=test.index,
)

# Recombine the numerical and categorical parts; columns of any other dtype
# (neither float64 nor object) are not carried over.
X = imputed_X_num.join(imputed_X_cat, how='outer')
test = imputed_test_num.join(imputed_test_cat, how='outer')

X.info()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 12.5% of the rows for validation; random_state fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.125,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,accuracy_score

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and print validation accuracy.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_valid, y_valid
        Features and labels used to compute the printed accuracy.

    Returns
    -------
    RandomForestClassifier
        The fitted model (created with ``random_state=0``).
    """
    model = RandomForestClassifier(random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    print(accuracy_score(y_valid, preds))
    return model

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
# PassengerId is a row identifier, not a feature: one-hot encoding it would
# add one column per distinct training id, none of which can match a
# validation or test row. It is left out of the encoding here and is still
# removed from the feature matrices below with the other categorical columns.
oh_feature_cols = [col for col in categorical_cols if col != "PassengerId"]

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and `sparse` was removed in 1.4. Leaving it out and
# densifying the default sparse result with .toarray() works on every version.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(X_train[oh_feature_cols])

# One-hot encoding drops the row index, so it is restored at construction.
OH_cols_train = pd.DataFrame(
    OH_encoder.transform(X_train[oh_feature_cols]).toarray(), index=X_train.index
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[oh_feature_cols]).toarray(), index=X_valid.index
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(test[oh_feature_cols]).toarray(), index=test.index
)

# The encoded frames get integer column labels by default. Estimators in
# scikit-learn >= 1.2 reject frames that mix integer and string column names,
# so the labels are converted to strings before joining the named columns.
for encoded_frame in (OH_cols_train, OH_cols_valid, OH_cols_test):
    encoded_frame.columns = encoded_frame.columns.astype(str)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(categorical_cols, axis=1)
num_X_valid = X_valid.drop(categorical_cols, axis=1)
num_X_test = test.drop(categorical_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [None]:
# Train on the one-hot encoded training split and report validation accuracy.
model = score_dataset(
    X_train=OH_X_train,
    X_valid=OH_X_valid,
    y_train=y_train,
    y_valid=y_valid,
)

In [None]:
# Passenger identifiers of the test rows, used to label the predictions.
ids = test.loc[:, 'PassengerId']

In [None]:
# Predict the target for every row of the encoded test set.
preds = model.predict(X=OH_X_test)

In [None]:
# Empty column containers for the submission; filled in the next cell.
df = dict(PassengerId=[], Transported=[])

In [None]:
# Pair each passenger id with its prediction (zip stops at the shorter input),
# then append the ids and the predictions to their respective columns.
submission_pairs = list(zip(ids, preds))
df["PassengerId"].extend(passenger_id for passenger_id, _ in submission_pairs)
df["Transported"].extend(prediction for _, prediction in submission_pairs)

In [None]:
# Turn the collected columns into a DataFrame.
df = pd.DataFrame(data=df)

In [None]:
# Write the submission file without the row index column.
df.to_csv(path_or_buf="Submission.csv", index=False)