In [None]:
#importing python tools
import numpy as np
import pandas as pd

In [None]:
#Machine Learning Libraries
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from xgboost import XGBRegressor, XGBClassifier #

In [None]:
# Read the training file and the test file.
X_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
X_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

# Separate the target from the features: pop() removes "Transported" from
# X_train and returns it as the label series.
Y_train = X_train.pop("Transported")
X_test.head()

In [None]:
#Splitting the Cabin column into 3 separate columns (deck, num, side)
def sep_cabin(X_train):
    """Split the ``Cabin`` column into ``deck``, ``num`` and ``side`` columns.

    Each cabin value is converted to a string and split on ``"/"``. When the
    split yields exactly three parts they become ``deck`` (str), ``num`` (int)
    and ``side`` (str); for anything else (a missing value, or a string with a
    different number of parts) all three new cells are NaN.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame containing a ``Cabin`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns followed by ``deck``, ``num``
        and ``side``, on the same index as the input.

    Raises
    ------
    KeyError
        If ``X_train`` has no ``Cabin`` column.
    ValueError
        If a three-part cabin has a middle part that ``int()`` cannot parse.
    """
    missing = float("NaN")
    deck = []
    num = []
    side = []
    # Walk the values directly. Looking rows up with X_train["Cabin"][i] is a
    # label-based lookup driven by a positional counter, which raises KeyError
    # (or reads the wrong row) as soon as the index is not exactly 0..n-1.
    for cabin in X_train["Cabin"].tolist():
        # A missing cabin stringifies to a single part ("nan"), so it lands in
        # the NaN branch together with every other malformed value.
        splits = str(cabin).split("/")
        if len(splits) == 3:
            deck.append(splits[0])
            num.append(int(splits[1]))
            side.append(splits[2])
        else:
            deck.append(missing)
            num.append(missing)
            side.append(missing)

    # Reuse the caller's index so the column-wise concat lines up row for row.
    new = pd.DataFrame({"deck": deck, "num": num, "side": side}, index=X_train.index)
    return pd.concat([X_train, new], axis=1)

In [None]:
# Preview the cabin split on the training frame. The result is only displayed,
# not stored; .head() keeps the cell output to the first few rows.
sep_cabin(X_train).head()

In [None]:
#Removing Cabin and Name columns
def remove_specified_cols(X_train_new):
    """Return a new frame without the ``Cabin`` and ``Name`` columns.

    The input frame is left untouched. ``DataFrame.drop`` raises ``KeyError``
    when either column is absent.
    """
    return X_train_new.drop(columns=["Cabin", "Name"])

In [None]:
#Defining Types
def define_col_types(X):
    """Group column names by dtype.

    Returns a ``(numeric, categorical)`` tuple of lists, each in the frame's
    column order: ``numeric`` holds the ``float64`` columns, ``categorical``
    the ``object`` columns. Columns of any other dtype appear in neither list.
    """
    numeric = []
    categorical = []
    for column, dtype in X.dtypes.items():
        if dtype == "object":
            categorical.append(column)
        if dtype == "float64":
            numeric.append(column)
    return (numeric, categorical)

In [None]:
class Restructure(BaseEstimator, TransformerMixin):
    """Stateless pipeline step: split ``Cabin``, then drop ``Cabin``/``Name``."""
    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self
    def transform(self, X):
        # sep_cabin appends deck/num/side; remove_specified_cols then drops
        # the source Cabin column together with Name.
        return remove_specified_cols(sep_cabin(X))

In [None]:
#Imputing missing values in columns
class Imputer(BaseEstimator, TransformerMixin):
    """Fill missing values: mean for float64 columns, mode for object columns.

    Column groups come from ``define_col_types`` and are decided in ``fit``;
    columns of any other dtype are not part of the output.

    The imputation statistics are learned in ``fit`` and reused by
    ``transform``, so a frame transformed after fitting on the training data is
    filled with the training statistics rather than its own.
    ``fit_transform`` on a single frame gives the same values as before.
    """
    def fit(self, X, y=None):
        """Record the column groups of ``X`` and fit one imputer per group."""
        self.numerical_cols_, self.categorical_cols_ = define_col_types(X)
        self.num_imputer_ = SimpleImputer()
        self.cat_imputer_ = SimpleImputer(strategy="most_frequent")
        # SimpleImputer rejects a frame with zero columns, so only fit the
        # imputers that actually have something to learn from.
        if self.numerical_cols_:
            self.num_imputer_.fit(X[self.numerical_cols_])
        if self.categorical_cols_:
            self.cat_imputer_.fit(X[self.categorical_cols_])
        return self
    def transform(self, X):
        """Return the imputed numeric columns followed by the imputed object columns."""
        # Backward compatibility: transform() used to work on an instance that
        # had never been fitted, so fall back to fitting on the given frame.
        if not hasattr(self, "num_imputer_"):
            self.fit(X)
        pieces = []
        if self.numerical_cols_:
            pieces.append(pd.DataFrame(
                self.num_imputer_.transform(X[self.numerical_cols_]),
                columns=self.numerical_cols_,
                index=X.index,  # keep the caller's row labels
            ))
        if self.categorical_cols_:
            pieces.append(pd.DataFrame(
                self.cat_imputer_.transform(X[self.categorical_cols_]),
                columns=self.categorical_cols_,
                index=X.index,
            ))
        if not pieces:
            return pd.DataFrame(index=X.index)
        return pd.concat(pieces, axis=1)

In [None]:
# Call fit_transform on the training frame only. The test frame goes through
# transform, so nothing is re-fitted at the pipeline level on test data.
first_pipeline = Pipeline([("Restructure", Restructure()), ("Imputer", Imputer())])
X_start = first_pipeline.fit_transform(X_train)
X_test_start = first_pipeline.transform(X_test)

In [None]:
#Encoding
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object columns of ``X`` and of the test frame.

    NOTE: ``transform`` reads the module-level ``X_test_start`` frame and
    returns a ``(encoded_X, encoded_test)`` tuple instead of a single frame, so
    this step only works as the last (or only) step of a pipeline and only
    after ``X_test_start`` has been defined.
    """
    def fit(self,X,y=None):
        # The encoder is fitted inside transform(); nothing is stored here.
        return self
    def transform(self,X):
        """Return ``(OH_X, OH_X_test)``: non-object columns plus one-hot columns.

        The encoder is fitted on the object columns of ``X`` and then applied
        to the same columns of ``X_test_start``; categories that only occur in
        the test frame are encoded as all zeros (``handle_unknown='ignore'``).
        """
        numerical_cols, categorical_cols = define_col_types(X)
        # Leave the encoder on its default sparse output and densify explicitly:
        # the `sparse=` keyword was renamed to `sparse_output=` and later
        # removed, so this spelling works across scikit-learn versions.
        OH_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = OH_encoder.fit_transform(X[categorical_cols]).toarray()
        encoded_test = OH_encoder.transform(X_test_start[categorical_cols]).toarray()
        # Give the encoded columns string names ("<column>_<category>").
        # Integer labels next to the string-named numeric columns produce
        # mixed-type column names, which newer scikit-learn estimators reject.
        encoded_names = OH_encoder.get_feature_names_out(categorical_cols)
        OH_cat_cols = pd.DataFrame(encoded, columns=encoded_names, index=X.index)
        OH_cat_cols_test = pd.DataFrame(
            encoded_test, columns=encoded_names, index=X_test_start.index
        )
        # Everything that was not encoded is carried over unchanged.
        num_X = X.drop(categorical_cols, axis=1)
        num_X_test = X_test_start.drop(categorical_cols, axis=1)
        OH_X = pd.concat([num_X, OH_cat_cols], axis=1)
        OH_X_test = pd.concat([num_X_test, OH_cat_cols_test], axis=1)
        return (OH_X, OH_X_test)

In [None]:
# Single-step pipeline around the encoder; its output is unpacked into the
# encoded training frame and the encoded test frame.
second_pipeline = Pipeline(steps=[("FeatureEncoder", FeatureEncoder())])
OH_X, OH_X_test = second_pipeline.fit_transform(X_start)

In [None]:
# Hold out a quarter of the encoded training rows for validation (fixed seed).
X_Train, X_valid, Y_Train, Y_valid = train_test_split(
    OH_X,
    Y_train,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Seed the forest (same seed as the train/validation split) so the reported
# validation accuracy is reproducible from run to run.
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_Train, Y_Train)
preds = model.predict(X_valid)
# accuracy_score expects (y_true, y_pred).
print(accuracy_score(Y_valid, preds))

In [None]:
# Predict on the encoded test frame and write the submission file.
preds = model.predict(OH_X_test)
ids = X_test['PassengerId']
# Build the frame in one step from the two equally long sequences instead of
# appending row by row; `df` is a DataFrame from the start rather than a dict
# that is later rebound to a DataFrame.
df = pd.DataFrame({
    "PassengerId": ids.to_numpy(),
    "Transported": preds.astype(bool),
})
df.to_csv("Submission.csv", index=False)