In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the training split and lower-case every column name.
data = pd.read_csv("../input/spaceship-titanic/train.csv")
data = data.rename(columns = str.lower)

In [None]:
data

In [None]:
data.info()

## Fun insights, I guess

In [None]:
# Split passengerid on "_": the first piece becomes group_id, the second person_id.
for part_index, part_name in enumerate(["group_id", "person_id"]):
    data[part_name] = data["passengerid"].map(lambda pid, i = part_index: pid.split("_")[i])

In [None]:
# Give missing cabins a three-part placeholder so the "/" split in the next step still works.
data["cabin"] = data["cabin"].fillna("nan/nan/nan")

In [None]:
# Break each "a/b/c" cabin string into deck, cabin_num and cabin_side, then discard the raw column.
cabin_parts = data["cabin"].map(lambda cabin: cabin.split("/"))
for idx, part in enumerate(["deck", "cabin_num", "cabin_side"]):
    data[part] = cabin_parts.map(lambda pieces, i = idx: pieces[i])
data = data.drop(columns = ["cabin"])

In [None]:
def plot_pie(col_name):
    """Show a plotly pie chart of the value frequencies in ``data[col_name]``."""
    counts = data[col_name].value_counts()
    pie = px.pie(names = counts.index, values = counts.values)
    pie.show()

In [None]:
plot_pie("homeplanet")

In [None]:
plot_pie("cryosleep")

In [None]:
plot_pie("destination")

In [None]:
px.histogram(data["age"])

In [None]:
plot_pie("vip")

## cynical EDA aside

In [None]:
data

In [None]:
# Bar chart of the transported counts within each vip group.
vip_transport_counts = data.groupby("vip")["transported"].value_counts().unstack()
vip_transport_counts.plot.bar()

## data imputation

Column-wise: each column is imputed separately with its own imputer.

In [None]:
imputer_dict = {}
def impute_col(df, col_name, strat = "most_frequent", fill_value = "UNK", missing_values = np.nan):
    """Fit a SimpleImputer on one column of ``df`` and overwrite that column with the result.

    ``fill_value`` is forwarded to the imputer only when ``strat`` is "constant".
    Returns the (mutated) frame together with the fitted imputer.
    """
    imputer_kwargs = {"strategy": strat, "missing_values": missing_values}
    if strat == "constant":
        imputer_kwargs["fill_value"] = fill_value
    imputer = SimpleImputer(**imputer_kwargs)

    # SimpleImputer needs a 2-D input, so the column is reshaped to (n_rows, 1).
    column_2d = df[col_name].values.reshape(-1,1)
    df[col_name] = imputer.fit_transform(column_2d)

    return df, imputer

In [None]:
# Columns imputed with their most frequent value (np.nan is the missing marker).
most_frequent = ["homeplanet", "cryosleep", "destination", "vip"]
# Columns imputed with the column mean.
mean = ["age", "roomservice", "foodcourt", "shoppingmall", "spa", "vrdeck"]
# Columns imputed with a constant (impute_col's default fill_value, "UNK").
constant = ["name"]
# Cabin-derived columns: most-frequent imputation with the string "nan" as the missing marker.
most_freq_art = ["deck", "cabin_num","cabin_side"]

In [None]:
# Impute each column group in turn and keep every fitted imputer, keyed by column name.
imputation_plan = [
    (most_frequent, {}),
    (mean, {"strat": "mean"}),
    (constant, {"strat": "constant"}),
    (most_freq_art, {"strat": "most_frequent", "missing_values": "nan"}),
]
for columns, options in imputation_plan:
    for col in columns:
        data, imputer_dict[col] = impute_col(data, col, **options)

In [None]:
encode_dict = {}
def encode_col(df, cols):
    """Label-encode ``df[cols]`` in place and return the frame with the fitted LabelEncoder."""
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df[cols])
    df[cols] = encoded
    return df, encoder

In [None]:
# Label-encode the categorical columns and the target; keep each fitted encoder by column name.
columns_to_encode = most_frequent + ["transported", "deck", "cabin_side"]
for col in columns_to_encode:
    data, encode_dict[col] = encode_col(data, col)

In [None]:
data

## Converting strings to ints

In [None]:
data.info()

In [None]:
# Columns that hold digits as strings and are cast to int.
to_int = ["group_id", "person_id", "cabin_num"]
data = data.astype({int_col: int for int_col in to_int})

## drop some cols 

In [None]:
# Keep the ids aside, then remove the id and name columns from the frame.
passenger_id = data.pop("passengerid")
data = data.drop(columns = ["name"])

In [None]:
data.corr()

In [None]:
# Remove the columns that are not used as model features.
data = data.drop(columns = ["group_id", "age", "vip"])

In [None]:
# Target is the transported column; features are every other column.
y = data["transported"]
x = data.drop(columns = ["transported"])

## feature_scaling

In [None]:
# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not influence the mean/std used for standardisation (no leakage).
# `x` itself is left unscaled; only the train/test matrices are standardised.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 177013)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## model_building

In [None]:
# Gradient-boosted trees, kept under the name `lr` that the later cells use.
gb_params = dict(n_estimators = 400, max_depth = 5, learning_rate = 0.1, verbose = 1, random_state = 177013)
lr = GradientBoostingClassifier(**gb_params)
lr.fit(x_train, y_train)

In [None]:
def eval_model(model, thres = 0.5):
    """Print a classification report for ``model`` on the held-out ``x_test`` / ``y_test``.

    Args:
        model: fitted estimator. If it exposes ``predict_proba`` the score of the
            second class column is thresholded; otherwise the raw ``predict``
            output is thresholded (e.g. a regressor producing continuous scores).
        thres: scores greater than or equal to this value are labelled 1, others 0.
    """
    if hasattr(model, "predict_proba"):
        # A classifier's `predict` already returns hard labels, which would make
        # `thres` a no-op; threshold the class probability instead.
        # Assumes a binary target whose positive class is column 1 of predict_proba.
        scores = model.predict_proba(x_test)[:, 1]
    else:
        scores = model.predict(x_test)

    y_pred = [1 if score >= thres else 0 for score in scores]
    print(classification_report(y_test, y_pred))

In [None]:
eval_model(lr)

## test

In [None]:
# Load the test split and lower-case every column name.
test = pd.read_csv("../input/spaceship-titanic/test.csv")
test = test.rename(columns = str.lower)

In [None]:
def preprocess_test(data):
    """Apply the training-time preprocessing to a raw test frame.

    Works on a copy, so the frame passed in is left untouched and the call can
    be repeated on the same input. Reuses the objects fitted on the training
    data (``imputer_dict``, ``encode_dict``, ``scaler``) and the column lists
    ``most_frequent``, ``mean``, ``constant``, ``most_freq_art`` and ``to_int``.

    Args:
        data: DataFrame with lower-cased column names, including
            ``passengerid``, ``cabin`` and ``name``.

    Returns:
        (features, passenger_id): the output of ``scaler.transform`` on the
        processed columns, and the ``passengerid`` column of the input.
    """
    # Copy so the caller's frame is not mutated; otherwise a second call fails
    # because "cabin" has already been dropped.
    data = data.copy()

    data["group_id"] = data["passengerid"].apply(lambda x: x.split("_")[0])
    data["person_id"] = data["passengerid"].apply(lambda x: x.split("_")[1])

    # Missing cabins become "nan/nan/nan" so that each split part is the string
    # "nan", which the cabin imputers were fitted to treat as missing.
    data["cabin"] = data["cabin"].replace(to_replace = [np.nan], value = ["nan/nan/nan"])
    for idx, part in enumerate(["deck", "cabin_num", "cabin_side"]):
        data[part] = data["cabin"].apply(lambda x, i = idx: x.split("/")[i])
    data = data.drop(["cabin"], axis = 1)

    # Only transform here (never fit): every imputer was fitted on the training data.
    for col in most_frequent + mean + constant + most_freq_art:
        imputed = imputer_dict[col].transform(data[col].values.reshape(-1,1))
        data[col] = imputed.ravel()

    # LabelEncoder.transform expects a 1-D array; a column vector only works
    # through an implicit ravel that emits a DataConversionWarning.
    for col in most_frequent + ["deck", "cabin_side"]:
        data[col] = encode_dict[col].transform(data[col].values)

    data[to_int] = data[to_int].astype(int)

    passenger_id = data["passengerid"]
    data = data.drop(["passengerid", "name", "group_id", "age", "vip"], axis = 1)

    return scaler.transform(data), passenger_id
    

In [None]:
test_x, ids = preprocess_test(test)

In [None]:
# Predict on the test features and turn each predicted value into a Python bool.
y_pred_test = [bool(label >= 0.5) for label in lr.predict(test_x)]

In [None]:
submit = pd.DataFrame({"PassengerId": ids, "Transported": y_pred_test})

In [None]:
submit.to_csv("./submission_1.csv", index = False)