In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import shap
from termcolor import colored

import warnings
warnings.filterwarnings('ignore')

import category_encoders as ce
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import make_column_selector,make_column_transformer
from scipy.stats import mode, chi2_contingency
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold,train_test_split

import lightgbm
from lightgbm import LGBMClassifier

In [None]:
# Read the competition CSVs, then separate the training features from the target.
train = pd.read_csv("../input/spaceship-titanic/train.csv")
test = pd.read_csv("../input/spaceship-titanic/test.csv")
y_train = train["Transported"].astype(np.int8)
X_train = train.drop(columns = "Transported")
X_test = test.copy()

In [None]:
# Preview the first rows of the training frame.
train.head()

# 1. Feature Engineering & EDA

In [None]:
# Summary of the training frame's columns.
train.info()

In [None]:
# Split the training columns by dtype. The passenger identifier and the target
# are not model features, so each is left out of its list.
object_features = [col for col in train.select_dtypes(include = "object").columns if col != "PassengerId"]
numeric_features = [col for col in train.select_dtypes(exclude = "object").columns if col != "Transported"]

## 1.1. Defining Functions Used in This Notebook

In [None]:
# missing values

def count_missing(data):
    """Print, for each column of ``data``, the number of missing values and their share of all rows."""

    n_rows = len(data)
    na_counts = data.isna().sum()
    for col in data.columns:
        missing = na_counts.loc[col]
        print(f"Feature {col} - Missing Values: {missing} ({missing/n_rows*100:.2f}%)")

def visualize_missing(data):
    """Show missingno's bar plot and heatmap for ``data`` side by side in one figure."""
    _, (bar_ax, heatmap_ax) = plt.subplots(ncols = 2, figsize = (12,5))
    msno.bar(data, ax = bar_ax)
    msno.heatmap(data, ax = heatmap_ax)
    plt.tight_layout()
    plt.show()
    
def get_dataset(data_dir = "../input/spaceship-titanic"):
    """Reload the raw competition data from disk.

    Parameters
    ----------
    data_dir : str or path-like, default "../input/spaceship-titanic"
        Directory that contains ``train.csv`` and ``test.csv``.

    Returns
    -------
    X_train : pandas.DataFrame
        Training frame without the ``Transported`` column.
    y_train : pandas.Series
        ``Transported`` cast to ``int8``.
    test : pandas.DataFrame
        Test frame exactly as read from disk.
    """
    train = pd.read_csv(f"{data_dir}/train.csv")
    test = pd.read_csv(f"{data_dir}/test.csv")
    X_train = train.drop("Transported", axis = 1)
    y_train = train["Transported"].astype(np.int8)
    return X_train,y_train,test

def fill_missing(data,impute_type = "most_frequent"):
    """Impute the missing values of ``data`` and return a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature frame. It is left untouched; the imputed result is a copy.
    impute_type : {"most_frequent", "iterative"}, default "most_frequent"
        * ``"most_frequent"`` - every column listed in the notebook-level
          ``object_features`` is filled with its most frequent value and every
          column in ``numeric_features`` with its median.
        * ``"iterative"`` - object columns are filled with their most frequent
          value and float64 columns with ``IterativeImputer``.

    Returns
    -------
    pandas.DataFrame
        For ``"most_frequent"``.
    (pandas.DataFrame, ColumnTransformer)
        For ``"iterative"``; the fitted transformer can be applied to other frames.
        The returned frame lists the object columns first, then the float64 columns.

    Raises
    ------
    ValueError
        If ``impute_type`` is not one of the two supported strategies.
    """
    float_cols = ['Age','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    if impute_type == "most_frequent":
        data = data.copy()

        for obj in object_features:
            # pandas' mode ignores NaN and works on object columns;
            # scipy.stats.mode is numeric-only in recent SciPy releases.
            modes = data[obj].mode(dropna = True)
            if modes.empty:
                # Column has no observed value at all, so there is nothing to fill with.
                continue
            mfv = modes.iloc[0]
            share = (data[obj] == mfv).sum()*100/len(data)
            print(f"Feature {obj} - Most Frequent Variable: {mfv} ({share:.2f}%)")
            data[obj] = data[obj].fillna(mfv)
        print("Replaced All Missing Values with the Most Frequent Variable\n")

        si = SimpleImputer(strategy = "median")
        data[numeric_features] = si.fit_transform(data[numeric_features])
        data[float_cols] = data[float_cols].astype(np.float32)

        return data

    elif impute_type == "iterative":
        imputer_obj = SimpleImputer(strategy = 'most_frequent')
        imputer_num = IterativeImputer(initial_strategy = 'median',max_iter = 100)
        obj_selector = make_column_selector(dtype_include = "object",dtype_exclude = "float64")
        num_selector = make_column_selector(dtype_include = "float64")
        col_trans = make_column_transformer((imputer_obj,obj_selector),(imputer_num,num_selector))
        # The transformer emits the object block first and the float block second,
        # so the output column names are derived in that same order.
        data_cols = list(obj_selector(data)) + list(num_selector(data))
        col_trans.fit(data)
        imputed = col_trans.transform(data)
        data = pd.DataFrame(imputed, columns = data_cols)
        data[float_cols] = data[float_cols].astype(np.float32)
        return data,col_trans

    raise ValueError(f"Unknown impute_type {impute_type!r}; expected 'most_frequent' or 'iterative'")
    
def preprocessing_function(data,keep_id = False):
    """Derive model features from an imputed passenger frame.

    Works on a copy, so the caller's frame is not modified. Steps:

    * cast ``VIP`` and ``CryoSleep`` to ``int8``;
    * replace ``Name`` with ``Surnames`` (the second whitespace-separated token);
    * replace ``Cabin`` with its three "/"-separated parts ``Deck``, ``Num``, ``Side``;
    * add ``Total_Lux_Expense`` as the row sum of the five spending columns;
    * replace ``PassengerId`` with ``Group`` (the part before "_", as category).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with no missing values in the columns used above.
    keep_id : bool, default False
        When true, the original ``PassengerId`` column is returned as well.

    Returns
    -------
    pandas.DataFrame, or (pandas.Series, pandas.DataFrame) when ``keep_id`` is true.
    """
    data = data.copy()
    IDs = data["PassengerId"]

    # convert binary
    data["VIP"] = data["VIP"].astype(np.int8)
    data["CryoSleep"] = data["CryoSleep"].astype(np.int8)

    # extract Surnames
    data["Surnames"] = data["Name"].str.split().str[1]
    data = data.drop("Name", axis = 1)

    # separate Cabin variable to 3 distinct features (split once, reuse the parts)
    cabin_parts = data["Cabin"].str.split("/", expand = True)
    data["Deck"] = cabin_parts[0]
    data["Num"] = cabin_parts[1]
    data["Side"] = cabin_parts[2]
    data = data.drop("Cabin", axis = 1)

    # make more features
    data["Total_Lux_Expense"] = data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis = 1)

    # separate groupID from passengerID
    data["Group"] = IDs.str.split("_").str[0].astype("category")
    data = data.drop("PassengerId", axis = 1)

    if keep_id:
        return IDs,data
    return data

def target_encoder(cols_encode,X_train,y_train,X_test, smooth = 0.22):
    """Target-encode ``cols_encode`` with out-of-fold values for the training frame.

    The training rows are encoded with 5-fold stratified out-of-fold encoders,
    so no row is encoded with an encoder that was fitted on it. The test frame
    is encoded with a single encoder fitted on the full, not yet encoded
    training frame.

    Parameters
    ----------
    cols_encode : list of str
        Columns to encode.
    X_train : pandas.DataFrame
        Training features. A copy is encoded; the caller's frame is not modified.
    y_train : pandas.Series
        Training target, aligned row by row with ``X_train``.
    X_test : pandas.DataFrame
        Test features.
    smooth : float, default 0.22
        ``smoothing`` argument passed to ``ce.TargetEncoder``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded training and test frames.
    """
    print("--- Before encoding ---\n")
    for col in cols_encode:
        print(f"Training set - {col} - # of unique variables: {X_train[col].nunique()}")
        print(f"Test set - {col} - # of unique variables: {X_test[col].nunique()}")
        print()

    X_train = X_train.copy()
    skf = StratifiedKFold(n_splits = 5, random_state = 1, shuffle = True)
    oof_parts = []
    val_positions = []
    for train_idx,val_idx in skf.split(X_train,y_train):
        # split() yields positional indices, hence .iloc rather than .loc.
        fold_encoder = ce.TargetEncoder(cols = cols_encode,smoothing = smooth)
        fold_encoder.fit(X_train.iloc[train_idx][cols_encode], y_train.iloc[train_idx])
        oof_parts.append(fold_encoder.transform(X_train.iloc[val_idx][cols_encode]))
        val_positions.append(val_idx)
    # DataFrame.append was removed in pandas 2.0; concatenate the folds once instead.
    oof = pd.concat(oof_parts)
    # Row k of `oof` belongs to original position concat(val_positions)[k];
    # argsort maps the folds back to the original row order.
    row_order = np.argsort(np.concatenate(val_positions))

    # Fit the test-time encoder before the training columns are overwritten below.
    full_encoder = ce.TargetEncoder(cols = cols_encode,smoothing = smooth)
    full_encoder.fit(X_train,y_train)
    X_test = full_encoder.transform(X_test)

    for col in cols_encode:
        X_train[col] = oof[col].to_numpy()[row_order]

    print("--- After encoding ---\n")
    for col in cols_encode:
        print(f"Training set - {col} - # of unique variables: {X_train[col].nunique()}")
        print(f"Test set - {col} - # of unique variables: {X_test[col].nunique()}")
    return X_train,X_test

# H0: Two categorical features are not dependent.
# H1: Two categorical features are dependent.
def dependency_control(col,col2 = None, with_target = True):
    """Run a chi-squared test of independence and print the p-value and verdict.

    Reads the notebook-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    col : str
        Column of ``X_train`` to test.
    col2 : str, optional
        Second column of ``X_train``; required when ``with_target`` is false.
    with_target : bool, default True
        If true, ``col`` is tested against ``y_train``; otherwise against ``col2``.

    Raises
    ------
    ValueError
        If ``with_target`` is false and ``col2`` is not given.

    Notes
    -----
    A p-value below 0.05 rejects H0 (independence), so the pair is reported
    as dependent; otherwise H0 is kept and the pair is reported as independent.
    """
    if not with_target and col2 is None:
        raise ValueError("col2 is required when with_target is False")

    other_name = "Transported" if with_target else col2
    stat_df = pd.DataFrame()
    stat_df[col] = X_train[col].values
    if with_target:
        stat_df[other_name] = y_train.values.astype(str)
    else:
        stat_df[other_name] = X_train[col2].values

    #convert cross tab format
    cr_tab = pd.crosstab(index = stat_df[col],
                        columns = stat_df[other_name],
                        )

    test_stats,p,_,_ = chi2_contingency(cr_tab)
    print(f"P value: {p}")
    dependent = p < 0.05  # small p-value: evidence against independence
    if with_target:
        if dependent:
            print("Dependent to Target")
        else:
            print("Independent from Target")
    else:
        if dependent:
            print(f"{col} and {col2} are dependent")
        else:
            print(f"{col} and {col2} are independent")
            

def experiment_cols(categorical_features = np.array(["Group","HomePlanet","Destination","Deck","Num","Side"])):
    """Measure how validation log-loss changes as categorical columns are added one by one.

    Reads the notebook-level ``X_train`` and ``y_train``. The baseline run drops
    every category-dtype column; run ``k`` keeps the first ``k`` entries of
    ``categorical_features`` and drops the rest. Each run trains an
    ``LGBMClassifier`` on three stratified train/validation splits (seeds 1, 3, 5)
    and averages the minimum validation ``binary_logloss``. Finally the loss
    reduction attributed to each added column is printed.

    Parameters
    ----------
    categorical_features : array-like of str
        Columns to add, in order. Every entry is evaluated, including the last one.
    """
    seeds = [1,3,5]
    categorical_features = np.asarray(categorical_features)
    mean_losses = []
    # +1 so that the final run contains all listed columns (baseline is run 0).
    for n_included in range(len(categorical_features) + 1):
        exp_cols = categorical_features[:n_included]
        if n_included == 0:
            print(colored("No categorical columns\n",attrs = ["bold"]))
            X_exp = X_train.drop(X_train.select_dtypes(include = "category").columns,axis = 1)
            model_params = {}
        else:
            print(colored(f"\nCategorical Column in: {exp_cols}\n", attrs = ["bold"]))
            exclude_cols = np.setdiff1d(categorical_features,exp_cols)
            X_exp = X_train.drop(exclude_cols, axis = 1)
            model_params = {"categorical_features": exp_cols.tolist()}

        loss = []
        for s in seeds:
            print(f"Seed: {s}")
            # Same model seed in every run so that runs differ only in their columns.
            model = LGBMClassifier(random_state = 42, **model_params)
            X_trn,X_val,y_trn,y_val = train_test_split(X_exp,y_train,stratify = y_train,random_state = s)
            model.fit(X_trn,y_trn, eval_set = [(X_val,y_val)],callbacks = [lightgbm.log_evaluation(period = 0)])
            min_loss = np.min(model.evals_result_["valid_0"]["binary_logloss"])
            print(f'Minimum Logloss: {min_loss}')
            loss.append(min_loss)
        mean_loss = np.mean(loss)
        print(colored(f"\nAverage Logloss: {mean_loss}",attrs=["bold"]))
        mean_losses.append(mean_loss)

    # Entry k is the drop in mean loss caused by adding categorical_features[k].
    loss_reduction = -np.diff(np.array(mean_losses))
    print()
    for col,reduction in zip(categorical_features,loss_reduction):
        print(f"Adding Feature {col} - Reduced Validation Loss: {reduction}")

## 1.2. Missing Data Analysis

* Except for a couple of columns, almost all features have missing values, so they have to be handled before the data is passed to the model.
* The missing values can be treated as MCAR (missing completely at random), because missingness in one feature does not appear to depend on any other feature.

In [None]:
# Missing-value count and percentage for every column of the training frame.
count_missing(train)

In [None]:
# Missing-value count and percentage for every column of the test frame.
count_missing(X_test)

In [None]:
# missingno bar plot and heatmap for the training frame.
visualize_missing(train)

In [None]:
# missingno bar plot and heatmap for the test frame.
visualize_missing(test)

## 1.3. Continuous Feature Analysis

* The train and test data have similar continuous feature distributions.
* The peak locations differ slightly.
* Age is bimodal; the other features are positively skewed.

In [None]:
# Overlay the train and test density estimates of each numeric feature on a 2x3 grid.
fig,axes = plt.subplots(nrows = 2, ncols = 3, figsize = (16,9))
for ax,col in zip(axes.flat,numeric_features):
    sns.kdeplot(train[col], ax = ax, fill = True, label = "Train")
    sns.kdeplot(X_test[col], ax = ax, fill = True, label = "Test")
    ax.set_yticks([])
    ax.legend()
plt.tight_layout()

plt.show()

## 1.4. Iterative Filling

In this section, missing values

* in categorical columns were filled with the most frequent value;
* in numerical columns were filled iteratively. The iteration process starts with the feature that has the fewest missing values, and each missing value is filled based on its similarity to the other instances.

**Other Preprocessing Techniques Used in This Section**

**1. Target Encoding:** Surnames have far fewer unique values than names, yet there are still too many of them (2217 - 1726). Therefore they are handled via target encoding. Target encoding the "Group" feature works well on the training set, but the same is not true for the test set: the regularization parameter becomes dominant, so the cardinality of the encoded test column decreases to 1 even with small regularization coefficients.

**2. Split Cabin Feature**

**3. Split PassengerId Feature**

**4. Add Total Expenses**

**5. Drop ID Column**

**6. Convert Booleans to binary features** 

In [None]:
# Cardinality of every object/category column in the training features.
categorical_cols = X_train.select_dtypes(include = ["object","category"]).columns
for col in categorical_cols:
    print(f"Feature: {col} - # of Unique Elements: {X_train[col].nunique()}")

In [None]:
# iterative filling
X_train,y_train,X_test = get_dataset()
X_train,col_trans = fill_missing(X_train,"iterative")
data_cols = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination','VIP','Name','Age',"RoomService","FoodCourt",'ShoppingMall','Spa','VRDeck']
X_test = col_trans.transform(X_test)
X_test = pd.DataFrame(X_test,columns = data_cols)
X_train = preprocessing_function(X_train)
IDs,X_test = preprocessing_function(X_test,keep_id = True)
X_train,X_test = target_encoder(["Surnames"],X_train,y_train,X_test)
X_train[X_train.select_dtypes(include = "object").columns] = X_train[X_train.select_dtypes(include = "object").columns].astype("category")


### 1.4.1. Categorical Column Dependency Check 

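In this section the categorical columns and the target feature are considered. Since the `corr()` method is not applicable to them, a chi-squared test was used to check dependency.

**Findings:** Apart from the relation between Side and Destination, all other combinations of features were reported as independent of each other. (Note: in a chi-squared test of independence, a p-value below 0.05 rejects the independence hypothesis, i.e. it indicates that the two features are dependent.)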

In [None]:
# Chi-squared test of Side against the target.
dependency_control("Side")

In [None]:
# Chi-squared test of Num against the target.
dependency_control("Num")

In [None]:
# Chi-squared test of Deck against the target.
dependency_control("Deck")

In [None]:
# Chi-squared test of Destination against the target.
dependency_control("Destination")

In [None]:
# Chi-squared test between two feature columns: Deck and Destination.
dependency_control("Deck", "Destination", with_target = False)

In [None]:
# Chi-squared test between two feature columns: Side and Destination.
dependency_control("Side", "Destination", with_target = False)

In [None]:
# Chi-squared test between two feature columns: Deck and Side.
dependency_control("Deck", "Side", with_target = False)

### 1.4.2. Model Performance on Iterative Filling

* For the categorical columns it is unclear whether a given feature helps prediction. Therefore, in this section the LGBM model includes them one by one and shows to what extent each feature helps.
* "Deck" and "HomePlanet" are the features that significantly improve performance. "Destination" and "Side" improve it a bit. On the other hand, the "Num" feature worsens model performance and the "Group" feature has no effect; therefore these two shouldn't be included in the final prediction.

In [None]:
# Add the categorical columns one at a time and report the validation log-loss of each run.
experiment_cols()

## 1.5. Simple Filling

In this section, missing values

* in categorical columns were filled with the most frequent value;
* in numerical columns were filled with the median.

**Other Preprocessing Techniques Used in This Section**

**1. Target Encoding:** Surnames have far fewer unique values than names, yet there are still too many of them (2217 - 1725). Therefore they are handled via target encoding.

**2. Split Cabin Feature**

**3. Add Total Expenses**

**4. Drop ID Column**

**5. Convert Booleans to binary features** 

In [None]:
# simple filling
X_train,y_train,X_test = get_dataset()
X_train = fill_missing(X_train)
X_test = fill_missing(X_test)
data_cols = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination','VIP','Name','Age',"RoomService","FoodCourt",'ShoppingMall','Spa','VRDeck']
X_train = preprocessing_function(X_train)
IDs,X_test = preprocessing_function(X_test,keep_id = True)
X_train,X_test = target_encoder(["Surnames"],X_train,y_train,X_test)
X_train[X_train.select_dtypes(include = "object").columns] = X_train[X_train.select_dtypes(include = "object").columns].astype("category")

### 1.5.2. Model Performance on Simple Filling

* Iterative filling gives slightly better model performance.
* Features have the same effect discussed in the previous section.

In [None]:
# Repeat the column-by-column experiment on the frame prepared in this section.
experiment_cols()

# 2. Final Model

In [None]:
# data preparation
X_train,y_train,X_test = get_dataset()
X_train,col_trans = fill_missing(X_train,"iterative")
data_cols = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination','VIP','Name','Age',"RoomService","FoodCourt",'ShoppingMall','Spa','VRDeck']
X_test = col_trans.transform(X_test)
X_test = pd.DataFrame(X_test,columns = data_cols)
X_test[['Age','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = X_test[['Age','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].astype(np.float32)
X_train = preprocessing_function(X_train)
IDs,X_test = preprocessing_function(X_test,keep_id = True)
X_train,X_test = target_encoder(["Surnames"],X_train,y_train,X_test)
X_train[X_train.select_dtypes(include = "object").columns] = X_train[X_train.select_dtypes(include = "object").columns].astype("category")
X_test[X_test.select_dtypes(include = "object").columns] = X_test[X_test.select_dtypes(include = "object").columns].astype("category")
X_train.drop(["Num","Group"], axis = 1, inplace = True)
X_test.drop(["Num","Group"], axis = 1, inplace = True)

In [None]:
# Fit the final classifier on the full training frame; the category-dtype
# columns are passed as the categorical feature list.
categorical_cols = list(X_train.select_dtypes(include = "category").columns)
model = LGBMClassifier(categorical_features = categorical_cols, random_state = 42)
model.fit(X_train,y_train)

In [None]:
# Predict on the test frame and turn the 0/1 labels into booleans.
preds = model.predict(X_test) == 1

In [None]:
# Pair each test PassengerId with its boolean prediction and write the submission file.
submission = pd.DataFrame({"PassengerId": IDs, "Transported": preds})
submission.to_csv("Submission.csv",index = False)