In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import metrics
from statsmodels.stats.diagnostic import het_white
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
        
# I used the following guide
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

# 0. Helper functions <a name="help"></a>

In [None]:
# Helper function, WOE and IV calculation for prediction and binning
def non_zero_stuff(data):
    """Return the number of missing (null/NaN) entries in ``data``.

    Despite the name, this counts the non-zero (True) entries of the
    ``isnull()`` mask, i.e. the missing values.
    """
    missing_mask = data.isnull()
    return np.count_nonzero(missing_mask)
def len_different(x):
    """Return how many distinct values the iterable ``x`` contains."""
    distinct_values = set(x)
    return len(distinct_values)

def WOE_based_IV(df,target,cont_var, limits, include_lowest=False):
    """
    Compute Weight of Evidence (WOE) and Information Value (IV) per bin.

    Parameters
    ----------
    df : DataFrame holding ``target`` and ``cont_var``.
    target : name of the target column; a row is an "event" when target > 0
        and a "non-event" when target < 1.
    cont_var : name of the continuous column to bin.
    limits : increasing bin edges; bin ``i`` is ``(limits[i-1], limits[i]]``.
    include_lowest : when True the first bin also contains ``limits[0]``
        itself. The default (False) keeps the historical behaviour, where
        rows equal to ``limits[0]`` fall into no bin at all.

    Returns
    -------
    DataFrame with one row per bin and the columns ``bin``, ``No_events``,
    ``No_nonevents``, ``event_pct``, ``nonevent_pct``, ``WOE`` and ``IV``.

    Notes
    -----
    A bin without events or without non-events yields an infinite or NaN
    WOE/IV (log of zero / division by zero); no smoothing is applied.
    """
    bins_list=list()
    event_list=list()
    non_event_list=list()
    values=df[cont_var]
    for i in range(1,len(limits)):
        lower, upper = limits[i-1], limits[i]
        if include_lowest and i==1:
            above_lower=values>=lower
        else:
            above_lower=values>lower
        # Select the bin once and reuse it for both counts.
        target_in_bin=df[above_lower&(values<=upper)][target]
        event_list.append(np.nansum(target_in_bin>0))
        non_event_list.append(np.nansum(target_in_bin<1))
        bins_list.append("lower: "+str(lower)+" - upper: "+str(upper))

    woe_df=pd.DataFrame({
        "bin":bins_list,
        "No_events":event_list,
        "No_nonevents":non_event_list
    })
    woe_df["event_pct"]=woe_df["No_events"]/woe_df["No_events"].sum()
    woe_df["nonevent_pct"]=woe_df["No_nonevents"]/woe_df["No_nonevents"].sum()
    woe_df["WOE"]=np.log(woe_df["event_pct"]/woe_df["nonevent_pct"])
    woe_df["IV"]=(woe_df["event_pct"]-woe_df["nonevent_pct"])*woe_df["WOE"]
    return woe_df

def fit_model_using_classifier(alg,
                               dtrain,
                               predictors,
                               target="Transported",
                               performCV=True, 
                               printFeatureImportance=True, 
                               cv_folds=3,
                               repeat=5,
                               scoring='roc_auc',
                               only_top_x_feature=60,
                               bar_color="#F6B48F",
                               cv_random_state=0
                              ):
    """
    Cross-validate, fit and report on a classifier.

    Adapted from
    https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

    Parameters
    ----------
    alg : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    dtrain : DataFrame containing ``predictors`` and ``target``.
    predictors : list of feature column names.
    target : name of the label column.
    performCV : run repeated stratified k-fold cross-validation first.
    printFeatureImportance : draw a bar chart of the feature importances.
    cv_folds : number of folds per repetition.
    repeat : number of cross-validation repetitions.
    scoring : scikit-learn scoring name used for cross-validation.
    only_top_x_feature : number of features shown in the bar chart.
    bar_color : colour of the bars (defaults to the notebook's light colour,
        so the function no longer relies on a global defined in a later cell).
    cv_random_state : seed for the shuffled cross-validation splits.

    Returns
    -------
    (alg, feat_imp) : the estimator fitted on all of ``dtrain`` and a Series
        of feature importances sorted in descending order, or ``None`` when
        the estimator exposes no ``feature_importances_``.
    """
    # Imported here so the helper stays self-contained in its cell.
    from sklearn.model_selection import RepeatedStratifiedKFold

    # Perform cross-validation.
    # An integer ``cv`` makes cross_val_score use the same unshuffled folds on
    # every call, so looping over it just repeated identical scores. A
    # repeated, shuffled splitter produces different partitions per repetition
    # while returning the same number of scores (cv_folds * repeat).
    cv_score=list()
    if performCV:
        splitter = RepeatedStratifiedKFold(n_splits=cv_folds,
                                           n_repeats=repeat,
                                           random_state=cv_random_state)
        cv_score = list(cross_val_score(
                            alg, 
                            dtrain[predictors], 
                            dtrain[target], 
                            cv=splitter, 
                            scoring=scoring))
    
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])
        
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]        
    
    # Print model report (in-sample figures, hence optimistic):
    print("\nModel Report")
    print("Accuracy : " + str(round(metrics.accuracy_score(
        dtrain[target].values, dtrain_predictions),4)))
    print("AUC Score (Train): " + str(round(
        metrics.roc_auc_score(dtrain[target], dtrain_predprob),4)))
    
    if performCV:
        print("\n Cross validation summary ("+scoring+")")
        print("Average: "+str(round(np.mean(cv_score),4)))
        print("Std    : "+str(round(np.std(cv_score),4)))
        print("Min    : "+str(round(np.min(cv_score),4)))
        print("Max    : "+str(round(np.max(cv_score),4)))
                
    # Feature importances are computed whenever the model provides them, so the
    # return value is always defined (previously an UnboundLocalError was
    # raised when printFeatureImportance=False).
    feat_imp = None
    if hasattr(alg, "feature_importances_"):
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)

    if printFeatureImportance:
        if feat_imp is None:
            print("\nModel exposes no feature_importances_; skipping the plot")
        else:
            plt.figure(figsize=(20,6))
            feat_imp.head(only_top_x_feature).plot(kind='bar', title='Feature Importances',fontsize=12, color=bar_color)
            plt.ylabel('Feature Importance Score')
    return alg, feat_imp

# Table of contents
0. [Helper functions](#help)
1. [Load and explore data](#introduction)
<br> 1.1 [Manipulate "Cabin" variable](#p1)
<br> 1.2 [Explore other categorical variables](#p2)
<br> 1.3 [Explore numerical variables](#p3)
<br> 1.4 ["Target leakage"?](#p4)
<br> 1.5 [Create Age groups](#p5)
<br> 1.6 [Explore how family membership impacts the target and may improve fill miss](#p6)
2. [Create training data](#p22)
<br> 2.1 [Finally revisit the 'Age' Variable fill miss](#p221)
3. [Apply the XGBoost Classifier](#p30)
<br> 3.1 [Estimate base model](#p32)
<br> 3.2 [Parameter Tuning](#p33)
 

# 1. Load and explore data <a name="introduction"></a>

In [None]:
# Shared plotting constants used throughout the notebook.
PALETTE="rocket"
col_light="#F6B48F"
col_dark="#AD1759"

# Competition files as mounted by Kaggle.
train=pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test=pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
sample_submission=pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")

# Preview the colour map (rendered as the cell output).
sns.color_palette(PALETTE, as_cmap=True)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered an extra "None".
sample_submission.info()
display(sample_submission.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered an extra "None".
train.info()
display(train.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered an extra "None".
test.info()
display(test.head())

In [None]:
# Raw categorical predictors.
categoricals=[
    "HomePlanet",
    "CryoSleep",
    "Cabin",
    "Destination",
    "VIP",
]
# Raw numerical predictors; "Age" is first, followed by the spending columns.
numericals=[
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

## 1.1 Manipulate "Cabin" variable <a name="p1"></a>

In [None]:
# Cabin is documented as "<deck>/<number>/<side>" (e.g. "F/123/S").
# The same derived columns are added to both frames.
for frame in (train, test):
    cabin_parts=frame["Cabin"].str.split("/")
    frame["Deck"]="Deck|"+frame["Cabin"].str.slice(0,1)
    # Only the FIRST digit of the cabin number is kept here (a coarse bucket);
    # downstream features (DeckAndNumber, dummies) rely on that, so it is unchanged.
    frame["DeckNumber"]=frame["Cabin"].str.slice(2,3)
    # The side is the last "/"-separated part. Reading a fixed character
    # position (4) returned a digit or "/" whenever the cabin number had more
    # than one digit.
    frame["Side"]="S"+cabin_parts.str[-1]
    frame["DeckAndNumber"]=frame["Deck"]+frame["DeckNumber"]
    frame["DeckAndSide"]=frame["Deck"]+"|"+frame["Side"]
    # PassengerId is split on "_" into its two parts.
    frame[['Groupv', 'Idv']] = frame['PassengerId'].str.split('_', expand = True)

In [None]:
# Mean of "Transported" per level of each cabin-derived variable, one panel each.
plt.figure(figsize=(20,7))
for panel, variable in enumerate(["Deck", "DeckNumber","Side"], start=1):
    plt.subplot(1,3,panel)
    rate_by_level=train[["Transported",variable]].groupby(variable).mean().reset_index()
    ax = sns.barplot(x=variable, y="Transported",
                     data=rate_by_level,
                     palette=PALETTE)
    plt.title(variable, fontsize=12);
plt.suptitle("Transport rates by Deck related variables", fontsize=20);

# Stand-alone chart for the combined deck + number key.
plt.figure(figsize=(20,6))
deck_number_rate=train[["Transported","DeckAndNumber"]].groupby("DeckAndNumber").mean().reset_index()
ax = sns.barplot(x="DeckAndNumber", y="Transported",
                 data=deck_number_rate,
                 palette=PALETTE)
plt.suptitle("DeckAndNumber", fontsize=12);
plt.xticks(rotation=90)
quick_var=["DeckAndNumber","DeckAndSide"]


# For each combined key: first the mean of "Transported", then the row count.
for var in quick_var:
    per_level=train[["Transported",var]].groupby(var)

    plt.figure(figsize=(20,6))
    ax = sns.barplot(x=var, y="Transported",
                     data=per_level.mean().reset_index(),
                     palette=PALETTE)
    plt.suptitle(var, fontsize=20);
    plt.xticks(rotation=90)

    plt.figure(figsize=(20,6))
    ax = sns.barplot(x=var, y="Transported",
                     data=per_level.count().reset_index(),
                     palette=PALETTE)
    plt.title(var+" occurence", fontsize=20);
    plt.xticks(rotation=90)
# Note: Side alone doesn't seem to add much information; combined with Number and Deck it may be more useful...

## 1.2 Explore other categorical variables <a name="p2"></a>

In [None]:
# Mean of "Transported" per level of the remaining categorical variables.
plt.figure(figsize=(20,6))
for panel, variable in enumerate(["HomePlanet", "VIP","CryoSleep", "Destination"], start=1):
    plt.subplot(1,4,panel)
    rate_by_level=train[["Transported",variable]].groupby(variable).mean().reset_index()
    ax = sns.barplot(x=variable, y="Transported",
                     data=rate_by_level,
                     palette=PALETTE)
    plt.title(variable, fontsize=12);
# The title used to be copy-pasted from the deck charts ("Deck related variables").
plt.suptitle("Transport rates by categorical variables", fontsize=20);

## 1.3 Explore numerical variables  <a name="p3"></a>

In [None]:
# Generate spending aggregates: each new column is the sum of the listed
# components (FoodRelatedSpending is FoodCourt + ShoppingMall as before).
spending_groups={
    "TotalSpending":["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"],
    "FoodRelatedSpending":["FoodCourt","ShoppingMall"],
    "LeisureSpending":["RoomService","Spa","VRDeck"],
}
for frame in (train, test):
    for new_col, parts in spending_groups.items():
        # Plain "+" is kept on purpose: a missing component makes the sum NaN.
        running_total=frame[parts[0]]
        for part in parts[1:]:
            running_total=running_total+frame[part]
        frame[new_col]=running_total

# Only add names that are not registered yet, so re-running this cell does not
# duplicate entries in `numericals`.
numericals=numericals+[name for name in spending_groups if name not in numericals]

In [None]:
# log1p-transform every numerical column and compare skewness before/after.
numericals_log=[name+"_log" for name in numericals]
for raw_name, log_name in zip(numericals, numericals_log):
    train[log_name]=np.log1p(train[raw_name])
    test[log_name]=np.log1p(test[raw_name])
    raw_skew=round(train[raw_name].skew(),3)
    log_skew=round(train[log_name].skew(),3)
    print("Skew for original variable /"+raw_name+"/ : "+str(raw_skew))
    print("Skew for log variable /"+log_name+"/ : "+str(log_skew))

In [None]:
# One panel per numerical column (raw and log), overlaying the two target classes.
plt.figure(figsize=(20,45))
was_transported=train["Transported"]
class_layers=(
    (was_transported, 'Transported', col_dark),
    (~was_transported, 'Not Transported', col_light),
)
panel_columns=[name+suffix for name in numericals for suffix in ["","_log"]]
for position, column in enumerate(panel_columns, start=1):
    plt.subplot(10,2,position)
    for class_mask, class_label, class_color in class_layers:
        plt.hist(train[class_mask][column],
                 density=True,
                 label=class_label,
                 alpha=0.5,
                 bins=50,
                 color=class_color);
    plt.title(column)
    plt.legend()
plt.suptitle("Histograms for numerical variables", fontsize=20, y=0.9);
print("I replot these charts for only those passengers whom were not hybernated (CryoSleep variable)")
print(" ... One cannot spend money while spleeping")

In [None]:
# Same panels as before, restricted to rows where CryoSleep is not True.
plt.figure(figsize=(20,45))
not_in_cryosleep=train["CryoSleep"]!=True
class_layers=(
    ((train["Transported"])&not_in_cryosleep, 'Transported', col_dark),
    ((~train["Transported"])&not_in_cryosleep, 'Not Transported', col_light),
)
panel_columns=[name+suffix for name in numericals for suffix in ["","_log"]]
for position, column in enumerate(panel_columns, start=1):
    plt.subplot(9,2,position)
    for class_mask, class_label, class_color in class_layers:
        plt.hist(train[class_mask][column],
                 density=True,
                 label=class_label,
                 alpha=0.5,
                 bins=50,
                 color=class_color);
    plt.title(column)
    plt.legend()
plt.suptitle("Histograms for numerical variables", fontsize=20,y=0.9);
print("Transported pasengers were less likely to spend anything on SPA, VRdeck...")
print("FoodRelated and Leisure seems to have some classification power.")

In [None]:
# Scatter of Age vs. log total spending, with a marginal histogram on each side.
fig_data=train[['Age', 'TotalSpending_log',"Transported"]].copy()
fig = plt.figure(figsize=(16, 10), dpi= 80)
# 4x4 grid: the main plot takes the top-left 3x3, the marginals the last column / last row.
grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)

ax_main = fig.add_subplot(grid[:-1, :-1])
ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])
ax_bottom = fig.add_subplot(grid[-1, 0:-1], xticklabels=[], yticklabels=[])

# Scatterplot on main ax, coloured by the category codes of "Transported"
ax_main.scatter('Age', 'TotalSpending_log', 
                c=fig_data["Transported"].astype('category').cat.codes, alpha=.9, data=fig_data, 
                cmap="Reds", edgecolors=col_light, linewidths=.5);

# Age histogram on the bottom axis (y-axis inverted so the bars hang downwards)
ax_bottom.hist(fig_data["Age"], 40, histtype='stepfilled', orientation='vertical', color=col_light);
ax_bottom.invert_yaxis()

# Log-spending histogram on the right axis, drawn horizontally
ax_right.hist(fig_data["TotalSpending_log"], 40, histtype='stepfilled', orientation='horizontal', color=col_light);

# Decorations
ax_main.set(title='Age and Log of total spending compared', xlabel='Age', ylabel='Log of total spending')
ax_main.title.set_fontsize(20)
for item in ([ax_main.xaxis.label, ax_main.yaxis.label] + ax_main.get_xticklabels() + ax_main.get_yticklabels()):
    item.set_fontsize(14)

# Re-apply the current tick positions as the tick labels.
xlabels = ax_main.get_xticks().tolist()
ax_main.set_xticklabels(xlabels)
plt.show()

In [None]:
# Some zero values, so we attempt to create a group-level spending variable.
# "FamilyId" is the first four characters of PassengerId.
train["FamilyId"]=train["PassengerId"].str.slice(0,4)
test["FamilyId"]=test["PassengerId"].str.slice(0,4)
cost_cols=numericals[1:].copy()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
cost_summary_data=pd.concat([train[["FamilyId"]+cost_cols],
                             test[["FamilyId"]+cost_cols]])
# groupby(...).sum() skips NaN and returns 0 for an all-NaN group, which is
# what the previous np.nansum aggregation produced.
cost_summary_data=cost_summary_data.groupby("FamilyId")[cost_cols].sum().reset_index()
cost_cols_family=[f + "Family" for f in cost_cols]
cost_summary_data.columns=["FamilyId"]+cost_cols_family

In [None]:
# Attach the group-level spending totals to every passenger row.
# Note: re-running this cell merges again and produces _x/_y suffixed columns.
test, train = (
    frame.merge(cost_summary_data, how="left", on="FamilyId")
    for frame in (test, train)
)

## 1.4 "Target leakage"? <a name="p4"></a>

<font size="3"> Here I generate a helper data table.
I explore the transportation rate per cabin, and whether there was any transport at all.
Later this can be merged back into the test data to get additional insights. Below one can see that the full cabin identifiers differ between train and test, but once the side is dropped we suddenly have common identifiers. Based on the earlier exploration, side has little explanatory power, but we will check this later.
</font>

In [None]:
# Values present in both train and test, for the full cabin and for deck+number.
shared_cabins=set(train["Cabin"]) & set(test["Cabin"])
shared_deck_numbers=set(train["DeckAndNumber"]) & set(test["DeckAndNumber"])
print("No common cabins in train, test data :" +str(shared_cabins))
print("But common deck and number values! :" +str(shared_deck_numbers))

In [None]:
# Per DeckAndNumber: mean / sum of "Transported" and row count in train ...
cabin_help_data=(
    train[["DeckAndNumber","Transported"]]
    .groupby("DeckAndNumber")
    .agg({"Transported":[np.nanmean,np.nansum, len]})
    .reset_index()
)
cabin_help_data.columns=["DeckAndNumber","TransportationRateTrain","TransportationCountTrain", "TrainSample"]
# ... and the row count in test.
cabin_help_data_test=(
    test[["DeckAndNumber","PassengerId"]]
    .groupby("DeckAndNumber")
    .agg({"PassengerId":[len]})
    .reset_index()
)
cabin_help_data_test.columns=["DeckAndNumber","TestSample"]
# Outer merge keeps keys seen in only one of the two sets; their gaps become 0.
cabin_help_data=cabin_help_data.merge(cabin_help_data_test,how="outer",on="DeckAndNumber")
stat_cols=["TransportationRateTrain","TransportationCountTrain","TrainSample","TestSample"]
cabin_help_data[stat_cols]=cabin_help_data[stat_cols].fillna(0)
cabin_help_data["TotalOccurence"]=cabin_help_data["TrainSample"]+cabin_help_data["TestSample"]
target_leakage_vars=list(cabin_help_data.columns[1:])

In [None]:
# Preview the first rows of the per-DeckAndNumber helper table.
display(cabin_help_data.head())

In [None]:
# The intersection of the four-character PassengerId prefixes of test and
# train is expected to be empty.
train_prefixes=set(train["PassengerId"].str.slice(0,4))
test_prefixes=set(test["PassengerId"].str.slice(0,4))
test_prefixes.intersection(train_prefixes)

In [None]:
# FamilyName is the last whitespace-separated token of "Name".
for frame in (train, test):
    frame["FamilyName"]=frame["Name"].str.split().str[-1]
# There are some folks who have a relative in the test sample, probably...
shared_family_names=set(test["FamilyName"]).intersection(train["FamilyName"])
len(shared_family_names)

In [None]:
# Family size per FamilyName: members in train only, and members in train+test.
ft=train[["FamilyName","PassengerId"]].copy()
ft["in_train"]=1
# The test rows must come from `test`; they were previously copied from
# `train`, which made FamilySize exactly twice FamilySizeTrain.
fe=test[["FamilyName","PassengerId"]].copy()
fe["in_train"]=0

# pd.concat replaces DataFrame.append (removed in pandas 2.0). "in_train" is
# never null, so "count" equals the number of rows in each group.
family_stuff=pd.concat([ft,fe]).groupby("FamilyName").agg({"in_train":["sum","count"]}).reset_index()
family_stuff.columns=["FamilyName", "FamilySizeTrain","FamilySize"]

## 1.5 Create Age groups <a name="p5"></a>

In [None]:
# Scan the first age cut point and keep the first-bin WOE/IV row for each value.
# Rows are collected in a list and concatenated once; DataFrame.append was
# removed in pandas 2.0.
first_bin_rows=[
    WOE_based_IV(df=train,target="Transported",cont_var="Age", limits=[0,i,14,25,49,65]).head(1)
    for i in range(1,14)
]
optimal_iv=pd.concat(first_bin_rows)
display(optimal_iv)
# lets use 4 as cut value
# lets use 4 as cut value

In [None]:
# Scan the second age cut point (first cut fixed at 4) and keep the
# second-bin WOE/IV row for each value. Rows are concatenated once;
# DataFrame.append was removed in pandas 2.0.
second_bin_rows=[
    WOE_based_IV(df=train,target="Transported",cont_var="Age", limits=[0,4,i,25,49,65]).head(2).tail(1)
    for i in range(5,25)
]
optimal_iv=pd.concat(second_bin_rows)
# Use 12...
display(optimal_iv.head(10))

In [None]:
# Not too relevant IV values after we fixed the youngers passengers groups...
optimal_iv=pd.DataFrame()
for i in range(51,70):
    woe_df=WOE_based_IV(df=train,target="Transported",cont_var="Age", limits=[0,4,12,50,i])
    optimal_iv=optimal_iv.append(woe_df.tail(1))
# No relevant age group...
display(optimal_iv)

## 1.6 Explore how family membership impact the target and may improve fill miss  <a name="p6"></a>

In [None]:
# Stack train and test (train rows first) so the family-level features below
# use every passenger. pd.concat replaces DataFrame.append (removed in pandas 2.0).
family_data_revisited=pd.concat([train, test])

In [None]:
# Flag passengers whose FamilyName occurs fewer than two times in train+test.
# Rows with a missing FamilyName get 0.
name_counts=family_data_revisited.groupby("FamilyName")["PassengerId"].count()
travel_alone=list(name_counts[name_counts<2].index)
family_data_revisited["alone"]=np.where(family_data_revisited["FamilyName"].isin(travel_alone),1,0)

In [None]:
# Flag adults (Age > 18) whose "Room" group (first four PassengerId characters)
# has a youngest member aged <= 4, or aged in (4, 12].
family_data_revisited["Room"]=family_data_revisited["PassengerId"].str.slice(0,4)
minmax_age=family_data_revisited.groupby("Room")["Age"].agg(["min","max"]).reset_index()
minmax_age.columns=["Room","minAge","maxAge"]
family_data_revisited=family_data_revisited.merge(minmax_age,how="left",on="Room")

is_adult=family_data_revisited["Age"]>18
youngest_age=family_data_revisited["minAge"]
family_data_revisited["travel_with_small_child"]=np.where(
    is_adult & (youngest_age<=4),1,0
)
family_data_revisited["travel_with_juvenile"]=np.where(
    is_adult & (youngest_age<=12) & (youngest_age>4),1,0
)

# The helper columns are only needed for the two flags above.
family_data_revisited=family_data_revisited.drop(columns=["Room","minAge","maxAge"])

In [None]:
# A few hundred missing values for HomePlanet and Destination; we aim to fill them...
family_data_revisited.info()

In [None]:
# Fill some missing HomePlanet and Destination values from the family name:
# when all known values for a FamilyName agree (exactly one distinct value),
# that value is used for the family's rows where the column is missing.
for column in ["HomePlanet","Destination"]:
    known_rows=family_data_revisited[family_data_revisited[column].notnull()]
    per_family=known_rows.groupby("FamilyName").agg(
        {column:["first",len_different]}).reset_index()
    per_family.columns=["FamilyName","candidate","len_p"]
    unambiguous=per_family[per_family["len_p"]<2].copy()

    family_data_revisited=family_data_revisited.merge(
        unambiguous[["FamilyName","candidate"]],
        how="left",on="FamilyName")

    # Existing values win; gaps take the candidate (or stay missing without one).
    family_data_revisited[column]=family_data_revisited[column].fillna(
        family_data_revisited["candidate"])
    family_data_revisited=family_data_revisited.drop(columns=["candidate"])



In [None]:
# Finally, fill missing HomePlanet values from the "Room" group (first four
# PassengerId characters) when all known values in the group agree.
family_data_revisited["Room"]=family_data_revisited["PassengerId"].str.slice(0,4)

has_planet=family_data_revisited["HomePlanet"].notnull()
homeplanet=(
    family_data_revisited[has_planet]
    .groupby("Room")
    .agg({"HomePlanet":["first",len_different]})
    .reset_index()
)
homeplanet.columns=["Room","candidate","len_p"]
homeplanet=homeplanet[homeplanet["len_p"]<2].copy()

family_data_revisited=family_data_revisited.merge(
    homeplanet[["Room","candidate"]],
    how="left",on="Room")

# Existing values win; gaps take the candidate (or stay missing without one).
family_data_revisited["HomePlanet"]=family_data_revisited["HomePlanet"].fillna(
    family_data_revisited["candidate"])
family_data_revisited=family_data_revisited.drop(columns=["candidate","Room"])



In [None]:
# Split the stacked frame back by position: train rows first, test rows last.
n_train_rows, n_test_rows = len(train), len(test)
train=family_data_revisited.iloc[:n_train_rows].copy()
test=family_data_revisited.iloc[len(family_data_revisited)-n_test_rows:].copy()

# 2. Create training data <a name="p22"></a>

In [None]:
# deck related
#deck_related=["Deck","DeckNumber","Side","DeckAndNumber","DeckAndSide"]
deck_related=["Deck","DeckNumber","Side", "DeckAndNumber"]
family_related=["FamilySizeTrain","FamilySize"]
# Stack train and test (train first). pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
all_combined=pd.concat([train, test])
all_combined=all_combined[["PassengerId","Transported","FamilyName","Age","alone","travel_with_juvenile", "travel_with_small_child"]
                          +numericals_log+categoricals
                          +deck_related+cost_cols_family].copy()
all_combined=all_combined.merge(cabin_help_data,how="left",on="DeckAndNumber")
all_combined=all_combined.merge(family_stuff,how="left",on="FamilyName")

# set spending related missings to 0!
for col in numericals_log[1:]:
    all_combined[col]=all_combined[col].fillna(0)

# Flag columns first, then one block of dummy columns per categorical variable.
dummy_list=[all_combined[["VIP","CryoSleep","alone","travel_with_juvenile", "travel_with_small_child"]]]
for categorical in ['HomePlanet', 'Destination']+deck_related[0:3]:
    print("Finished dummies for "+ categorical)
    dummy_list.append(pd.get_dummies(all_combined[categorical]))

all_dummies=pd.concat(dummy_list,axis=1).reset_index(drop=True)

dummy_columns=all_dummies.columns
# Map True/False to 1/0 and KEEP missing values missing. np.where(col,1,0)
# treated NaN as truthy, so unknown VIP/CryoSleep silently became 1.
for flag in ["VIP","CryoSleep"]:
    all_dummies[flag]=all_dummies[flag].map({True:1.0, False:0.0})


final_numericals=[numericals[0]]+numericals_log[1:]+list(cabin_help_data.columns[1:])+family_related+cost_cols_family
model_data=all_combined[["PassengerId","Transported"]+final_numericals].copy()
model_data=pd.concat([model_data,all_dummies],axis=1)

# Age-band indicators; rows with a missing Age get 0 in all three.
model_data["Baby"]=np.where(model_data["Age"]<=4,1,0)
model_data["Junior"]=np.where((4<model_data["Age"])&(model_data["Age"]<=12),1,0)
model_data["Adult"]=np.where((18<=model_data["Age"]),1,0)

In [None]:

# Scale the numerical columns to [0, 1] (fitted on train and test together),
# then fill the remaining gaps with a 10-nearest-neighbour imputer.
scaler = MinMaxScaler()
scaled_values=scaler.fit_transform(model_data.loc[:,final_numericals])
model_data.loc[:,final_numericals]=scaled_values
model_data=model_data.reset_index(drop=True)
# Keep the scaled but not yet imputed Age so its missing rows stay identifiable.
model_data["Age_orig"]=model_data["Age"].copy()
imputer = KNNImputer(n_neighbors=10)
all_cols=list(final_numericals)+list(dummy_columns)+["Baby","Junior","Adult"]
imputed_values=imputer.fit_transform(model_data.loc[:,all_cols])
model_data.loc[:,all_cols]=imputed_values
print("Data transformed and fill missed")

## 2.1 Finally revisit the 'Age' Variable fill miss <a name="p221"></a>

In [None]:
# I attempt to predict Age, to see whether we can come up with a better
# solution than the KNNImputer. Only rows with a known Age are used.
regression_fit=model_data[model_data["Age_orig"].notnull()].copy()
planet_and_sleep_cols=['Earth', 'Europa', '55 Cancri e', 'TRAPPIST-1e', "CryoSleep"]
deck_cols=['Deck|D','Deck|G']
size_and_spending_cols=["FamilySize","FoodRelatedSpending_log","LeisureSpending_log"]
X = regression_fit[planet_and_sleep_cols+deck_cols+size_and_spending_cols]
X2 = sm.add_constant(X)
# Response: log1p of Age_orig (which holds the min-max scaled Age).
y = np.array(np.log1p(regression_fit["Age_orig"]))

# OLS with HC1 heteroskedasticity-robust standard errors.
est = sm.OLS(y, X2).fit(cov_type="HC1")
print(est.summary())

In [None]:
# Variance inflation factor per regressor, then White's heteroskedasticity test.
print("VIF metrics for variables")
vif_values=[variance_inflation_factor(X.values, col_position)
            for col_position in range(X.shape[1])]
print(pd.Series(vif_values, index=X.columns))
print('White test heterosedasticity')
white_result=het_white(est.resid, est.model.exog)
print(white_result)

In [None]:
# Resplit into train and test by position (train rows come first).
n_train_rows, n_test_rows = len(train), len(test)
model_train=model_data.iloc[:n_train_rows].copy()
# Encode the target as 1/0.
model_train["Transported"]=np.where(model_train["Transported"],1,0)
model_test=model_data.iloc[len(model_data)-n_test_rows:].copy()

In [None]:
# Before finalising, I revisit the idea of combining pairs of variables into
# new features. A candidate is kept when its ANOVA F-score against the target
# beats the better of its two parents by more than 3%.
additional_cols=list()
cont_vars=['Age', 'RoomService_log', 'FoodCourt_log',
       'ShoppingMall_log', 'Spa_log', 'VRDeck_log', 'TotalSpending_log',
       'FoodRelatedSpending_log','FamilySize', 'LeisureSpendingFamily',
       'FoodRelatedSpendingFamily']

# Every score uses the SAME label vector (previously the candidates were scored
# against `train` while the parents used `model_train`).
target_labels=np.array(model_train["Transported"])

def _f_score(values):
    """ANOVA F-statistic of a single feature against the training labels."""
    f_stat, _ = f_classif(np.array(values).reshape(-1,1), target_labels)
    return f_stat[0]

# Parent scores do not change inside the loops, so compute them once.
base_scores={name:_f_score(model_train[name]) for name in cont_vars}

# (name separator, combination): geometric mean of the pair, and root of the
# shifted ratio.
recipes=[
    ("_",   lambda first, second: np.sqrt(first*second)),
    ("_d_", lambda first, second: np.sqrt((first+1)/(1+second))),
]
for separator, combine in recipes:
    for i in range(0,len(cont_vars)):
        # j starts after i: pairing a variable with itself returns the variable
        # unchanged (or a constant), which can never clear the 3% margin.
        for j in range(i+1,len(cont_vars)):
            first_name, second_name = cont_vars[i], cont_vars[j]
            candidate=combine(model_train[first_name], model_train[second_name])
            if _f_score(candidate)>(1.03*max(base_scores[first_name],base_scores[second_name])):
                new_name=first_name+separator+second_name
                model_train[new_name]=candidate
                model_test[new_name]=combine(model_test[first_name], model_test[second_name])
                additional_cols.append(new_name)
                print(new_name)

# Skip names that are already registered so re-running the cell does not
# duplicate predictors.
all_cols=all_cols+[name for name in additional_cols if name not in all_cols]

# 3. Apply the XGBoost Classifier <a name="p30"></a>

## 3.1 Estimate base model <a name="p32"></a>

In [None]:
# Scoring metric name passed as `scoring` to the cross-validation and grid-search cells.
scoref="f1"

In [None]:
# Baseline: gradient boosting with default hyper-parameters (seeded).
gbm0 = GradientBoostingClassifier(random_state=42)
gbm0, feat_imp = fit_model_using_classifier(
    gbm0,
    dtrain=model_train,
    predictors=all_cols,
    repeat=10,
    scoring=scoref,
)

In [None]:
# Alternative that was tried: keep only features with non-zero importance.
#selected_cols=list(feat_imp[feat_imp>0].index)
# We do not restrict here.
# Earlier, too granular deck data introduced some errors, but now we deal with fewer parameters
selected_cols=all_cols

In [None]:
# Predict with the baseline model and write the predictions as booleans.
base_predictions=gbm0.predict(model_test[all_cols])
base_model_submission=pd.DataFrame({
    "PassengerId": model_test["PassengerId"],
    "Transported": base_predictions>0})
base_model_submission.to_csv("base_model_submission.csv",index=False)
# Score: 0.79565
# Initial score is decent, top performers only slightly better,
# However my economist identity is puzzled why leisure spending is that important...

## 3.2 Parameter tuning <a name="p33"></a>

In [None]:
# Seed shared by all tuning cells, and the running dict of chosen
# hyper-parameters (starting with a relatively high learning rate).
random_state=42
optimals_params={"learning_rate":0.2}

In [None]:
%%time
# Here I attempt to use grid search cv
# There are 2 type of params:
# Booter parameters, and tree dependent parameters
# First I try try to get the optimal number of trees, with a relatively high learning rate...
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=random_state)
rfs = GradientBoostingClassifier(random_state = random_state,
                                 learning_rate=optimals_params["learning_rate"])
grid={'n_estimators': range(10,210,10)}
search = GridSearchCV(rfs, grid, scoring=scoref, cv=cv, n_jobs=5, verbose=1);
search=search.fit(model_train[selected_cols],model_train["Transported"])

In [None]:
# Report the best tree count and store it for the next tuning steps.
best_n_estimators=search.best_params_['n_estimators']
print("Optimal number of trees : " + str(best_n_estimators))
print("Score : " + str(round(search.best_score_,4)))
optimals_params["n_estimators"]=best_n_estimators

In [None]:
%%time
# Second I try to tune tree related parameters
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=random_state)
rfs = GradientBoostingClassifier(random_state = random_state, learning_rate=0.2)
grid={
      'n_estimators': [optimals_params["n_estimators"]],
      'subsample': np.arange(0.01,1.01,0.2),
      'max_features': np.arange(0.01,1.01,0.25),
      'max_depth': range(1,10),
      'min_samples_split': [2,4,10,20,40],
      'min_samples_leaf': [1,2,5,10,20]
}
search2 = GridSearchCV(rfs, grid, scoring=scoref, cv=cv, n_jobs=-1, verbose=1);
search2=search2.fit(model_train[selected_cols],model_train["Transported"])

In [None]:
# Report the best tree parameters and merge them into the running dict.
print("Optimal parameters : " + str(search2.best_params_))
print("Score : " + str(round(search2.best_score_,4)))
optimals_params.update(search2.best_params_)

In [None]:
%%time
# Finally I try to revisit learning rate
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=random_state)
rfs = GradientBoostingClassifier(random_state = random_state, **optimals_params)
grid={'learning_rate': np.arange(0.01,0.4,0.01)}
search3 = GridSearchCV(rfs, grid, scoring=scoref, cv=cv, n_jobs=-1, verbose=1);
search3=search3.fit(model_train[selected_cols],model_train["Transported"])

In [None]:
# Report the best learning rate and merge it into the running dict.
print("Optimal learning rate : " + str(search3.best_params_))
print("Score : " + str(round(search3.best_score_,4)))
optimals_params.update(search3.best_params_)

In [None]:
# Fit the tuned model on all training rows; cross-validation is scored on accuracy.
gbmtune = GradientBoostingClassifier(random_state=random_state, **optimals_params)
gbmtune, _ = fit_model_using_classifier(
    gbmtune,
    dtrain=model_train,
    predictors=selected_cols,
    scoring="accuracy",
)

In [None]:
# Predict with the tuned model and write the predictions as booleans.
tuned_predictions=gbmtune.predict(model_test[selected_cols])
tuned_model_submission=pd.DataFrame({
    "PassengerId": model_test["PassengerId"],
    "Transported": tuned_predictions>0})
tuned_model_submission.to_csv("tuned_model_submission.csv",index=False)
# Score: 0.79798

In [None]:
# Decrease learning rate and increase tree number...
# The adjusted values go into a copy: mutating optimals_params in place halved
# the rate and doubled the trees again on every re-run of this cell.
params_v2=dict(optimals_params)
params_v2["learning_rate"]=optimals_params["learning_rate"]/2
params_v2["n_estimators"]=optimals_params["n_estimators"]*2
gbmtune2 = GradientBoostingClassifier(random_state=random_state,**params_v2)
gbmtune2, _ =fit_model_using_classifier(gbmtune2, 
                                        dtrain=model_train, 
                                        predictors=selected_cols, scoring=scoref)

# Write the predictions as booleans.
tuned_model_submission=pd.DataFrame({
    "PassengerId": model_test["PassengerId"],
    "Transported": gbmtune2.predict(model_test[selected_cols])>0})
tuned_model_submission.to_csv("tuned_model_submission_v2.csv",index=False)

In [None]:
# Hand-picked parameter set for a third variant.
optimals_params={"learning_rate":0.1,
                 "min_samples_split":50,
                 "max_depth":10,
                 "min_samples_leaf": 25,
                 'n_estimators':120}
gbmtune3 = GradientBoostingClassifier(random_state=random_state,**optimals_params)
# Fit gbmtune3 itself: gbmtune2 was passed here before, so the model built
# above was discarded and the v3 submission just repeated v2.
gbmtune3, _ =fit_model_using_classifier(gbmtune3, 
                                        dtrain=model_train, 
                                        predictors=selected_cols, scoring=scoref)

# Write the predictions as booleans.
tuned_model_submission=pd.DataFrame({
    "PassengerId": model_test["PassengerId"],
    "Transported": gbmtune3.predict(model_test[selected_cols])>0})
tuned_model_submission.to_csv("tuned_model_submission_v3.csv",index=False)

# For comparison Random forest classifier

In [None]:
# RandomForestClassifier for comparison, on the same predictors as the baseline.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    n_jobs=-1,
    min_samples_leaf=5,
)
rf, feat_imp = fit_model_using_classifier(
    rf,
    dtrain=model_train,
    predictors=all_cols,
    scoring="accuracy",
)

In [None]:
# Predict with the random forest and write the predictions as booleans.
rf_predictions=rf.predict(model_test[all_cols])
tuned_model_submission=pd.DataFrame({
    "PassengerId": model_test["PassengerId"],
    "Transported": rf_predictions>0})
tuned_model_submission.to_csv("random_forest_submission.csv",index=False)
# around 0.79354 score

# For comparison CatBoost classifier

In [None]:
from catboost import CatBoostClassifier

# CatBoost for comparison, on the same predictors as the baseline.
cb = CatBoostClassifier(random_state=42, verbose=False)
cb, feat_imp = fit_model_using_classifier(
    cb,
    dtrain=model_train,
    predictors=all_cols,
    scoring="accuracy",
)

# Write the predictions as booleans.
cb_predictions=cb.predict(model_test[all_cols])
tuned_model_submission2=pd.DataFrame({
    "PassengerId": model_test["PassengerId"],
    "Transported": cb_predictions>0})
tuned_model_submission2.to_csv("catboost_submission.csv",index=False)