In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [None]:
# Walk the Kaggle input directory and print the full path of every file found,
# so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


### Dataset 


**PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

**HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.

**CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

**Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

**Destination** - The planet where the passenger will disembark.

**Age** - The age of the passenger.

**VIP** - Whether the passenger has paid for special VIP service during the voyage.

**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

**Name** - The first and last names of the passenger.

**Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.


In [None]:
# Load the competition's test and train CSV files into DataFrames.
test =  pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
# Display the first rows of the training data as the cell output.
train.head()

In [None]:
# Summary statistics of the training data (pandas `describe` defaults).
train.describe()

In [None]:
# Count how many training columns there are of each dtype.
train.dtypes.value_counts()

In [None]:
# Report the size of the training frame and the total number of missing cells.
n_rows, n_cols = train.shape
n_missing = train.isna().sum().sum()  # per-column NaN counts, then summed over columns
print(f"The shape of the data is {n_rows} rows and {n_cols} columns")
print(f"Data missing in train is a total of {n_missing} data points")

In [None]:
# Number of missing values per training column, largest first.
train.isna().sum().sort_values(ascending = False)

In [None]:
# Columns removed from both frames before modelling.
cols_to_drop = ['Name', 'PassengerId']

# Rebind to new frames instead of dropping in place: with `inplace=True` a
# second execution of this cell raised KeyError because the columns were
# already gone. `errors='ignore'` makes the cell safe to re-run.
train = train.drop(columns = cols_to_drop, errors = 'ignore')
test = test.drop(columns = cols_to_drop, errors = 'ignore')


### EDA 

In [None]:
# Name of the target column and of the identifier column.
OUTCOME = 'Transported'
ID_VAR = 'PassengerId'

# Every training column that is neither the target nor the identifier.
PREDICTORS = [col for col in train.columns if col not in (OUTCOME, ID_VAR)]

In [None]:
train[OUTCOME].value_counts().plot(kind = 'bar') # bar chart of target class counts; original author's note: the data are balanced

In [None]:
# Percentage of missing values per training column, plotted in ascending order.
# The trailing semicolon suppresses the textual repr of the returned Axes.
(train.isna().sum() / len(train) * 100).sort_values().plot(kind = 'bar', ylabel = 'percentage of data missing');

In [None]:
# Per-column count of missing values in the training data.
t = train.isna().sum()
# Labels of the columns that have at least one missing value.
cols_with_missing = t.index[t.gt(0)]


In [None]:
imputer = KNNImputer(n_neighbors=50)
lab_enc = LabelEncoder()


def impute_data_frame(df, imputer = imputer, fit = True):
    """Return a copy of ``df`` with missing values filled in.

    Float columns are imputed together with ``imputer``; object columns have
    their missing entries replaced by the marker ``'M'`` and are cast to ``str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a copy is returned.
    imputer : object with ``fit_transform`` / ``transform``
        Imputer applied to the float columns (defaults to the shared
        ``KNNImputer`` created above).
    fit : bool, default True
        When True the imputer is fitted on ``df`` before transforming it.
        Pass False to reuse an imputer that was already fitted (e.g. on the
        training data) so that train and test are imputed consistently.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.

    Raises
    ------
    AssertionError
        If any missing value remains after imputation.
    """
    imputed = df.copy()

    # Select columns from the frame itself rather than from a global list that
    # was computed on the training data only.
    dtypes = imputed.dtypes
    cont_cols = list(dtypes[dtypes == float].index)
    cat_cols = list(dtypes[dtypes == object].index)

    if cont_cols:
        if fit:
            imputed[cont_cols] = imputer.fit_transform(imputed[cont_cols])
        else:
            # Requires the same float columns the imputer was fitted on.
            imputed[cont_cols] = imputer.transform(imputed[cont_cols])

    # Treat every object column the same way, so the string encoding does not
    # depend on which columns happen to contain missing values.
    for cat in cat_cols:
        imputed[cat] = imputed[cat].fillna('M').astype(str)

    assert not imputed.isna().any().any(), "missing values remain after imputation"

    return imputed


# Fit the imputer on the training data only, then reuse it for the test data.
train = impute_data_frame(train)
test = impute_data_frame(test, fit = False)

### Prepare data for modelling

In [None]:
# Split the training frame into feature matrix and target vector, and take
# the same feature columns from the test frame.
X_train, y_train = train[PREDICTORS], train[OUTCOME]
X_test = test[PREDICTORS]

# Guard against the target leaking into the features.
assert OUTCOME not in X_train.columns

In [None]:
# Cabin has too many distinct values to encode directly, so only its first
# character is kept and the rest of the cabin string is discarded.
def format_cabin(df):
    """Return a copy of ``df`` whose 'Cabin' column holds only the first character of each value."""
    first_chars = df['Cabin'].map(lambda cabin: cabin[0])
    return df.assign(Cabin=first_chars)

# Reduce the Cabin column to its first character in both feature frames.
X_train = format_cabin(X_train)
X_test = format_cabin(X_test)

In [None]:
def run_one_hot(df, columns = None):
    """One-hot encode ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    columns : sequence of column labels, optional
        When given, the encoded frame is reindexed to exactly these columns:
        dummy columns absent from ``df`` are added filled with 0, and columns
        not listed are dropped. Use this to give the test frame the same
        layout as the training frame.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    encoded = pd.get_dummies(df)
    if columns is not None:
        encoded = encoded.reindex(columns = columns, fill_value = 0)
    return encoded

X_train = run_one_hot(X_train)
# Encoding the two frames independently gives different columns whenever a
# category occurs in only one of them; align test to the training layout.
X_test = run_one_hot(X_test, columns = X_train.columns)

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# scaling to the test features.
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

### XGBoost

In [None]:
def tune_xgbr(X_train, y_train, n_iter = 10, cv = 3, random_state = None):
    """Run a randomised hyper-parameter search for an ``XGBRegressor``.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed unchanged to
        ``RandomizedSearchCV.fit``.
    n_iter : int, default 10
        Number of parameter settings sampled by the search.
    cv : int, default 3
        Cross-validation setting passed to ``RandomizedSearchCV``.
    random_state : int or None, default None
        Seed passed to both the regressor and the search. Set an integer to
        make the sampled candidates repeatable between runs.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    # Candidate values sampled by the search.
    # NOTE(review): the objective is squared-error regression although the
    # notebook's target is a yes/no outcome — confirm this is intended.
    params = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['reg:squarederror']
    }

    xgbr = xgb.XGBRegressor(random_state = random_state)

    grid = RandomizedSearchCV(estimator = xgbr,
                              param_distributions = params,
                              cv = cv,
                              n_iter = n_iter,
                              verbose = 1,
                              n_jobs = -1,
                              random_state = random_state)

    grid.fit(X_train, y_train)

    return grid.best_params_

In [None]:
# Hyper-parameters selected earlier; the search itself is not re-run here.
# The call that would run the search is kept below for reference:
# tune_xgbr(X_train, y_train)

# Selected hyper-parameter values, recorded as a plain dict.
d={'subsample': 0.5,
 'objective': 'reg:squarederror',
 'n_estimators': 500,
 'min_child_weight': 5,
 'max_depth': 7,
 'learning_rate': 0.01,
 'colsample_bytree': 0.5}

print(d)

In [None]:
xgbr = xgb.XGBRegressor(subsample= 0.5,
                        objective = 'reg:squarederror',
                        n_estimators =  500,
                        min_child_weight = 5,
                        max_depth= 7,
                        learning_rate = 0.01,
                        colsample_bytree= 0.5)
%time xgbr.fit(X_train, y_train)

In [None]:
# Start from the sample submission so the row order and id column match.
sub = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
predictions = xgbr.predict(X_test)
# Threshold the regression output explicitly. `round().astype(bool)` marks any
# value that rounds to a non-zero integer as True, so an out-of-range
# prediction such as -0.6 (rounds to -1) would wrongly become True.
sub['Transported'] = predictions > 0.5
sub.to_csv('submission.csv', index=False)