In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

from xgboost import DMatrix
from xgboost import cv
from xgboost import train
from xgboost import XGBClassifier

import warnings

In [None]:
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Load the raw training and test tables from the competition input folder.
train_raw, test_raw = map(
    pd.read_csv,
    (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    ),
)

# Data Preprocessing

In [None]:
# Fill values learned from the training data, keyed by column name.
# The same values are reused later to fill gaps in the test data.
standard = dict()

### PassengerId

In [None]:
# Split PassengerId on '_' into two string columns: 'Group' and 'Number'.
passenger_id_parts = train_raw['PassengerId'].str.split(pat='_', expand=True)
train_raw[['Group', 'Number']] = passenger_id_parts

### HomePlanet

In [None]:
# The most frequent home planet in the training data is the fill value;
# it is stored in `standard` so the test data can be filled the same way.
home_planet_modes = train_raw['HomePlanet'].mode()
standard['HomePlanet'] = home_planet_modes[0]

train_raw['HomePlanet'] = train_raw['HomePlanet'].fillna(value=standard['HomePlanet'])

# Passenger counts per home planet, split by the Transported label.
sns.countplot(x="HomePlanet", hue="Transported", data=train_raw);

### CryoSleep

In [None]:
# The most frequent CryoSleep value in the training data is the fill value.
cryo_sleep_modes = train_raw['CryoSleep'].mode()
standard['CryoSleep'] = cryo_sleep_modes[0]

train_raw['CryoSleep'] = train_raw['CryoSleep'].fillna(value=standard['CryoSleep'])

# Passenger counts per CryoSleep value, split by the Transported label.
sns.countplot(x="CryoSleep", hue="Transported", data=train_raw);

### Cabin

In [None]:
# Fill missing cabins with the most frequent cabin string of the training data.
cabin_modes = train_raw['Cabin'].mode()
standard['Cabin'] = cabin_modes[0]

train_raw['Cabin'] = train_raw['Cabin'].fillna(value=standard['Cabin'])

# Split the cabin string on '/' into its three components.
cabin_parts = train_raw['Cabin'].str.split(pat='/', expand=True)
train_raw[['Deck', 'CabinNumber', 'Side']] = cabin_parts

# Optional exploration plots (disabled):
#sns.countplot(data=train_raw, x= "Deck", hue="Transported");
#sns.countplot(data=train_raw, x= "CabinNumber", hue="Transported");
#sns.countplot(data=train_raw, x= "Side", hue="Transported");

### Destination

In [None]:
# The most frequent destination in the training data is the fill value.
destination_modes = train_raw['Destination'].mode()
standard['Destination'] = destination_modes[0]

train_raw['Destination'] = train_raw['Destination'].fillna(value=standard['Destination'])

# Passenger counts per destination, split by the Transported label.
sns.countplot(x="Destination", hue="Transported", data=train_raw);

### Age

In [None]:
# Missing ages are filled with the mean age of the training data.
mean_age = train_raw['Age'].mean()
standard['Age'] = mean_age

train_raw['Age'] = train_raw['Age'].fillna(value=standard['Age'])

# Optional exploration plot (disabled):
# sns.countplot(data=train_raw, x= "Age", hue="Transported");

### VIP

In [None]:
# The most frequent VIP value in the training data is the fill value.
vip_modes = train_raw['VIP'].mode()
standard['VIP'] = vip_modes[0]

train_raw['VIP'] = train_raw['VIP'].fillna(value=standard['VIP'])

# Passenger counts per VIP value, split by the Transported label.
sns.countplot(x="VIP", hue="Transported", data=train_raw);

### Spending (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)

In [None]:
# The five spending columns; each is filled with the mean of its VIP group.
Spending = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for s in Spending:
    # Mean of the column per VIP status: a Series indexed by the VIP values.
    standard[s] = train_raw.groupby(by='VIP')[s].mean()

    # Look up every passenger's group mean through their own VIP flag.
    # Passing the grouped Series directly to fillna would align it on the
    # row labels of train_raw instead of on VIP, leaving the gaps unfilled.
    vip_group_mean = train_raw['VIP'].map(standard[s])
    train_raw[s] = train_raw[s].fillna(vip_group_mean)

# Long format (one row per passenger and spending column) for faceted plotting.
S = pd.melt(train_raw[[*Spending, 'Transported']], value_vars = [*Spending],id_vars= 'Transported')

# One density plot per spending column, split by the Transported label.
ax = sns.displot(S, x='value', hue='Transported', col='variable', kind='kde',col_wrap= 5);
ax.set(xlim = (0,500));

In [None]:
# 'Basic' is the row-wise sum of FoodCourt and ShoppingMall spending.
basic_spending = train_raw[['FoodCourt', 'ShoppingMall']]
train_raw['Basic'] = basic_spending.sum(axis='columns')

# Distribution of 'Basic', split by the Transported label; x axis limited to 0..500.
ax = sns.displot(data=train_raw, x="Basic", hue="Transported");
ax.set(xlim=(0, 500));

### Choosing Features

In [None]:
# Columns that are min-max scaled.
numeric = [
    'Age',
    'Basic',
]
# Columns that are one-hot encoded.
categorical = [
    'Group',
    'HomePlanet',
    'Deck',
    'Side',
    'Destination',
]
# True/False columns that are converted to 1/0.
binary = [
    'VIP',
    'CryoSleep',
]
# Columns removed before modelling.
drop = [
    'PassengerId',
    'Number',
    'Cabin',
    'CabinNumber',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Name',
]

In [None]:
# Remove the columns listed in `drop` from the training table.
train_raw = train_raw.drop(labels=drop, axis='columns')

In [None]:
# Normalize the numeric columns; the scaler is fitted on the training data
# and kept so the test data can be transformed with the same fit.
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(train_raw[numeric])
train_num = pd.DataFrame(scaled_values, columns=train_raw[numeric].columns)

# Print the names of scaled columns that still contain missing values.
print([name for name in train_num.columns if train_num[name].isnull().any()])

In [None]:
# One-hot encode the categorical columns. Categories unseen during fit are
# ignored at transform time (handle_unknown='ignore').
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version.
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_values = onehot_encoder.fit_transform(train_raw[categorical])
train_cat = pd.DataFrame(encoded_values)

# Print the encoded columns that contain missing values.
print([name for name in train_cat.columns if train_cat[name].isnull().any()])

In [None]:
# Convert the True/False columns to integers (True -> 1, False -> 0).
# astype is used instead of replace so the result is a real integer dtype
# and does not depend on pandas silently downcasting object columns.
train_bi = train_raw[binary].astype('int64')

# Print the binary columns that contain missing values
# (this check must look at train_bi, not train_cat).
print([col for col in train_bi.columns if train_bi[col].isnull().any()])

In [None]:
# Build the feature matrix by placing the scaled numeric, one-hot encoded
# and binary columns side by side.
feature_parts = [train_num, train_cat, train_bi]
X = pd.concat(feature_parts, axis='columns')
X.index = train_num.index
print(X.head())

# Target labels.
y = train_raw['Transported']

# Names of feature columns that still contain missing values.
cols_train_with_missing = [name for name in X.columns if X[name].isnull().any()]
print(cols_train_with_missing)

In [None]:
from sklearn.linear_model  import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: logistic regression scored by 5-fold cross-validated accuracy.
model_lr = LogisticRegression(random_state=30)

cv_scores = cross_val_score(model_lr, X, y, scoring='accuracy', cv=5)

# Mean accuracy over the folds.
print(cv_scores.mean())

# Tuning Parameters of XGBoost

In [None]:
# Starting parameters; the search cells overwrite entries one group at a time.
params = {
    'eta': .3,
    'subsample': 1,
    'colsample_bytree': 1,
    'gamma': 0.,
    'objective': 'binary:logistic',
    'random_state': 30,
}

# Every (max_depth, min_child_weight) pair to evaluate.
gridsearch_params = []
for max_depth in range(9, 12):
    for min_child_weight in range(5, 8):
        gridsearch_params.append((max_depth, min_child_weight))

min_error = float("Inf")
best_params = None
for candidate in gridsearch_params:
    max_depth, min_child_weight = candidate
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Try this pair on top of the current parameters.
    params.update(max_depth=max_depth, min_child_weight=min_child_weight)
    # 5-fold CV on the classification error, stopping after 10 rounds without improvement.
    cv_results = cv(
        params=params,
        dtrain=DMatrix(X, y),
        num_boost_round=1000,
        nfold=5,
        metrics='error',
        early_stopping_rounds=10,
        seed=30,
    )
    # Lowest mean test error and the position at which it occurs.
    test_error_curve = cv_results['test-error-mean']
    mean_error = test_error_curve.min()
    boost_rounds = test_error_curve.argmin()
    print("\tERROR {} for {} rounds".format(mean_error, boost_rounds))
    # Keep the pair with the lowest error seen so far.
    if mean_error < min_error:
        min_error, best_params = mean_error, (max_depth, min_child_weight)
print("Best params: {}, {}, ERROR: {}".format(best_params[0], best_params[1], min_error))

# Carry the winning pair into the next search steps.
params['max_depth'], params['min_child_weight'] = best_params[0], best_params[1]

In [None]:
# Every (subsample, colsample_bytree) pair to evaluate, from 0.7 to 1.0.
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]


min_error = float("Inf")
best_params = None
# Pairs are tried from the largest values down to the smallest.
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # Update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = cv(
        params=params,
        dtrain=DMatrix(X, y),
        num_boost_round=1000,
        nfold=5,
        metrics='error',
        early_stopping_rounds=10,
        seed=30,
    )
    # Lowest mean test error and the position at which it occurs.
    mean_error = cv_results['test-error-mean'].min()
    boost_rounds = cv_results['test-error-mean'].argmin()
    print("\tERROR {} for {} rounds".format(mean_error, boost_rounds))
    if mean_error < min_error:
        min_error = mean_error
        best_params = (subsample, colsample)
print("Best params: {}, {}, ERROR: {}".format(best_params[0], best_params[1], min_error))

# Store the winners under the keys the loop actually tuned. Writing to a
# separate 'colsample' key would leave 'colsample_bytree' at the last value
# tried instead of the best one.
params['subsample'] = best_params[0]
params['colsample_bytree'] = best_params[1]

In [None]:
# Search gamma over 0.0, 0.1, ..., 0.4 with the other parameters held fixed.
gamma_candidates = [step / 10.0 for step in range(5)]

min_error = float("Inf")
best_params = None
for gamma in gamma_candidates:
    print("CV with gamma={}".format(gamma))
    # Try this gamma on top of the current parameters.
    params.update(gamma=gamma)
    # 5-fold CV on the classification error, stopping after 10 rounds without improvement.
    cv_results = cv(
        params=params,
        dtrain=DMatrix(X, y),
        num_boost_round=1000,
        nfold=5,
        metrics='error',
        early_stopping_rounds=10,
        seed=30,
    )
    # Lowest mean test error and the position at which it occurs.
    test_error_curve = cv_results['test-error-mean']
    mean_error = test_error_curve.min()
    boost_rounds = test_error_curve.argmin()
    print("\tERROR {} for {} rounds\n".format(mean_error, boost_rounds))
    # Keep the gamma with the lowest error seen so far.
    if mean_error < min_error:
        min_error, best_params = mean_error, gamma
print("Best params: {}, ERROR: {}".format(best_params, min_error))

# Carry the winning gamma into the next search steps.
params['gamma'] = best_params

In [None]:
# Search reg_alpha with the other parameters held fixed.
# This cell does not touch params['gamma']: re-assigning it from `best_params`
# here would overwrite gamma with the reg_alpha winner whenever the cell is
# run a second time.
min_error = float("Inf")
best_params = None
for reg_alpha in [1e-5, 1e-2, 0.1, 1, 100]:
    print("CV with reg_alpha={}".format(reg_alpha))
    # We update our parameters
    params['reg_alpha'] = reg_alpha
    # 5-fold CV on the classification error, stopping after 10 rounds without improvement.
    cv_results = cv(
        params=params,
        dtrain=DMatrix(X, y),
        num_boost_round=1000,
        nfold=5,
        metrics='error',
        early_stopping_rounds=10,
        seed=30,
    )
    # Update best score
    mean_error = cv_results['test-error-mean'].min()
    boost_rounds = cv_results['test-error-mean'].argmin()
    print("\tERROR {} for {} rounds\n".format(mean_error, boost_rounds))
    if mean_error < min_error:
        min_error = mean_error
        best_params = reg_alpha
print("Best params: {}, ERROR: {}".format(best_params, min_error))

# Carry the winning reg_alpha into the next search step.
params['reg_alpha'] = best_params

In [None]:
# Search the learning rate (eta) with the other parameters held fixed.
eta_candidates = [.3, .2, .1, .05, .01, .005]

min_error = float("Inf")
best_params = None
for eta in eta_candidates:
    print("CV with eta={}".format(eta))
    # Try this learning rate on top of the current parameters.
    params.update(eta=eta)
    # 5-fold CV on the classification error, stopping after 10 rounds without improvement.
    cv_results = cv(
        params=params,
        dtrain=DMatrix(X, y),
        num_boost_round=1000,
        nfold=5,
        metrics='error',
        early_stopping_rounds=10,
        seed=30,
    )
    # Lowest mean test error and the position at which it occurs.
    test_error_curve = cv_results['test-error-mean']
    mean_error = test_error_curve.min()
    boost_rounds = test_error_curve.argmin()
    print("\tERROR {} for {} rounds\n".format(mean_error, boost_rounds))
    # Keep the learning rate with the lowest error seen so far.
    if mean_error < min_error:
        min_error, best_params = mean_error, eta
print("Best params: {}, ERROR: {}".format(best_params, min_error))

# Store the winning learning rate.
params['eta'] = best_params

# Predicting Test data

In [None]:
# Hold out part of the training data for validation. The split is seeded
# (same seed value as the rest of the notebook) so a re-run reproduces it.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, random_state=30)
print(x_train.head())

In [None]:
# Apply the training-time preprocessing to the test data, reusing the fill
# values stored in `standard`.
test_raw[['Group', 'Number']] = test_raw['PassengerId'].str.split('_', expand=True)

test_raw['HomePlanet'] = test_raw['HomePlanet'].fillna(standard['HomePlanet'])

test_raw['CryoSleep'] = test_raw['CryoSleep'].fillna(standard['CryoSleep'])

test_raw['Cabin'] = test_raw['Cabin'].fillna(standard['Cabin'])
test_raw[['Deck', 'CabinNumber', 'Side']] = test_raw['Cabin'].str.split('/', expand=True)

test_raw['Destination'] = test_raw['Destination'].fillna(standard['Destination'])

test_raw['Age'] = test_raw['Age'].fillna(standard['Age'])

test_raw['VIP'] = test_raw['VIP'].fillna(standard['VIP'])

# Only the two spending columns that feed 'Basic' are filled; the stored fill
# value is a Series of per-VIP means, so it is looked up through each
# passenger's VIP flag. Passing that Series directly to fillna would align it
# on the row labels of test_raw instead of on VIP.
for spend_col in ['FoodCourt', 'ShoppingMall']:
    vip_group_mean = test_raw['VIP'].map(standard[spend_col])
    test_raw[spend_col] = test_raw[spend_col].fillna(vip_group_mean)
test_raw['Basic'] = test_raw[['FoodCourt', 'ShoppingMall']].sum(axis=1)

# Keep the passenger ids for the submission file before they are dropped.
Id = test_raw.PassengerId
# drop unwanted features
test_raw = test_raw.drop(columns=drop)

In [None]:
# normalize the numerical columns with the scaler fitted on the training data
test_num = pd.DataFrame(min_max_scaler.transform(test_raw[numeric]))
test_num.columns = test_raw[numeric].columns

# one-hot encode the categorical columns with the encoder fitted on the training data
test_cat = pd.DataFrame(onehot_encoder.transform(test_raw[categorical]))

# convert binary columns to int (True -> 1, False -> 0); astype gives a real
# integer dtype without relying on pandas downcasting object columns
test_bi = test_raw[binary].astype('int64')

# check if missing (the last check must look at test_bi, not train_bi)
print([col for col in test_num.columns if test_num[col].isnull().any()])
print([col for col in test_cat.columns if test_cat[col].isnull().any()])
print([col for col in test_bi.columns if test_bi[col].isnull().any()])

# Assemble the test feature matrix in the same column order as X.
X_test = pd.concat([test_num, test_cat, test_bi], axis=1)
X_test.index = test_num.index

In [None]:
# Fit the logistic regression baseline on all training rows and predict the test set.
model_lr.fit(X, y)
lr_labels = model_lr.predict(X_test)
predictions = pd.DataFrame(lr_labels)

# Pair every passenger id with its predicted label.
result = pd.concat([Id, predictions], axis='columns')
result.columns = ['PassengerId', 'Transported']

print(result)

# Write the submission file (without the index column).
result.to_csv('result.csv', index=False)

In [None]:
# Final XGBoost model.
# NOTE(review): these hyperparameters are hard-coded rather than read from the
# tuned `params` dict — confirm they match the search results above.
best_model = XGBClassifier(
    n_estimators = 5000,
    max_depth = 10,
    min_child_weight = 7,
    subsample = 0.8,
    colsample_bytree = 0.8,
    gamma = 0.4,
    reg_alpha = 0.01,
    learning_rate = 0.1,
    random_state = 30,
    eval_metric = 'error',
)

# Fit on the training split only. Fitting on the full X would include the
# validation rows in the training data, so the validation error used for
# early stopping would no longer be a held-out measurement.
best_model.fit(
    x_train, y_train,
    eval_set = [(x_train, y_train), (x_valid, y_valid)],
    early_stopping_rounds = 20,
)
# Cast to bool so the Transported column holds True/False whether predict
# returns booleans or 0/1 integers.
predictions = pd.DataFrame(best_model.predict(X_test).astype(bool))

# Pair every passenger id with its predicted label.
result = pd.concat([Id, predictions], axis=1)
result.columns = ['PassengerId', 'Transported']

print(result)

# Write the submission file (without the index column).
result.to_csv('result.csv', index=False)