# Importing Libraries & Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load the training split, print its (rows, columns) shape and preview the first rows.
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
print(train_df.shape)
train_df.head()

In [None]:
# Load the test split, print its (rows, columns) shape and preview the first rows.
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')
print(test_df.shape)
test_df.head()

In [None]:
# Load the sample submission to see the expected output layout.
submission_format = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
submission_format.head()

# Preprocessing

# Feature Engineering

**Creating new features for training dataset**

In [None]:
# PassengerId has the form gggg_pp, where gggg identifies the group the
# passenger travels with; keep that group id as its own feature.
# (Regex patterns are raw strings so that \d and \w are not treated as
# invalid string escape sequences.)
train_df['Group'] = train_df['PassengerId'].str.extract(r'(\d\d\d\d)_', expand=True)

# Cabin has the form deck/num/side; split it into one column per component.
train_df['Deck'] = train_df['Cabin'].str.extract(r'([A-Z])/', expand=True)
# NOTE(review): this pattern captures a single digit directly followed by '/',
# i.e. only the last digit of a multi-digit cabin number. The encoding step
# later treats Deck_No as a low-cardinality category, so confirm this is
# intended before widening it to the full number.
train_df['Deck_No'] = train_df['Cabin'].str.extract(r'(\d)/', expand=True)
train_df['Side'] = train_df['Cabin'].str.extract(r'/([A-Z])', expand=True)

# Total amount spent across the five amenities. A missing value in any of the
# five columns makes Bill missing for that row.
train_df['Bill'] = train_df['RoomService']+train_df['FoodCourt']+train_df['ShoppingMall']+train_df['Spa']+train_df['VRDeck']

# Last name: the first capitalised word that follows a space in Name.
train_df['Last_Name'] = train_df['Name'].str.extract(r' ([A-Z]\w{0,})', expand=True)
train_df.head()

**Creating new features for test dataset**

In [None]:
# PassengerId has the form gggg_pp, where gggg identifies the group the
# passenger travels with; keep that group id as its own feature.
# (Regex patterns are raw strings so that \d and \w are not treated as
# invalid string escape sequences.)
test_df['Group'] = test_df['PassengerId'].str.extract(r'(\d\d\d\d)_', expand=True)

# Cabin has the form deck/num/side; split it into one column per component.
test_df['Deck'] = test_df['Cabin'].str.extract(r'([A-Z])/', expand=True)
# NOTE(review): this pattern captures a single digit directly followed by '/',
# i.e. only the last digit of a multi-digit cabin number. The encoding step
# later treats Deck_No as a low-cardinality category, so confirm this is
# intended before widening it to the full number.
test_df['Deck_No'] = test_df['Cabin'].str.extract(r'(\d)/', expand=True)
test_df['Side'] = test_df['Cabin'].str.extract(r'/([A-Z])', expand=True)

# Total amount spent across the five amenities. A missing value in any of the
# five columns makes Bill missing for that row.
test_df['Bill'] = test_df['RoomService']+test_df['FoodCourt']+test_df['ShoppingMall']+test_df['Spa']+test_df['VRDeck']

# Last name: the first capitalised word that follows a space in Name.
test_df['Last_Name'] = test_df['Name'].str.extract(r' ([A-Z]\w{0,})', expand=True)
test_df.head()

**Dropping PassengerId, Cabin and Name columns**

In [None]:
# Modelling copy of the training frame without the raw identifier columns;
# train_df itself is left untouched.
train_df1 = train_df.drop(columns=['PassengerId', 'Cabin', 'Name'])
train_df1.head()

In [None]:
# Modelling copy of the test frame without the raw identifier columns;
# test_df itself is left untouched.
test_df1 = test_df.drop(columns=['PassengerId', 'Cabin', 'Name'])
test_df1.head()

# EDA

In [None]:
# Default matplotlib figure size for the notebook.
plt.rcParams['figure.figsize'] = (18, 8)

# Count of passengers per target class.
sns.catplot(data=train_df1, x='Transported', kind='count')
plt.show()

In [None]:
# One count plot per object-dtype column, with bars split by the target.
object_columns = [name for name in train_df1.columns if train_df1[name].dtype == 'object']
for name in object_columns:
    sns.catplot(data=train_df1, x=name, hue='Transported', kind='count')
    plt.show()

Inference:
1. A higher number of passengers who departed from Europa and Mars were transported; passengers from Earth were not so lucky.
2. More than 70% of passengers in cryosleep were transported.
3. Passengers travelling to '55 Cancri e' were transported more frequently than those heading to other destinations.
4. It seems that toddlers, infants and children were transported first; old people were not so lucky and were transported less frequently.
5. Very few people had a VIP pass, but interestingly a low number of VIP passengers were transported, while a larger number of regular passengers were transported.
6. More of the passengers on decks B, C and G were transported.
7. Passengers with Deck_No 4 and 8 seem to be unlucky; the majority of them were not transported.
8. More of the passengers on side S were transported.

In [None]:
# One bar plot per float64 column: the column's value for each target class.
float_columns = [name for name in train_df1.columns if train_df1[name].dtype == 'float64']
for name in float_columns:
    sns.catplot(data=train_df1, x='Transported', y=name, kind='bar')
    plt.show()

Inference:
1. The majority of passengers who spent more on Room Service were not transported.
2. Surprisingly, the majority of passengers who spent more on Food Court and Shopping Mall were transported.
3. The majority of passengers who spent more on Spa & VRDeck were not transported.
4. Overall, passengers who spent more than 1000 were not transported.

It seems the food court and shopping mall were safe locations on the spaceship.

# Concatenate train & test Data
Combining both sets gives us more insight into the data, and the model should perform better.

In [None]:
# Keep the target column (from train) and the passenger IDs (from test) aside;
# they are needed for fitting and for building the submission files.
target = train_df['Transported']
IDs = test_df['PassengerId']

# Remove the target from the training features. Rebinding with
# errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
# a second run no longer raises KeyError because the column is already gone.
train_df1 = train_df1.drop(columns='Transported', errors='ignore')

# Stack train on top of test so both get identical preprocessing; the rows are
# split apart again later by position.
df = pd.concat([train_df1, test_df1]).reset_index(drop=True)
df

# Imputation

In [None]:
# Fill missing values in every object-dtype column with that column's mode.
object_cols = [name for name in df.columns if df[name].dtype == 'object']
for name in object_cols:
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)
df

# Handling Categorical Columns
***Frequency Encoding***

The Last_Name and Group columns have too many categories, which is why frequency encoding is used for them.

In [None]:
df1 = df.copy()

# Last_Name -> number of rows sharing that last name, then drop the raw column.
fq = df1.groupby('Last_Name').size()
df1[f"{'Last_Name'}_freq_encode"] = df1['Last_Name'].map(fq)
df1 = df1.drop(columns=['Last_Name'])

# Group -> number of rows sharing that group id, then drop the raw column.
fq1 = df1.groupby('Group').size()
df1[f"{'Group'}_freq_encode"] = df1['Group'].map(fq1)
df1 = df1.drop(columns=['Group'])

df1.head()

**Encoding Nominal Categorical columns with get dummies**

In [None]:
# One-hot encode the nominal columns in a single call. Each listed column is
# replaced by its dummy columns (prefixed with the column name, first level
# dropped), appended after the remaining columns in the order given here.
columns = ['HomePlanet','CryoSleep','Destination','VIP','Deck','Deck_No','Side']
df1 = pd.get_dummies(df1, columns=columns, drop_first=True)
df1.head()

**Scaling the dataset**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on df1 and rebuild a DataFrame that keeps df1's
# column names.
scaler = MinMaxScaler()
minmax_df = pd.DataFrame(scaler.fit_transform(df1), columns=df1.columns)
minmax_df.head()

In [None]:
# Missing-value count per column before numeric imputation.
minmax_df.isnull().sum()

# Imputing Numeric columns with knn Imputation

In [None]:
from sklearn.impute import KNNImputer

# Fill the remaining missing values with a 5-neighbour KNN imputer, keeping
# the column names of the scaled frame.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(minmax_df)
minmax_df = pd.DataFrame(imputed_values, columns=minmax_df.columns)

In [None]:
# Missing-value count per column after KNN imputation.
minmax_df.isnull().sum()

In [None]:
# Display the scaled and imputed frame (train and test rows together).
minmax_df

# Split Training & Testing Data

In [None]:
# The combined frame was built as train rows followed by test rows with a
# fresh 0..n-1 index, so the last training label marks the split point.
# .loc slicing is inclusive of the end label.
last_train_label = train_df.index.max()
train_final = minmax_df.loc[:last_train_label, :].copy()
test_final = minmax_df.loc[last_train_label + 1:, :].reset_index(drop=True).copy()

In [None]:
# Display the processed test features.
test_final

In [None]:
# Display the processed training features.
train_final

# Model Selection

In [None]:
!pip install pycaret --ignore-installed llvmlite
!pip install --upgrade pycaret

In [None]:
from pycaret.classification import setup, compare_models

In [None]:
# Encode the boolean target as 0/1.
bool_to_int = {False: 0, True: 1}
target1 = target.map(bool_to_int)
target1

In [None]:
# PyCaret takes features and target in one frame; the target column keeps the
# Series name 'Transported'.
train_with_target = pd.concat([train_final, target1], axis=1)
_ = setup(data=train_with_target, target='Transported')

In [None]:
# Run PyCaret's model comparison on the experiment configured by setup().
compare_models()

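Top five rows reported by `compare_models()` (column headers assumed to follow PyCaret's default layout):

| ID | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
| gbc | Gradient Boosting Classifier | 0.8016 | 0.8834 | 0.8330 | 0.7862 | 0.8086 | 0.6031 | 0.6047 | 0.6530 |
| catboost | CatBoost Classifier | 0.8005 | 0.8901 | 0.8196 | 0.7917 | 0.8051 | 0.6009 | 0.6018 | 5.0490 |
| lightgbm | Light Gradient Boosting Machine | 0.7924 | 0.8811 | 0.8095 | 0.7852 | 0.7969 | 0.5847 | 0.5855 | 0.2220 |
| xgboost | Extreme Gradient Boosting | 0.7895 | 0.8792 | 0.7929 | 0.7900 | 0.7912 | 0.5789 | 0.5793 | 0.9870 |
| rf | Random Forest Classifier | 0.7883 | 0.8708 | 0.7590 | 0.8091 | 0.7828 | 0.5769 | 0.5785 | 0.8120 |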

# Baseline Models

**Gradient Boosting Classifier**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [None]:
# Baseline gradient boosting model with default hyperparameters and a fixed
# seed, fitted on the full training set.
gbc_model = GradientBoostingClassifier(random_state=2)
gbc_model.fit(train_final, target1)

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validated accuracy of the baseline GBC: per-fold scores, then
# their mean.
ten_folds = KFold(n_splits=10)
gbc_results = cross_val_score(gbc_model, train_final, target1, cv=ten_folds, scoring='accuracy')
print(gbc_results)
print(gbc_results.mean())

In [None]:
# Predict on the test features and map the 0/1 labels back to booleans.
int_to_bool = {0: False, 1: True}
gbc_raw_predictions = gbc_model.predict(test_final)
gbc_predictions = pd.Series(gbc_raw_predictions).map(int_to_bool)
gbc_predictions

In [None]:
# Pair each test PassengerId with its prediction in a 'Transported' column.
gbc_submission = pd.concat([IDs, gbc_predictions.rename('Transported')], axis=1)
gbc_submission

In [None]:
# Write the GBC submission file (header row, no index column).
gbc_submission.to_csv('./gbc_submission.csv', index=False, header=True)

**Random Forest Classifier**

In [None]:
# Baseline random forest with default hyperparameters and a fixed seed,
# fitted on the full training set.
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(train_final, target1)

In [None]:
# 10-fold cross-validated accuracy of the random forest.
rfc_results = cross_val_score(rfc_model, train_final, target1, scoring='accuracy', cv=KFold(n_splits=10))
# Report the random forest's own mean score (this cell previously printed the
# gradient boosting result, gbc_results, by mistake).
print(rfc_results.mean())

In [None]:
# Predict on the test features, map 0/1 back to booleans, pair the result with
# the test PassengerIds and write the submission file.
rfc_raw_predictions = rfc_model.predict(test_final)
rfc_predictions = pd.Series(rfc_raw_predictions).map({0:False, 1:True})

rfc_submission = pd.concat([IDs, rfc_predictions.rename('Transported')], axis=1)

rfc_submission.to_csv('./rfc_submission.csv', header=True, index=False)

**CatBoost Classifier**

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Baseline CatBoost model with a fixed seed, fitted on the full training set.
# verbose=1 is passed through to CatBoost's logging option.
catboost_model = CatBoostClassifier(random_state=1, verbose=1)
catboost_model.fit(train_final, target1)

In [None]:
# Mean 10-fold cross-validated accuracy of the CatBoost model.
catboost_folds = KFold(n_splits=10)
catboost_results = cross_val_score(catboost_model, train_final, target1, cv=catboost_folds, scoring='accuracy')
print(catboost_results.mean())

In [None]:
# Predict on the test features, map 0/1 back to booleans, pair the result with
# the test PassengerIds and write the submission file.
catboost_raw_predictions = catboost_model.predict(test_final)
catboost_predictions = pd.Series(catboost_raw_predictions).map({0:False, 1:True})

catboost_submission = pd.concat([IDs, catboost_predictions.rename('Transported')], axis=1)

catboost_submission.to_csv('./catboost_submission.csv', header=True, index=False)

**Light GBM**

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Baseline LightGBM model with default hyperparameters and a fixed seed,
# fitted on the full training set.
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(train_final, target1)

In [None]:
# Mean 10-fold cross-validated accuracy of the LightGBM model.
lgbm_folds = KFold(n_splits=10)
lgbm_results = cross_val_score(lgbm_model, train_final, target1, cv=lgbm_folds, scoring='accuracy')
print(lgbm_results.mean())

**Extreme Gradient Boosting**

In [None]:
from xgboost import XGBClassifier

In [None]:
# Baseline XGBoost model with default hyperparameters and a fixed seed,
# fitted on the full training set.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(train_final, target1)

In [None]:
# Mean 10-fold cross-validated accuracy of the XGBoost model.
xgb_folds = KFold(n_splits=10)
xgb_results = cross_val_score(xgb_model, train_final, target1, cv=xgb_folds, scoring='accuracy')
print(xgb_results.mean())

# Ensemble Baseline Models

In [None]:
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier

In [None]:
# Stack the five baseline models under a logistic-regression meta-classifier.
# use_probas=True and use_features_in_secondary=True are mlxtend
# StackingClassifier options controlling what the meta-classifier is fed.
lr_model = LogisticRegression(random_state=42)
clf_stack = StackingClassifier(classifiers =[xgb_model, lgbm_model, catboost_model, gbc_model, rfc_model], meta_classifier = lr_model, use_probas = True, use_features_in_secondary = True)
# Fit the stack on the full training set; the fitted stack is also bound to
# ensemble_model.
ensemble_model = clf_stack.fit(train_final, target1) 

In [None]:
# Mean 10-fold cross-validated accuracy of the stacked ensemble.
ensemble_folds = KFold(n_splits=10)
ensemble_results = cross_val_score(clf_stack, train_final, target1, cv=ensemble_folds, scoring='accuracy')
print(ensemble_results.mean())

In [None]:
# Predict on the test features, map 0/1 back to booleans, pair the result with
# the test PassengerIds and write the submission file.
ensemble_raw_predictions = clf_stack.predict(test_final)
ensemble_predictions = pd.Series(ensemble_raw_predictions).map({0:False, 1:True})

ensemble_submission = pd.concat([IDs, ensemble_predictions.rename('Transported')], axis=1)

ensemble_submission.to_csv('./ensemble_submission.csv', header=True, index=False)

# Hyperparameter AutoTuning (3 Methods)

### 1. Hyperopt

In [None]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [None]:
# Hyperopt search space for GradientBoostingClassifier.
# NOTE(review): 'log_loss'/'deviance', 'squared_error'/'mse' and 'auto' look
# like version-specific spellings in scikit-learn; confirm which ones the
# installed version accepts, otherwise some trials will fail.
# NOTE(review): hp.quniform yields floats for max_depth, and the uniform
# ranges for min_samples_leaf / min_samples_split start at 0.0; verify these
# against the estimator's parameter constraints.
space = dict(
    loss=hp.choice('loss', ['log_loss', 'deviance','exponential']),
    criterion=hp.choice('criterion', ['friedman_mse', 'squared_error','mse']),
    max_depth=hp.quniform('max_depth', 1, 1000, 10),
    learning_rate=hp.uniform('learning_rate', 0.01,1 ),
    subsample=hp.uniform('subsample', 0.01,1.0 ),
    max_features=hp.choice('max_features', ['auto', 'sqrt','log2', None]),
    min_samples_leaf=hp.uniform('min_samples_leaf', 0.0, 1.0),
    min_samples_split=hp.uniform ('min_samples_split', 0.0, 1.0),
    n_estimators=hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500]),
)

In [None]:
def objective(space):
    """Hyperopt objective for tuning a GradientBoostingClassifier.

    Parameters
    ----------
    space : dict
        One sampled point of the search space, providing the keys read below
        ('loss', 'criterion', 'max_depth', 'max_features', 'learning_rate',
        'subsample', 'min_samples_leaf', 'min_samples_split', 'n_estimators').

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}`` where ``accuracy`` is the
        mean 5-fold cross-validated accuracy on ``train_final`` / ``target1``.
    """
    hyperopt_gbc = GradientBoostingClassifier(
        loss=space['loss'],
        criterion=space['criterion'],
        # hp.quniform returns floats (e.g. 880.0) but max_depth must be an
        # integer, so convert before handing it to the estimator.
        max_depth=int(space['max_depth']),
        max_features=space['max_features'],
        learning_rate=space['learning_rate'],
        subsample=space['subsample'],
        min_samples_leaf=space['min_samples_leaf'],
        min_samples_split=space['min_samples_split'],
        n_estimators=space['n_estimators'],
    )

    hyperopt_accuracy = cross_val_score(hyperopt_gbc, train_final, target1, scoring='accuracy', cv = 5).mean()

    # fmin minimises, so accuracy is returned negated in order to maximise it.
    return {'loss': -hyperopt_accuracy, 'status': STATUS_OK }

In [None]:
# Run 80 TPE evaluations of the objective over the search space; `trials`
# records every evaluation and the best point found is displayed.
trials = Trials()
hyperopt_gbc_best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
hyperopt_gbc_best

In [None]:
# Refit a GBC with hard-coded hyperparameters from a previous Hyperopt run.
# max_depth is written as the integer 880 (it was the float 880.0) because the
# estimator expects an integer tree depth.
hyperopt_gbc = GradientBoostingClassifier(
    criterion='friedman_mse',
    learning_rate=0.2529000101831134,
    loss='exponential',
    max_depth=880,
    max_features='log2',
    min_samples_leaf=0.1390590438108078,
    min_samples_split=0.4005619783924089,
    subsample=0.8631048262047397,
    n_estimators=300,
)
hyperopt_gbc.fit(train_final, target1)

In [None]:
# Mean 10-fold cross-validated accuracy of the Hyperopt-tuned GBC.
hyperopt_folds = KFold(n_splits=10)
hyperopt_gbc_results = cross_val_score(hyperopt_gbc, train_final, target1, cv=hyperopt_folds, scoring='accuracy')
print(hyperopt_gbc_results.mean())

In [None]:
# Predict on the test features and map the 0/1 labels back to booleans.
hyperopt_gbc_predictions = pd.Series(hyperopt_gbc.predict(test_final)).map({0:False, 1:True})

In [None]:
# Pair each test PassengerId with its prediction in a 'Transported' column.
hyperopt_gbc_submission = pd.concat([IDs, hyperopt_gbc_predictions.rename('Transported')], axis=1)

In [None]:
# Write the Hyperopt-tuned GBC submission file (header row, no index column).
hyperopt_gbc_submission.to_csv('./hyperopt_gbc_submission.csv', index=False, header=True)

### 2. Genetic Algorithms


In [None]:
from tpot import TPOTClassifier

In [None]:
# Candidate values per GradientBoostingClassifier hyperparameter for TPOT.
# NOTE(review): 'log_loss'/'deviance', 'squared_error'/'mse' and 'auto' look
# like version-specific spellings in scikit-learn; confirm against the
# installed version.
param = dict(
    loss=['log_loss', 'deviance', 'exponential'],
    criterion=['friedman_mse', 'squared_error', 'mse'],
    # Ten evenly spaced depths from 1 to 1000, truncated to integers.
    max_depth=np.linspace(1, 1000, 10).astype(int).tolist(),
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 1],
    subsample=[0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 1],
    max_features=['auto', 'sqrt', 'log2', None],
    min_samples_leaf=[1, 2, 4, 6, 8],
    min_samples_split=[2, 5, 10, 14],
    n_estimators=[10, 50, 300, 750, 1200, 1300, 1500],
)

In [None]:
# Genetic search with TPOT, restricted via config_dict to
# GradientBoostingClassifier with the candidate values in `param`; candidates
# are scored by 5-fold cross-validated accuracy.
tpot_gbc = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 early_stop= 15,
                                 config_dict={'sklearn.ensemble.GradientBoostingClassifier': param}, 
                                 cv = 5, scoring = 'accuracy')
tpot_gbc.fit(train_final,target1)

In [None]:
# Predict on the test features, map 0/1 back to booleans, pair the result with
# the test PassengerIds and write the submission file.
tpot_gbc_raw_predictions = tpot_gbc.predict(test_final)
tpot_gbc_predictions = pd.Series(tpot_gbc_raw_predictions).map({0:False, 1:True})

tpot_gbc_submission = pd.concat([IDs, tpot_gbc_predictions.rename('Transported')], axis=1)

tpot_gbc_submission.to_csv('./tpot_gbc_submission.csv', header=True, index=False)

### 3. Optuna

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective for tuning a GradientBoostingClassifier.

    Samples max_depth, n_estimators, learning_rate, subsample and
    min_samples_split from ``trial``; every other hyperparameter (loss,
    criterion, max_features, min_samples_leaf) is left at its default.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean 5-fold cross-validated accuracy on ``train_final`` / ``target1``
        (the study is expected to maximise it).
    """
    # Integer-valued parameters are drawn with suggest_int so the estimator
    # receives real ints; the former suggest_discrete_uniform returned floats.
    max_depth = trial.suggest_int("max_depth", 1, 15)
    n_estimators = trial.suggest_categorical('n_estimators', [10, 50, 100, 300, 500, 1000])
    # suggest_float(..., step=q) replaces the deprecated
    # suggest_discrete_uniform / suggest_uniform calls.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1, step=0.01)
    subsample = trial.suggest_float('subsample', 0.01, 1.0)
    # An integer min_samples_split must be at least 2, so the range starts
    # at 2 instead of 1.
    min_samples_split = trial.suggest_int("min_samples_split", 2, 15)

    # GradientBoostingClassifier and cross_val_score are the names imported in
    # earlier cells; the bare `sklearn` module is never imported in this
    # notebook, so `sklearn.ensemble...` raised NameError here.
    optuna_gbc = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        random_state=42,
        min_samples_split=min_samples_split,
    )
    return cross_val_score(optuna_gbc, train_final, target1, scoring='accuracy', n_jobs=-1, cv=5).mean()

In [None]:
# Maximise the objective over 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Best trial of the study: its objective value and sampled hyperparameters.
trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Refit a GBC with hard-coded hyperparameters from a previous Optuna run.
# random_state=42 matches the seed used inside the Optuna objective, so the
# refit model is reproducible and comparable with the tuned score
# (subsample < 1 makes the fit stochastic without a seed).
optuna_gbc = GradientBoostingClassifier(
    max_depth=6,
    n_estimators=10,
    learning_rate=0.29000000000000004,
    subsample=0.8899831698554419,
    min_samples_split=15,
    random_state=42,
)
optuna_gbc.fit(train_final,target1)

In [None]:
# 10-fold cross-validated accuracy of the Optuna-tuned GBC.
optuna_gbc_results = cross_val_score(optuna_gbc, train_final, target1, scoring='accuracy', cv=KFold(n_splits=10))
# Report the tuned model's own mean score (this cell previously printed the
# baseline gbc_results by mistake).
print(optuna_gbc_results.mean())

In [None]:
# Predict on the test features, map 0/1 back to booleans, pair the result with
# the test PassengerIds and write the submission file.
optuna_gbc_raw_predictions = optuna_gbc.predict(test_final)
optuna_gbc_predictions = pd.Series(optuna_gbc_raw_predictions).map({0:False, 1:True})

optuna_gbc_submission = pd.concat([IDs, optuna_gbc_predictions.rename('Transported')], axis=1)

optuna_gbc_submission.to_csv('./optuna_gbc_submission.csv', header=True, index=False)

# Please upvote if you found this notebook helpful :)
## I will be updating this notebook soon in order to achieve higher accuracy