In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None  # default='warn'

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition CSVs: training data, test data and the sample submission template.
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
submit_df = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
# `df` holds references to both frames so later `for X in df:` loops engineer
# the same features on train and test in place.
df = [train_df, test_df]
train_df.info()

different datatypes
* numerical : Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
* categorical : HomePlanet, CryoSleep, Destination, VIP
* mixed : Cabin, Name
* target : Transported

In [None]:
# Split the training frame into a numerical view and a categorical view,
# each carrying the target column for later comparison.
num_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

train_df_num = train_df[num_features + ['Transported']]
train_df_cat = train_df[cat_features + ['Transported']]

# Look at Data

In [None]:
# Age distribution, one-year bins, split by the target.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=train_df, x="Age", hue="Transported", binwidth=1, kde=True, ax=ax)

Age looks like a normal distribution 

people less than 10yr are more likely to be transported and people in the 20-30 range 
are less likely

plan:

fill in missing values

make age a normal distribution

Next, look at the numerical data related to using money

In [None]:
numerical_cols = ['ShoppingMall', 'RoomService', 'FoodCourt',  'Spa','VRDeck']

# For each spending column: full-range violin plot on the left,
# the same plot zoomed to [-500, 2000] on the right.
for spend_col in numerical_cols:
    fig, (full_ax, zoom_ax) = plt.subplots(1, 2, figsize=(20, 5))
    sns.violinplot(data=train_df, x="Transported", y=spend_col, ax=full_ax)
    sns.violinplot(data=train_df, x="Transported", y=spend_col, ax=zoom_ax)
    zoom_ax.set_ylim([-500, 2000])
    plt.show()

Food Court and Shopping Mall have around the same distribution

Spa, VRDeck, and RoomService share the same distribution

plan: 

group up the numerical data into two different values, luxury and regular


Check each value's correlation with Transported

In [None]:
# Lower-triangle heatmap of the numerical feature correlations.
num_corr = train_df_num.corr()
fig, ax = plt.subplots(figsize=(10, 10))
mask = np.triu(np.ones_like(num_corr))  # hides the upper triangle and the diagonal
sns.heatmap(num_corr, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, square=True, ax=ax)

The Luxury values from before influence Transported very heavily.

# create combined features

In [None]:
luxury_cols = ['RoomService', 'Spa', 'VRDeck']
regular_cols = ['FoodCourt', 'ShoppingMall']

# Row-wise sums; DataFrame.sum skips NaN, so a missing amount counts as 0 here.
for frame in df:
    frame['Luxury'] = frame[luxury_cols].sum(axis=1)
    frame['Regular'] = frame[regular_cols].sum(axis=1)
    frame['TotalSpent'] = frame[['Luxury', 'Regular']].sum(axis=1)

train_df_num = train_df[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Regular', 'Luxury', 'TotalSpent', 'Transported']]

spend_corr = train_df_num.corr()
plt.subplots(figsize=(10, 10))
mask = np.triu(np.ones_like(spend_corr))
sns.heatmap(spend_corr, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, square=True)

Luxury looks promising

plan: 

Normalize all the features

# create AgeGroup column

before creation, fill nan values with mean. follows normal distribution

In [None]:
# Impute Age with the TRAINING mean for both frames, so the test set is filled
# with the same statistic the model sees during training (the spending columns
# below are likewise filled from train_df).
train_age_mean = train_df['Age'].mean()

for X in df:
    X['Age'] = X['Age'].fillna(train_age_mean)
    # Five equal-width age bins, used only for the exploratory table below.
    X['AgeGroup'] = pd.cut(X['Age'], 5)

# Transported rate per age bin; `observed` is passed explicitly because AgeGroup is categorical.
train_df[['AgeGroup', 'Transported']].groupby(['AgeGroup'], as_index=False, observed=False).mean()

Convert AgeGroup into a Categorical Variable

In [None]:
# Right-closed age bands: (-inf, 15], (15, 31], (31, 47], (47, 63], (63, inf) -> 0..4.
age_band_edges = [-np.inf, 15, 31, 47, 63, np.inf]

for frame in df:
    # labels=False yields the band index; cast to float64 to keep the column's float dtype.
    frame['AgeGroupNum'] = pd.cut(frame['Age'], bins=age_band_edges, labels=False).astype('float64')

# Transform Numerical Data

Use QQplot

first fill nan values

In [None]:
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Training-set medians are used to fill both train and test.
train_medians = train_df[spend_cols].median()

for frame in df:
    frame[spend_cols] = frame[spend_cols].fillna(train_medians)

    # Recompute the aggregate columns now that no spending value is missing.
    frame['Luxury'] = frame['RoomService'] + frame['Spa'] + frame['VRDeck']
    frame['Regular'] = frame['FoodCourt'] + frame['ShoppingMall']
    frame['TotalSpent'] = frame['Luxury'] + frame['Regular']


train_df.isnull().sum()

In [None]:
import scipy.stats as stats

def QQplot(X, col):
    """Show a histogram (with KDE) and a normal Q-Q plot of ``X[col]`` side by side.

    X: DataFrame holding the column.
    col: name of the column to plot; NaN values are dropped for the Q-Q plot.
    """
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.histplot(x=X[col], kde=True, ax=hist_ax)
    stats.probplot(X[col].dropna(), dist="norm", plot=qq_ax)

    plt.tight_layout()
    plt.show()
    
# Spending columns (plus target) used for the transformation experiments below.
service_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Luxury', 'TotalSpent', 'Transported']
services = train_df[service_cols]

In [None]:
# Baseline: distribution of RoomService before any transformation.
QQplot(services, 'RoomService')

In [None]:
# Work on an explicit copy: adding columns to a slice of `services` is
# chained assignment (SettingWithCopyWarning, which is silenced globally above).
RoomService_try = services[['RoomService', 'Transported']].copy()

# Candidate transformations to bring RoomService closer to a normal distribution.
RoomService_try["RoomService_sqrt"] = RoomService_try['RoomService']**(1./2)
RoomService_try["RoomService_cbrt"] = RoomService_try['RoomService']**(1./3)
RoomService_try["RoomService_log(x+1)"] = np.log(RoomService_try['RoomService']+1)

for transformed_col in ('RoomService_sqrt', 'RoomService_cbrt', 'RoomService_log(x+1)'):
    QQplot(RoomService_try, transformed_col)

In [None]:
# Compare how each RoomService transformation correlates with the target.
room_corr = RoomService_try.corr()
plt.subplots(figsize=(8, 6))
mask = np.triu(np.ones_like(room_corr))
sns.heatmap(room_corr, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, center=0, square=True)

The log transform works best. Apply it to all numerical data

In [None]:
# Preview the log(x + 1) transform on a copy so train_df itself is untouched.
train_df_copy = train_df.copy()

log_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Luxury', 'TotalSpent']
train_df_copy[log_cols] = np.log(train_df_copy[log_cols] + 1)

df_num = train_df_copy[['Age'] + log_cols + ['Transported']]

log_corr = df_num.corr()
plt.subplots(figsize=(15, 10))
mask = np.triu(np.ones_like(log_corr))
sns.heatmap(log_corr, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, center=0, square=True)

Should remove multicollinearity from data. Keep Luxury column

Try replacing TotalSpent with boolean value Spent

# Create Spent column

In [None]:
# Spent = 1 when the passenger spent anything at all, else 0.
for frame in df:
    frame['Spent'] = (frame['TotalSpent'] > 0).astype('int64')

# Categorical Data

In [None]:
# Rebuild the categorical view, now including the engineered AgeGroupNum and Spent columns.
train_df_cat = train_df[['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'AgeGroupNum', 'Spent']]

In [None]:
def Catplot(df, x, y):
    """Plot a categorical feature against another column in two panels.

    Left: count of each ``x`` category, coloured by ``y``.
    Right: line plot of ``y`` against ``x`` on a fixed 0-1 y-axis.

    df: DataFrame containing both columns.
    x: name of the categorical column.
    y: name of the column to compare against (typically boolean/0-1).
    """
    fig, (count_ax, rate_ax) = plt.subplots(1, 2, figsize=(14, 5))

    sns.countplot(x=df[x].dropna(), hue=df[y], ax=count_ax)

    rate_ax.set_ylim(0, 1)
    sns.lineplot(x=df[x], y=df[y], data=df, ci=None, linewidth=3, marker="o", ax=rate_ax)
    plt.show()


# One two-panel plot per categorical feature against the target.
for feature in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'AgeGroupNum', 'Spent']:
    Catplot(train_df_cat, feature, 'Transported')

# Mixed Values

In [None]:
for frame in df:
    # The part of PassengerId before '_' identifies the travel group.
    frame['Group'] = frame['PassengerId'].str.split('_').str[0]
    # Number of passengers sharing that group id within the same frame.
    frame['GroupSize'] = frame.groupby('Group')['Group'].transform('count')

Catplot(train_df, 'GroupSize', 'Transported')

Many people travelled alone: 

make boolean column inGroup

In [None]:
# inGroup = 0 for solo travellers (GroupSize == 1), 1 otherwise.
for frame in df:
    frame['inGroup'] = (frame['GroupSize'] != 1).astype('int64')

Catplot(train_df, 'inGroup', 'Transported')

small correlation

Look at Cabin location

In [None]:
cabin_fields = ['CabinDeck', 'CabinNum', 'CabinSide']

# Cabin is split on '/' into three parts: deck, number and side.
for frame in df:
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame[cabin_fields] = cabin_parts

for cabin_feature in ('CabinDeck', 'CabinSide'):
    Catplot(train_df, cabin_feature, 'Transported')

Look at Name, deduce relatives

In [None]:
for X in df:
    # Split on the first space only (n=1): the result then has at most two
    # columns, so a name containing extra spaces no longer makes the
    # two-column assignment fail; anything after the first space is the LastName.
    X[['FirstName','LastName']] = X['Name'].str.split(' ', n=1, expand=True)

    # Number of passengers in the same frame sharing the last name.
    X['FamilySize'] = X.groupby('LastName')['LastName'].transform('count')
    


In [None]:
# Distribution of the raw FamilySize counts.
QQplot(train_df, 'FamilySize')

FamilySize follows a normal distribution, generally higher size means lower transport prob

Normalize FamilySize

In [None]:
# Work on an explicit copy: adding columns to a slice of train_df is
# chained assignment (SettingWithCopyWarning, which is silenced globally above).
FamilySize_tr = train_df[['FamilySize', 'Transported']].copy()

# Candidate transformations for FamilySize.
FamilySize_tr["FamilySize_sqrt"] = FamilySize_tr['FamilySize']**(1./2)
FamilySize_tr["FamilySize_cbrt"] = FamilySize_tr['FamilySize']**(1./3)
FamilySize_tr["FamilySize_log"] = np.log(FamilySize_tr['FamilySize'])

for transformed_col in ('FamilySize_sqrt', 'FamilySize_cbrt', 'FamilySize_log'):
    QQplot(FamilySize_tr, transformed_col)

Sqrt Transform best for this case.

Transform data for real now

In [None]:
to_log_transform = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Luxury', 'Regular', 'TotalSpent']

# Apply log(x + 1) in place to both frames.
# NOTE: not idempotent - re-running this cell applies the log a second time.
for dataset in df:
    dataset[to_log_transform] = np.log(dataset[to_log_transform] + 1)

# Associate Categorical Variables

Use Cramer's V correlation

first encode categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encode = LabelEncoder()

categorical_view_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'AgeGroupNum', 'Spent', 'Group', 'inGroup', 'CabinDeck', 'CabinNum', 'CabinSide', 'LastName', 'Transported']
train_df_cat = train_df[categorical_view_cols]

# Integer-encode every column independently; the encoder is re-fit per column.
label = LabelEncoder()
train_df_cat_encoded = pd.DataFrame(
    {col_name: label.fit_transform(train_df_cat[col_name]) for col_name in train_df_cat.columns}
)

train_df_cat_encoded.head()

In [None]:
from scipy.stats.contingency import association       
    
def Cramers_V(var1, var2):
    """Return Cramer's V between two categorical series.

    var1, var2: equal-length array-likes of category labels; they are
    cross-tabulated and the contingency table is passed to scipy's
    ``association`` with ``method='cramer'``.
    """
    contingency = pd.crosstab(index=var1, columns=var2).to_numpy()
    return association(contingency, method='cramer')

# Create the dataFrame matrix with the returned Cramer's V
# Pairwise Cramer's V for every pair of encoded categorical columns.
encoded_cols = train_df_cat_encoded.columns
rows = [
    [Cramers_V(train_df_cat_encoded[row_var], train_df_cat_encoded[col_var]) for col_var in encoded_cols]
    for row_var in encoded_cols
]

CramersV_results = np.array(rows)
CramersV_df = pd.DataFrame(CramersV_results, columns = encoded_cols, index = encoded_cols)

In [None]:
plt.subplots(figsize=(20,15))
# Mask the upper triangle (including the diagonal). The mask is sized from the
# matrix being plotted instead of a throwaway random 13x13 array, so it keeps
# working if the list of categorical columns changes.
mask = np.triu(np.ones(CramersV_df.shape, dtype=bool))
sns.heatmap(CramersV_df, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, center=0, square=True)

Many collinear terms

useful for imputing later

# Impute 

First define a function that imputes a variable based on another

In [None]:
def impute_cat(var1, var2):
    """Fill missing ``var2`` values using the most frequent ``var2`` per ``var1`` value.

    The lookup is built from train and test combined. For every row of the
    global ``train_df`` / ``test_df`` (via the global list ``df``) where
    ``var2`` is null and the row's ``var1`` value appears in the lookup,
    ``var2`` is set in place to the modal ``var2`` for that ``var1`` value.
    Rows whose ``var1`` is missing or unseen are left untouched.

    Side effects: prints null counts before/after, and (re)sets
    ``test_df['Transported']`` to NaN so both frames share the same columns.

    var1: name of the reference (grouping) column.
    var2: name of the column to impute.
    """
    print('Before %s Train:' %var2, train_df[var2].isnull().sum())
    print('Before %s Test:' %var2, test_df[var2].isnull().sum())

    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    test_df['Transported'] = np.nan
    df_full = pd.concat([train_df, test_df])

    # Count table: rows = var1 values, columns = var2 values.
    reference = df_full.groupby([var1, var2])[var2].size().unstack().fillna(0)
    # Most frequent var2 per var1 value, computed once rather than once per imputed row.
    most_common = reference.idxmax(axis=1)

    for X in df:
        fillable = X[var2].isnull() & X[var1].isin(most_common.index)
        X.loc[fillable, var2] = X.loc[fillable, var1].map(most_common)

    print('After %s Train:' %var2, train_df[var2].isnull().sum())
    print('After %s Test:' %var2, test_df[var2].isnull().sum())
    print('\n')

impute cryosleep

In [None]:
# How do group membership and spending relate to CryoSleep?
for feature in ('inGroup', 'Spent'):
    Catplot(train_df, feature, 'CryoSleep')

Spent is strongly associated with CryoSleep and basically nothing else to a big degree, impute CryoSleep with Spent

In [None]:
print('Before Train:', train_df['CryoSleep'].isnull().sum())
print('Before Test:', test_df['CryoSleep'].isnull().sum())

# Missing CryoSleep: passengers who spent nothing are set to True, spenders to False.
for frame in df:
    cryo_missing = frame['CryoSleep'].isnull()
    frame.loc[cryo_missing & (frame['Spent'] == 0), 'CryoSleep'] = True
    frame.loc[cryo_missing & (frame['Spent'] == 1), 'CryoSleep'] = False


print('After Train:', train_df['CryoSleep'].isnull().sum())
print('After Test:', test_df['CryoSleep'].isnull().sum())

impute VIP

In [None]:
# How do spending and CryoSleep relate to VIP status?
for feature in ('Spent', 'CryoSleep'):
    Catplot(train_df, feature, 'VIP')

In [None]:
# Luxury (left) and Regular (right) spending, split by VIP status.
fig, (luxury_ax, regular_ax) = plt.subplots(1, 2, figsize=(20, 5))
sns.violinplot(data=train_df, x="VIP", y='Luxury', ax=luxury_ax)
sns.violinplot(data=train_df, x="VIP", y='Regular', ax=regular_ax)
plt.show()

VIP spends more money on things than Non-VIPs


Since VIPs are only a small minority of passengers, and since VIP has no strong correlation with the other features, just fill all NaN values with False.

In [None]:
print('Before Train:', train_df['VIP'].isnull().sum())
print('Before Test:', test_df['VIP'].isnull().sum())

for dataset in df:
    # Assign the filled column back. `dataset['VIP'].fillna(..., inplace=True)`
    # operates on a temporary selection and does not update the frame under
    # pandas Copy-on-Write.
    dataset['VIP'] = dataset['VIP'].fillna(False)

print('After Train:', train_df['VIP'].isnull().sum())
print('After Test:', test_df['VIP'].isnull().sum())

Impute Cabin Deck/Side

In [None]:
# Count of passengers per (Group, CabinDeck); one row per group, one column per deck.
deck_counts = train_df.groupby(['Group', 'CabinDeck'])['CabinDeck'].size()
G_CD_table = deck_counts.unstack().fillna(0)
G_CD_table.head(10)

Passengers in the same group are in the same deck

In [None]:
# Fill missing deck and side from other members of the same travel group.
for cabin_feature in ('CabinDeck', 'CabinSide'):
    impute_cat('Group', cabin_feature)

Impute Deck and Side based on other highly associated features

In [None]:
# Fallbacks for rows still missing: LastName first, then HomePlanet.
for cabin_feature in ('CabinDeck', 'CabinSide'):
    for reference_feature in ('LastName', 'HomePlanet'):
        impute_cat(reference_feature, cabin_feature)

Impute Homeplanet and Destination

In [None]:
# Count of passengers per (Group, HomePlanet).
planet_counts = train_df.groupby(['Group', 'HomePlanet'])['HomePlanet'].size()
G_CD_table = planet_counts.unstack().fillna(0)
G_CD_table.head(10)

groups have the same homeplanet

In [None]:
# Count of passengers per (Group, Destination).
destination_counts = train_df.groupby(['Group', 'Destination'])['Destination'].size()
G_CD_table = destination_counts.unstack().fillna(0)
G_CD_table.head(10)

groups have the same destination

Impute based off of Group

In [None]:
# Fill Destination, then HomePlanet, from other members of the same group.
for target_feature in ('Destination', 'HomePlanet'):
    impute_cat('Group', target_feature)

HomePlanet:
* 2nd best associated feature is LastName
* 3rd best associated feature is CabinDeck

Destination:
* 2nd best associated feature is LastName
* 3rd best associated feature is CabinNum


In [None]:
# (reference column, column to impute), applied in order.
fallback_imputations = [
    ('LastName', 'HomePlanet'),
    ('CabinDeck', 'HomePlanet'),
    ('LastName', 'Destination'),
    ('CabinNum', 'Destination'),
]

for reference_feature, target_feature in fallback_imputations:
    impute_cat(reference_feature, target_feature)

Impute FamilySize and LastName

* Group is associated with FamilySize

* filling LastName changes FamilySize

In [None]:
# Number of passengers per (Group, LastName) pair.
lastname_counts = train_df.groupby(['Group', 'LastName'])['LastName'].size()
PG_SN = lastname_counts.fillna(0)
PG_SN.head(20)

most groups are of the same family

In [None]:
# Fill missing LastName with the most common LastName in the passenger's group.
impute_cat('Group', 'LastName')

2nd best feature is HomePlanet

In [None]:
# Remaining gaps: use the most common LastName for the passenger's HomePlanet.
impute_cat('HomePlanet', 'LastName')

Update FamilySize. Apply transformations

In [None]:
# NOTE: not idempotent - re-running this cell takes the log of GroupSize again.
for frame in df:
    # Recount families now that LastName has been imputed, then apply the sqrt transform.
    family_counts = frame.groupby('LastName')['LastName'].transform('count')
    frame['FamilySize'] = family_counts**(1./2)
    frame['GroupSize'] = np.log(frame['GroupSize'])

for size_feature in ('FamilySize', 'GroupSize'):
    QQplot(train_df, size_feature)

check null values

In [None]:
# Remaining missing values per column after imputation.
train_df.isnull().sum()

Drop weakly associated bad/collinear values

First edit some data

In [None]:
# Store VIP as object dtype so the dtype-based categorical column selection below picks it up.
for frame in df:
    frame['VIP'] = frame['VIP'].astype('object')

In [None]:
# Remove the placeholder Transported column that impute_cat added to the test frame.
test_df.drop(columns='Transported', inplace=True)

In [None]:
# Independent copies for modelling, so train_df / test_df stay as engineered above.
X_train, X_test = train_df.copy(), test_df.copy()
y_train = train_df['Transported'].copy()

# Both modelling frames, for preprocessing steps applied to each.
df_prep = [X_train, X_test]

# Preprocessing

In [None]:
# Preprocessing Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


# Select numerical columns (float64 only)
num_cols = [cname for cname in train_df.columns if
                train_df[cname].dtype in ['float64']]

# Select categorical columns: object dtype with fewer than 10 distinct values
cat_cols = [cname for cname in train_df.columns if
                    train_df[cname].nunique() < 10 and
                    (train_df[cname].dtype == "object")]

scale = StandardScaler()
label = LabelEncoder()

# Scale num features: fit on train only, then apply the same scaling to test
X_train[num_cols] = scale.fit_transform(X_train[num_cols])
X_test[num_cols] = scale.transform(X_test[num_cols])

# Label encode cat features.
# Fit once per column on the union of train and test values and use that single
# mapping for both frames. Fitting separately on each frame gives different
# integer codes for the same category whenever the two frames do not contain
# exactly the same set of values.
for i in cat_cols:
    label.fit(pd.concat([X_train[i], X_test[i]]))
    X_train[i] = label.transform(X_train[i])
    X_test[i] = label.transform(X_test[i])


check correlations again

In [None]:
cat = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP', 'AgeGroupNum', 'inGroup', 'Spent', 'CabinDeck', 'CabinSide', 'Transported']

# Pairwise Cramer's V between the categorical features of the prepared training frame.
rows = [[Cramers_V(X_train[var1], X_train[var2]) for var2 in cat] for var1 in cat]

CramersV_results = np.array(rows)
CramersV_df = pd.DataFrame(CramersV_results, columns = cat, index = cat)

plt.subplots(figsize=(20,10))
# Mask the upper triangle (including the diagonal). The mask is sized from the
# matrix itself instead of a throwaway random 10x10 array, so it stays correct
# if `cat` changes.
mask = np.triu(np.ones(CramersV_df.shape, dtype=bool))
sns.heatmap(CramersV_df, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, center=0)

In [None]:
num = X_train[['CryoSleep', 'Age', 'AgeGroupNum', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Luxury', 'Regular', 'TotalSpent', 'Spent', 'GroupSize', 'inGroup', 'FamilySize', 'Transported']].astype('float')

# Compute the correlation matrix once and reuse it for both the mask and the plot.
num_corr = num.corr()

plt.subplots(figsize=(20,20))
# Mask the upper triangle (including the diagonal). The mask is sized from the
# matrix itself instead of a throwaway random 17x17 array.
mask = np.triu(np.ones(num_corr.shape, dtype=bool))
sns.heatmap(num_corr, mask=mask, cmap='cool', annot=True, annot_kws={"fontsize":13}, center=0)

In [None]:
# Irrelevant and multicollinear features
collinear_cols = ['RoomService', 'Spa', 'VRDeck', 'TotalSpent', 'Spent', 'FoodCourt', 'ShoppingMall', 'GroupSize', 'VIP', 'FamilySize']
# High-cardinality features
high_cardinality_cols = ['PassengerId', 'Name', 'Cabin', 'Group', 'AgeGroup', 'AgeGroupNum', 'CabinNum', 'FirstName', 'LastName']

# Dropped in place so the X_train / X_test names keep pointing at the reduced frames.
for frame in df_prep:
    frame.drop(columns=collinear_cols, inplace=True)
    frame.drop(columns=high_cardinality_cols, inplace=True)

In [None]:
# Sanity check of the shapes after dropping columns (X_train still contains Transported here).
print(X_train.shape, y_train.shape, X_test.shape)

One-Hot Encode

In [None]:
# One Hot Encode
to_onehot = ['HomePlanet', 'Destination', 'CabinDeck']

X_train = pd.get_dummies(X_train, columns=to_onehot)
X_test = pd.get_dummies(X_test, columns=to_onehot)

# get_dummies is run separately on each frame, so a category present in only
# one of them would give train and test different columns. Align the test
# frame to the training feature columns (same set, same order); a dummy
# column missing from test is added as all zeros.
feature_cols = X_train.columns.drop('Transported', errors='ignore')
X_test = X_test.reindex(columns=feature_cols, fill_value=0)

In [None]:
# Pull the target out of the feature matrix (pop removes the column in place and returns it).
y_train = X_train.pop('Transported')

In [None]:
# Display the final feature matrix used for modelling.
X_train

# Try Multiple Models

Try:
* Logistic Regression
* Support Vector Classifier
* Random Forest
* XGBoost

First define a model-scoring function

In [None]:
def score_model(model):
    """Return the mean 10-fold cross-validated accuracy of ``model``.

    The model is evaluated on the notebook-level ``X_train`` / ``y_train``.
    Higher is better.
    """
    fold_accuracies = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    return fold_accuracies.mean()

In [None]:
from sklearn.model_selection import cross_val_score
import math
import warnings

# Silence library warnings through the supported filter API rather than
# replacing `warnings.warn` itself, which would change the warnings module
# for every piece of code running in this kernel.
warnings.filterwarnings('ignore')

#Common Model Algorithms
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
xgb.set_config(verbosity=0)


# Baseline candidates with default hyper-parameters. The tree-based sklearn
# models get a fixed random_state so the comparison is reproducible across runs.
models = [
    LogisticRegression(max_iter = 2000),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    XGBClassifier(),
    LGBMClassifier(),
]

# One row per model: class name and mean cross-validated accuracy, best first.
models_cols = ['Model', 'Accuracy']
models_table = pd.DataFrame(
    [(model.__class__.__name__, score_model(model)) for model in models],
    columns = models_cols,
)

models_table = models_table.sort_values(by=['Accuracy'], ascending=False)
models_table

now use gridsearch for hyper parameter tuning

* commented out for speed purposes

In [None]:
# model = XGBClassifier(
#                     objective='binary:logistic',
#                     )

# xgb_params = {
#         'n_estimators': [100, 500, 1000],
#         'learning_rate': [0.01,0.05,0.1,0.15],
#         'max_depth': [4, 8, 12],
#         }

# grid_search = GridSearchCV(model,
#                            xgb_params,
#                            cv=10,
#                            scoring="accuracy",
#                            return_train_score=True,
#                            verbose = 1,
#                           )

# grid_search.fit(X_train, y_train)

# print('\n Best hyperparameters for XGB:')
# print(grid_search.best_params_)

# model = LGBMClassifier()

# lgbm_params = {
#         'n_estimators': [100, 500, 1000],
#         'learning_rate': [0.01,0.05,0.1,0.15],
#         'max_depth': [4, 8, 12],
#         }

# grid_search = GridSearchCV(model,
#                            lgbm_params,
#                            cv=10,
#                            scoring="accuracy",
#                            return_train_score=True,
#                            verbose = 1,
#                           )

# grid_search.fit(X_train, y_train)

# print('\n Best hyperparameters for LGBM:')
# print(grid_search.best_params_)

# model = SVC(probability = True)

# svc_params = {
#             'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
#             'kernel': ['linear', 'rbf'],
#             'gamma': ['scale', 'auto']
#              }

# grid_search = GridSearchCV(model,
#                            svc_params,
#                            cv=10,
#                            scoring="accuracy",
#                            return_train_score=True,
#                            verbose = 1,
#                           )

# grid_search.fit(X_train, y_train)

# print('\n Best hyperparameters for SVC:')
# print(grid_search.best_params_)



In [None]:
# Final estimators with fixed hyper-parameters; each value appears in the
# (commented-out) grid-search parameter grids in the cell above.
xgb_opt = XGBClassifier(learning_rate=0.05, max_depth=4, n_estimators=1000)
lgbm_opt = LGBMClassifier(learning_rate=0.01, max_depth=12, n_estimators=500)
svc_opt = SVC(C=1, gamma='scale', kernel='rbf')

combine all the models with a hard (majority) vote.
each model's vote counts equally (no accuracy-based weights are passed to the ensemble)

In [None]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble of the three tuned models.
voters = [
    ('LGBM', lgbm_opt),
    ('SVC', svc_opt),
    ('XGB', xgb_opt),
]
ensemble_model = VotingClassifier(estimators=voters, voting='hard')

score_model(ensemble_model)

# Submit

In [None]:
# Fit on the full training set, then predict the test set as booleans.
pred = ensemble_model.fit(X_train, y_train).predict(X_test).astype('bool')

pred[:20]

In [None]:
# Write predictions into the sample-submission frame and save it.
# NOTE(review): assumes test.csv rows are in the same order as sample_submission.csv - confirm.
submit_df["Transported"] = pred
submit_df.to_csv('submission.csv', index = False)