In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, auc, roc_curve, mean_absolute_error, mean_squared_error
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
!ls ../input/spaceship-titanic/

In [None]:
# Read the training set, the test set and the sample submission, in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
        "../input/spaceship-titanic/sample_submission.csv",
    )
)

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.describe()

In [None]:
# Remove the identifier and passenger-name columns from the training frame
# (in place), then show which columns remain.
train.drop(columns = ['PassengerId', 'Name'], inplace = True)
train.columns

In [None]:
# Pull the passenger ids out of the test frame (they are reused when the
# submission is assembled), drop the name column, and show what is left.
test_id = test.pop('PassengerId')
test.drop(columns = ['Name'], inplace = True)
test.columns

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# For each training column that contains missing values, print the value
# distribution of that column in the training frame and in the test frame.
for column in train.columns:
    if not train[column].isnull().sum().any():
        continue
    print("Training")
    print(train[column].value_counts())
    print("\n")
    print("Testing")
    print(test[column].value_counts())
    print("*********"*5)
    print()

**For most of the columns we will replace missing values with their mode; for Age we will replace them with the mean.**

In [None]:
# Discard the 'Cabin' column from both the training and the test frame.
for frame in (train, test):
    frame.drop(columns = 'Cabin', inplace = True)

In [None]:
# Fill missing values in every feature column: 'Age' with the mean, every
# other column with the most frequent value.
#
# The fill value is always computed from the TRAINING frame and applied to
# both frames, so train and test go through the same transformation. Looping
# over the test columns (the shared feature columns) and checking both frames
# also covers a column that has gaps only in the test set.
for i in test.columns:
    if train[i].isnull().any() or test[i].isnull().any():
        if i == 'Age':
            fill_value = train[i].mean()
        else:
            fill_value = train[i].mode()[0]
        train[i] = train[i].fillna(fill_value)
        test[i] = test[i].fillna(fill_value)
    

In [None]:
print(train.isnull().sum().any(), test.isnull().sum().any())

In [None]:
train.info()

In [None]:
test.info()

**There are many columns which are either string or boolean. So we need to convert them to int or float.**

In [None]:
# Replace every object/bool column of the training frame with integer codes.
# One encoder instance is reused and refitted for each column in turn.
la = LabelEncoder()
train_to_encode = [
    name for name, kind in train.dtypes.items()
    if kind == 'object' or kind == 'bool'
]
for name in train_to_encode:
    train[name] = la.fit_transform(train[name])

In [None]:
# Replace every object/bool column of the test frame with integer codes.
# NOTE(review): the encoder is refitted on the test values here, so the codes
# agree with the training codes only when both frames contain the same set of
# categories in a column -- confirm for this dataset.
la = LabelEncoder()
test_to_encode = [
    name for name, kind in test.dtypes.items()
    if kind == 'object' or kind == 'bool'
]
for name in test_to_encode:
    test[name] = la.fit_transform(test[name])

In [None]:
train.info()

In [None]:
test.info()

**Conversion completed.**

# Visualization

In [None]:
# Class balance of the target: print the counts and draw them as a bar chart.
print(train['Transported'].value_counts())
plt.figure(figsize = (8,5))
sns.countplot(x = train['Transported'])
plt.xlabel("Transported", size = 12)
plt.ylabel("Count", size = 12)
# The title names what is actually plotted (counts of the target column).
plt.title("Transported Distribution", size = 12)

**Target column is balanced**

In [None]:
# Pairwise correlations between all columns of the training frame.
cor = train.corr()
# Correlation of each column with the target, largest value first.
rel = cor.loc[:, 'Transported'].sort_values(ascending = False)
# Annotated heatmap of the full correlation matrix.
plt.figure(figsize = (12,8))
sns.heatmap(cor, annot = True)

In [None]:
rel

In [None]:
def related(rel):
    """Return the index labels of ``rel`` whose value is strictly positive.

    Parameters
    ----------
    rel : pandas.Series
        Series of numeric values (e.g. correlations) keyed by label.

    Returns
    -------
    list
        The labels with a value greater than zero, in the order in which
        they appear in ``rel``. Empty when no value is positive.
    """
    # Select by position with a boolean mask. ``rel[i]`` looks the integer up
    # as a *label* first, which returns the wrong entries for integer-labelled
    # Series and relies on a deprecated fallback for string-labelled ones.
    positive = (rel > 0).to_numpy()
    return rel.index[positive].tolist()

In [None]:
# Target vector: the 'Transported' column of the training frame.
y = train['Transported']
y.head()

In [None]:
# Feature matrix: every training column except the target. `train` itself is
# left untouched.
x = train.drop(columns = 'Transported')
x.head()

In [None]:
# Standardise every feature column, then rebuild a DataFrame that carries
# the original column names. The fitted scaler `std` is reused later on the
# test frame.
std = StandardScaler()
col = x.columns
std_x = std.fit(x).transform(x)
x = pd.DataFrame(std_x, columns = col)
x.head()

In [None]:
x.describe()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    random_state = 42,
    test_size = 0.25,
)

# Random Forest + Randomized Search

In [None]:
# Candidate values the randomized search may sample for the random forest.
params = {
    "n_estimators": list(range(10, 800, 10)),
    "max_depth": list(range(2, 16, 2)),
    "min_samples_leaf": list(range(15, 30)),
    "min_samples_split": list(range(8, 20)),
}
# Untuned forest used as the base estimator of the search.
rcla = RandomForestClassifier()
rcla

In [None]:
# Randomized hyper-parameter search for the random forest: 5-fold CV scored
# by accuracy, using all available cores.
# random_state pins which candidates are sampled so a re-run explores the
# same parameter combinations (same seed as the train/test split).
grid = RandomizedSearchCV(
    estimator = rcla,
    param_distributions = params,
    cv = 5,
    scoring = 'accuracy',
    n_jobs = -1,
    random_state = 42,
)
grid.fit(xtrain, ytrain)

In [None]:
best_param = grid.best_params_
best_param

In [None]:
grid.best_score_

In [None]:
# Refit a forest on the training split with the best parameters found by the
# search. The fixed seed (same value as the train/test split) makes the
# fitted forest, and every score derived from it, repeatable across runs.
rcla = RandomForestClassifier(**best_param, random_state = 42)
rcla.fit(xtrain, ytrain)

In [None]:
# Training accuracy of the random forest, in percent: positive-class
# probabilities are rounded to 0/1 and compared with the labels.
predicted = rcla.predict_proba(xtrain)[:, 1]
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy using Random Forest on training data is {train_score} %")

In [None]:
# Held-out accuracy of the random forest, in percent: positive-class
# probabilities are rounded to 0/1 and compared with the labels.
predicted = rcla.predict_proba(xtest)[:, 1]
test_score = 100 * accuracy_score(predicted.round(), ytest)
print(f"Accuracy using Random Forest testing data is {test_score} %")

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Results table, one list entry per model; it starts with the random forest.
d = {
    'Name': ['Random Forest'],
    'Training Score': [train_score],
    'Testing Score': [test_score],
    'Mean Squared_Error': [mae],
}
d

# Decision Tree

In [None]:
# Fit a decision tree with the library's default settings on the training split.
dcla = DecisionTreeClassifier()
dcla.fit(xtrain, ytrain)

In [None]:
# Training accuracy of the decision tree, in percent.
predicted = dcla.predict_proba(xtrain)[:, 1]
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy using Decision Tree on training data is {train_score} %")

In [None]:
# Held-out accuracy of the decision tree, in percent.
predicted = dcla.predict_proba(xtest)[:, 1]
test_score = 100 * accuracy_score(predicted.round(), ytest)
print(f"Accuracy using Decision Tree testing data is {test_score} %")

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Append the decision-tree row to the results table.
for column, value in (
    ('Name', 'Decison Tree'),
    ('Training Score', train_score),
    ('Testing Score', test_score),
    ('Mean Squared_Error', mae),
):
    d[column].append(value)

# XGBoost + Randomized Search

In [None]:
# Search space for the XGBoost randomized search.
param_grid = {'n_estimators': [10, 25, 50, 75, 100],
              'learning_rate': [0.2, 0.15, 0.1, 0.05],
              # The target has two classes, so use the binary log-loss
              # ('mlogloss' is the multi-class variant).
              'eval_metric': ['logloss'],
              'booster' : ['gbtree', 'gblinear'],
              # Logging and parameter-validation switches do not change the
              # fitted model, so they are pinned rather than sampled.
              'verbosity': [0],
              'validate_parameters': [True]
             }
# Base estimator handed to the search; requests the GPU histogram tree method.
xcla = XGBClassifier(tree_method='gpu_hist')
xcla

In [None]:
# Randomized hyper-parameter search for XGBoost: 5-fold CV scored by accuracy.
# The candidates must come from `param_grid` (the XGBoost search space);
# `params` is the random-forest search space and contains options such as
# min_samples_leaf / min_samples_split that XGBoost does not have.
grid = RandomizedSearchCV(
    estimator = xcla,
    param_distributions = param_grid,
    cv = 5,
    scoring = 'accuracy',
    n_jobs = -1,
)
grid.fit(xtrain, ytrain)

In [None]:
print("Randomized Search best Score is {}".format(grid.best_score_))

In [None]:
best_parameters = grid.best_params_
best_parameters

In [None]:
# Refit XGBoost on the training split using the best parameters from the search.
# NOTE(review): unlike the estimator that was passed to the search, this one
# is created without tree_method='gpu_hist' unless the searched parameters
# happen to include it -- confirm this is intended.
xcla = XGBClassifier(**best_parameters)
xcla.fit(xtrain, ytrain)

In [None]:
# Training accuracy of the tuned XGBoost model, in percent.
predicted = xcla.predict_proba(xtrain)[:, 1]
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy using XGBoost + Randomized Search on training is {train_score} %")

In [None]:
# Held-out accuracy of the tuned XGBoost model, in percent.
predicted = xcla.predict_proba(xtest)[:, 1]
test_score = 100 * accuracy_score(predicted.round(), ytest)
print(f"Accuracy using XGBoost + Randomized Search on testing is {test_score} %")

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Append the tuned-XGBoost row to the results table.
for column, value in (
    ('Name', 'XGBoost with Randomized Search'),
    ('Training Score', train_score),
    ('Testing Score', test_score),
    ('Mean Squared_Error', mae),
):
    d[column].append(value)

# XGBoost

In [None]:
# Fit XGBoost with the library's default settings (no search) on the training split.
w_xcla = XGBClassifier()
w_xcla.fit(xtrain, ytrain)

In [None]:
# Training accuracy of the default XGBoost model, in percent.
predicted = w_xcla.predict_proba(xtrain)[:, 1]
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy using XGBoost on training is {train_score} %")

In [None]:
# Held-out accuracy of the default XGBoost model, in percent.
predicted = w_xcla.predict_proba(xtest)[:, 1]
test_score = 100 * accuracy_score(predicted.round(), ytest)
print(f"Accuracy using XGBoost on testing is {test_score} %")

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Append the default-XGBoost row to the results table.
for column, value in (
    ('Name', 'XGBoost'),
    ('Training Score', train_score),
    ('Testing Score', test_score),
    ('Mean Squared_Error', mae),
):
    d[column].append(value)

# LightGBM

In [None]:
# Reinstall LightGBM, passing `--gpu` through pip's `--install-option`.
# NOTE(review): `lightgbm` has already been imported in the imports cell, so
# presumably the kernel must be restarted for this install to take effect --
# confirm. Also confirm that the pip version in use still accepts
# `--install-option`.
!pip install lightgbm --install-option=--gpu

In [None]:
# Local import keeps this cell self-contained; the package itself is already
# a dependency of the notebook (LGBMClassifier is imported from it).
import lightgbm as lgb

# Fit LightGBM on the GPU, tracking AUC on the held-out split.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose` keyword arguments of `fit` are not
# accepted by LightGBM 4.x.
# NOTE(review): the held-out split doubles as the early-stopping set here, so
# the test scores reported for this model are slightly optimistic.
light = LGBMClassifier(device_type = 'gpu')
light.fit(
    xtrain,
    ytrain,
    eval_set = [(xtest, ytest)],
    eval_metric = 'auc',
    callbacks = [
        lgb.early_stopping(stopping_rounds = 150),  # stop after 150 rounds without improvement
        lgb.log_evaluation(period = 10),            # print the metric every 10 rounds
    ],
)

In [None]:
# Training accuracy of LightGBM, in percent, from its predicted class labels.
predicted = light.predict(xtrain)
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy using LightGBM on training data is {train_score} %")

In [None]:
# Held-out accuracy of LightGBM, in percent.
# Positive-class probabilities are used (as for the random forest, decision
# tree and XGBoost models) instead of hard labels, so the squared error
# computed from `predicted` afterwards is on the same scale for every model.
predicted = light.predict_proba(xtest)[:, 1]
test_score = accuracy_score(predicted.round(), ytest)*100
print("Accuracy using LightGBM testing data is {} %".format(test_score))

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Append the LightGBM row to the results table.
for column, value in (
    ('Name', 'LightGBM'),
    ('Training Score', train_score),
    ('Testing Score', test_score),
    ('Mean Squared_Error', mae),
):
    d[column].append(value)

# CatBoost

In [None]:
# Fit CatBoost with the library's default settings on the training split.
cat = CatBoostClassifier()
cat.fit(xtrain, ytrain)

In [None]:
# Training accuracy of CatBoost, in percent, from its predicted class labels.
predicted = cat.predict(xtrain)
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy using CatBoost on training data is {train_score} %")

In [None]:
# Held-out accuracy of CatBoost, in percent.
# Positive-class probabilities are used (as for the random forest, decision
# tree and XGBoost models) instead of hard labels, so the squared error
# computed from `predicted` afterwards is on the same scale for every model.
predicted = cat.predict_proba(xtest)[:, 1]
test_score = accuracy_score(predicted.round(), ytest)*100
print("Accuracy using CatBoost testing data is {} %".format(test_score))

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Append the CatBoost row to the results table.
for column, value in (
    ('Name', 'CatBoost'),
    ('Training Score', train_score),
    ('Testing Score', test_score),
    ('Mean Squared_Error', mae),
):
    d[column].append(value)

# Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes classifier (default settings) on the training split.
gb = GaussianNB()
gb.fit(xtrain, ytrain)

In [None]:
# Training accuracy of Gaussian naive Bayes, in percent, from its predicted
# class labels.
predicted = gb.predict(xtrain)
train_score = 100 * accuracy_score(predicted.round(), ytrain)
print(f"Accuracy Using GaussianNB is {train_score} %")

In [None]:
# Held-out accuracy of Gaussian naive Bayes, in percent.
# Positive-class probabilities are used (as for the random forest, decision
# tree and XGBoost models) instead of hard labels, so the squared error
# computed from `predicted` afterwards is on the same scale for every model.
predicted = gb.predict_proba(xtest)[:, 1]
test_score = accuracy_score(predicted.round(), ytest)*100
print("Accuracy Using GaussianNB is {} %".format(test_score))

In [None]:
# NOTE: despite its name, `mae` holds the mean *squared* error between the
# latest held-out `predicted` values and `ytest` (not a mean absolute error).
mae = mean_squared_error(predicted, ytest)
mae

In [None]:
# Append the Gaussian naive Bayes row to the results table.
for column, value in (
    ('Name', 'GaussianNB'),
    ('Training Score', train_score),
    ('Testing Score', test_score),
    ('Mean Squared_Error', mae),
):
    d[column].append(value)

In [None]:
acu_data = pd.DataFrame(data = d)
acu_data

# Prediction on Testing Data

In [None]:
test.head()

In [None]:
# Scale the test features with the scaler fitted on the training features,
# then rebuild a DataFrame that carries the original column names.
col = test.columns
x = pd.DataFrame(std.transform(test.loc[:, :]), columns = col)
x.head()

In [None]:
predicted = rcla.predict(x)
predicted

In [None]:
sample.head()

In [None]:
# Turn the 0/1 class predictions back into booleans for the 'Transported'
# column of the submission.
# `la` must not be used for this: it was last fitted on a feature column of
# the test frame, not on the target, so decoding through it is only correct
# by coincidence.
predicted = predicted.astype(bool)
predicted

In [None]:
# Assemble the submission: one row per test passenger with its prediction.
submit = pd.DataFrame(
    {
        'PassengerId': test_id,
        'Transported': predicted,
    }
)
submit

In [None]:
submit.to_csv("submission4.csv", index = False)
submit.head()