In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Load the training split of the Spaceship Titanic competition and display it.
train_csv_path = '/kaggle/input/spaceship-titanic/train.csv'
data = pd.read_csv(train_csv_path)
data

HomePlanet: The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep: Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage.

Cabin: The cabin number where the passenger is staying.

Destination: The planet the passenger will be debarking to.

Age: The age of the passenger.

VIP: Whether the passenger has paid for special VIP service during the voyage.

RoomService: Amount the passenger has billed for room service.

FoodCourt: Amount the passenger has billed at the food court.

ShoppingMall: Amount the passenger has billed at the shopping mall.

Spa: Amount the passenger has billed at the spa.

VRDeck: Amount the passenger has billed at the VR deck.

Name: The name of the passenger.

Transported: Whether the passenger was transported to another dimension.

In [None]:
# Summarise column dtypes, non-null counts and memory usage of the training frame.
data.info()

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = data.isna().any()
data.columns[has_missing]

In [None]:
# Missing-value count for each column that has any gaps.
missing_counts = data.isna().sum()
missing_counts[missing_counts > 0]

In [None]:
# Percentage of rows with a missing value, shown for the affected columns only.
missing_pct = data.isna().sum() * 100 / len(data)
missing_pct[data.isna().any()]

In [None]:
# Share of passengers labelled Transported among all non-null labels.
transported_mask = data['Transported'] == True
transported_mask.sum() / data['Transported'].count()

In [None]:
# Distinct-value counts for the identifier column and the name column.
for column in ('PassengerId', 'Name'):
    print(data[column].nunique())

In [None]:
# Replace missing names with a placeholder so the surname split that follows never meets NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on `data.Name`
# is a chained in-place operation, which leaves `data` unchanged under pandas Copy-on-Write.
data['Name'] = data['Name'].fillna('No_name')

In [None]:
# Surname = last whitespace-separated token of each passenger name.
surname = [full_name.split()[-1] for full_name in data['Name']]
data['Surname'] = surname

In [None]:
# Total bill across all amenities; skipna=False keeps the total NaN when any component is missing.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data['Total Spending'] = data[spending_columns].sum(axis=1, skipna=False)

In [None]:
# The identifier and the raw name are not used as model features.
data = data.drop(columns=['PassengerId', 'Name'])

In [None]:
# Preview the first rows after the feature engineering and column drops above.
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows for validation.
y = data['Transported']
X = data.drop(columns='Transported')

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Columns stored with the generic object dtype (candidate categorical features).
X_train.select_dtypes(include='object').columns

In [None]:
# Columns stored as 64-bit floats or integers (candidate numeric features).
numeric_dtypes = ['float64', 'int64']
X_train.select_dtypes(include=numeric_dtypes).columns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Feature groups. The ColumnTransformer below emits its output columns in this order:
# all numeric attributes first, then all categorical attributes.
num_attribs = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total Spending']
cat_attribs = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Surname']

# Numeric features: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: most-frequent imputation, integer codes (-1 for categories
# not seen during fit), then standardisation of the codes.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ("scaler", StandardScaler()),
])

preprocess_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit on the training rows only; the validation rows are transformed with the fitted statistics.
X_train_prepared = preprocess_pipeline.fit_transform(X_train[num_attribs + cat_attribs])
# NOTE: despite its name, y_train_prepared holds the transformed *validation features*;
# the name is kept because the following cell reads it.
y_train_prepared = preprocess_pipeline.transform(X_valid[num_attribs + cat_attribs])

In [None]:
# Wrap the transformed arrays in DataFrames that keep the original row index, so that
# index-aligned pandas operations against y_train / y_valid pair up the right rows
# (a default RangeIndex does not match the shuffled index left by train_test_split).
# NOTE(review): the arrays are ordered num_attribs + cat_attribs, so the labels taken from
# X_train.columns do not name the data underneath them. They are kept because the test-set
# frame later in the notebook is labelled consistently with this one, and the fitted
# estimators require identical labels at predict time.
X_train_prepared = pd.DataFrame(X_train_prepared, columns=X_train.columns, index=X_train.index)
X_valid_prepared = pd.DataFrame(y_train_prepared, columns=X_valid.columns, index=X_valid.index)

In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

def make_mi_scores(X, y, random_state=0):
    """Rank the columns of X by their mutual information with a class label.

    The target of this notebook is a binary class (Transported), so the
    classification estimator ``mutual_info_classif`` is used rather than the
    regression variant, which treats the target as a continuous quantity.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels become the index of the result.
    y : array-like
        Class label for each row of X.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_classif`` so repeated calls return the
        same scores.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", sorted from most to least informative.
    """
    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# The prepared matrix is ordered num_attribs + cat_attribs; label a positional copy
# accordingly so every score is reported against the feature it was computed from.
X_train_labeled = pd.DataFrame(np.asarray(X_train_prepared), columns=num_attribs + cat_attribs)

# corrwith aligns on the index, so pair the target with the features by position:
# give the (numeric) target the same index as the feature copy.
y_train_positional = pd.Series(np.asarray(y_train, dtype=float), index=X_train_labeled.index)

mi = make_mi_scores(X_train_labeled, y_train).to_frame()

corr = X_train_labeled.corrwith(y_train_positional).to_frame(name='Correlation')

# One row per feature: mutual information next to the linear correlation with the target.
relation = mi.join(corr)
relation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline decision tree. A fixed random_state makes the fitted tree, and therefore the
# reported accuracy, reproducible across reruns.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train_prepared, y_train)
y_pred_dtc = dtc.predict(X_valid_prepared)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unaffected.
acc_dtc = round(accuracy_score(y_valid, y_pred_dtc) * 100, 2)
acc_dtc

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. The bootstrap samples and feature subsets are random, so the
# seed is fixed to make the validation accuracy reproducible across reruns.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_prepared, y_train)
y_pred_rfc = rfc.predict(X_valid_prepared)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unaffected.
acc_rfc = round(accuracy_score(y_valid, y_pred_rfc) * 100, 2)
acc_rfc

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with default hyper-parameters, scored on the validation split.
lr = LogisticRegression().fit(X_train_prepared, y_train)
y_pred_lr = lr.predict(X_valid_prepared)
acc_lr = round(100 * accuracy_score(y_valid, y_pred_lr), 2)
acc_lr

In [None]:
from sklearn.svm import LinearSVC

# Linear support-vector baseline. The solver uses a random number generator while fitting,
# so the seed is fixed to keep the result reproducible across reruns.
lsvc = LinearSVC(random_state=0)
lsvc.fit(X_train_prepared, y_train)
y_pred_lsvc = lsvc.predict(X_valid_prepared)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unaffected.
acc_lsvc = round(accuracy_score(y_valid, y_pred_lsvc) * 100, 2)
acc_lsvc

In [None]:
from sklearn.svm import SVC

# Kernel SVM baseline with default hyper-parameters, scored on the validation split.
svc = SVC().fit(X_train_prepared, y_train)
y_pred_svc = svc.predict(X_valid_prepared)
acc_svc = round(100 * accuracy_score(y_valid, y_pred_svc), 2)
acc_svc

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with default hyper-parameters, scored on the validation split.
knn = KNeighborsClassifier().fit(X_train_prepared, y_train)
y_pred_knn = knn.predict(X_valid_prepared)
acc_knn = round(100 * accuracy_score(y_valid, y_pred_knn), 2)
acc_knn

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained by stochastic gradient descent. The training data is shuffled
# between epochs, so the seed is fixed to keep the result reproducible across reruns.
sgdc = SGDClassifier(random_state=0)
sgdc.fit(X_train_prepared, y_train)
y_pred_sgdc = sgdc.predict(X_valid_prepared)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unaffected.
acc_sgdc = round(accuracy_score(y_valid, y_pred_sgdc) * 100, 2)
acc_sgdc

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees baseline. The seed is fixed so the individual trees, and the
# tuned variants derived from this estimator in the grid search, are reproducible.
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train_prepared, y_train)
y_pred_gbc = gbc.predict(X_valid_prepared)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unaffected.
acc_gbc = round(accuracy_score(y_valid, y_pred_gbc) * 100, 2)
acc_gbc

In [None]:
from sklearn.ensemble import VotingClassifier

# Ensemble of the baseline models fitted above, combined with VotingClassifier's
# default voting rule. The decision tree is not part of the ensemble.
baseline_estimators = [
    ('rfc', rfc),
    ('lr', lr),
    ('lsvc', lsvc),
    ('svc', svc),
    ('knn', knn),
    ('sgdc', sgdc),
    ('gbc', gbc),
]
voting_clf = VotingClassifier(estimators=baseline_estimators)

voting_clf.fit(X_train_prepared, y_train)
y_pred_voting_clf = voting_clf.predict(X_valid_prepared)
acc_voting_clf = round(100 * accuracy_score(y_valid, y_pred_voting_clf), 2)
acc_voting_clf

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score

# Precision, recall and F1 of the baseline ensemble on the validation split.
for message, metric in (
    ('Precision score is: ', precision_score),
    ('Recall score is: ', recall_score),
    ('F1 score is: ', f1_score),
):
    print(message, round(metric(y_valid, y_pred_voting_clf), 2))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

# Confusion matrix of the baseline ensemble: rows are actual classes, columns are predictions.
class_labels = ["False", "True"]
confusion_matrix_voting_clf = confusion_matrix(y_valid, y_pred_voting_clf)
cm_dataframe_voting_clf = pd.DataFrame(
    confusion_matrix_voting_clf,
    index=class_labels,
    columns=class_labels,
)

heatmap_ax = sns.heatmap(cm_dataframe_voting_clf, annot=True, annot_kws={"size": 18}, fmt="d")
heatmap_ax.set_title("Voting Classifier")
heatmap_ax.set_ylabel('Actual Classes')
heatmap_ax.set_xlabel('Predicted Classes')
plt.show()

In [None]:
# Per-class precision / recall / F1 / support of the baseline ensemble, as a table.
report_as_dict = classification_report(
    y_valid,
    y_pred_voting_clf,
    target_names=["False", "True"],
    output_dict=True,
)
report_voting_clf = pd.DataFrame(report_as_dict)
report_voting_clf

In [None]:
# ROC curve of the baseline ensemble, computed from its predicted class labels.
(
    false_positive_rate_voting_clf,
    true_positive_rate_voting_clf,
    thresholds_voting_clf,
) = roc_curve(y_valid, y_pred_voting_clf)

auc_voting_clf = auc(false_positive_rate_voting_clf, true_positive_rate_voting_clf)

roc_fig, roc_ax = plt.subplots(figsize=(8, 8))
roc_ax.plot(false_positive_rate_voting_clf, true_positive_rate_voting_clf, label="AUC = %0.2f" % auc_voting_clf)
# Diagonal reference line of a classifier that guesses at random.
roc_ax.plot([0, 1], [0, 1], linestyle='--')
roc_ax.legend(loc='lower right')
roc_ax.set_title("Receiver Operating Characteristic(ROC) for Voting Classifier")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_xlabel("False Positive Rate")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the baseline ensemble on the training split.
scores = cross_val_score(
    estimator=voting_clf,
    X=X_train_prepared,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print('Accuracy scores are:\n', scores)
print('Mean accuracy score is: ', scores.mean())

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest. 'auto' is omitted from max_features: for
# classifiers it was only an alias of 'sqrt', and newer scikit-learn releases reject it.
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6, 7, 8],
}

grid_search = GridSearchCV(rfc, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X_train_prepared, y_train)
# best_estimator_ is already refit on the whole training split (refit=True is the
# GridSearchCV default), so no further fit is needed.
rfc_gs = grid_search.best_estimator_

# Score the tuned forest rfc_gs, not the baseline rfc, so acc_rfc_gs reflects the search.
y_pred_rfc_gs = rfc_gs.predict(X_valid_prepared)
acc_rfc_gs = round(accuracy_score(y_valid, y_pred_rfc_gs) * 100, 2)
acc_rfc_gs

In [None]:
# Hyper-parameter grid for logistic regression. The default lbfgs solver cannot fit an
# l1 penalty, so those candidates would fail; liblinear supports both l1 and l2.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100, 1000],
    'solver': ['liblinear'],
}

grid_search = GridSearchCV(lr, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X_train_prepared, y_train)
# best_estimator_ is already refit on the whole training split (refit=True is the
# GridSearchCV default), so no further fit is needed.
lr_gs = grid_search.best_estimator_

# Score the tuned model lr_gs, not the baseline lr, so acc_lr_gs reflects the search.
y_pred_lr_gs = lr_gs.predict(X_valid_prepared)
acc_lr_gs = round(accuracy_score(y_valid, y_pred_lr_gs) * 100, 2)
acc_lr_gs

In [None]:
# Hyper-parameter grid for the RBF-kernel SVM: regularisation strength C and kernel width gamma.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

grid_search = GridSearchCV(svc, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X_train_prepared, y_train)
# best_estimator_ is already refit on the whole training split (refit=True is the
# GridSearchCV default), so no further fit is needed.
svc_gs = grid_search.best_estimator_

# Score the tuned model svc_gs, not the baseline svc, so acc_svc_gs reflects the search.
y_pred_svc_gs = svc_gs.predict(X_valid_prepared)
acc_svc_gs = round(accuracy_score(y_valid, y_pred_svc_gs) * 100, 2)
acc_svc_gs

In [None]:
# Search the neighbourhood size k = 1..20 with 5-fold cross-validated accuracy.
param_grid = {'n_neighbors': list(range(1, 21))}

grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)
knn_gs = grid_search.fit(X_train_prepared, y_train).best_estimator_

knn_gs.fit(X_train_prepared, y_train)
y_pred_knn_gs = knn_gs.predict(X_valid_prepared)
acc_knn_gs = round(100 * accuracy_score(y_valid, y_pred_knn_gs), 2)
acc_knn_gs

In [None]:
# Search the regularisation penalty of the SGD classifier with 5-fold cross-validated accuracy.
param_grid = {'penalty': ['l1', 'l2', 'elasticnet']}

grid_search = GridSearchCV(
    estimator=sgdc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)
sgdc_gs = grid_search.fit(X_train_prepared, y_train).best_estimator_

sgdc_gs.fit(X_train_prepared, y_train)
y_pred_sgdc_gs = sgdc_gs.predict(X_valid_prepared)
acc_sgdc_gs = round(100 * accuracy_score(y_valid, y_pred_sgdc_gs), 2)
acc_sgdc_gs

In [None]:
# Search learning rate, tree depth, per-split feature subset and ensemble size of the
# gradient-boosting model with 5-fold cross-validated accuracy.
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.5],
    "max_depth": [3, 4, 5, 7],
    "max_features": ["log2", "sqrt"],
    "n_estimators": [10, 20, 50, 100, 500],
}

grid_search = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)
gbc_gs = grid_search.fit(X_train_prepared, y_train).best_estimator_

gbc_gs.fit(X_train_prepared, y_train)
y_pred_gbc_gs = gbc_gs.predict(X_valid_prepared)
acc_gbc_gs = round(100 * accuracy_score(y_valid, y_pred_gbc_gs), 2)
acc_gbc_gs

In [None]:
from xgboost import XGBClassifier

# Boosted trees with up to 1000 rounds; training stops once the validation metric has not
# improved for 10 consecutive rounds.
# NOTE(review): early stopping is requested at fit time on purpose here, because the voting
# ensemble later refits this estimator without an eval_set. Newer xgboost releases expect
# early_stopping_rounds on the constructor -- confirm against the installed version.
xgbc = XGBClassifier(learning_rate=0.05, n_estimators=1000)
fit_options = dict(
    early_stopping_rounds=10,
    eval_set=[(X_valid_prepared, y_valid)],
    verbose=False,
)
xgbc.fit(X_train_prepared, y_train, **fit_options)
y_pred_xgbc = xgbc.predict(X_valid_prepared)
acc_xgbc = round(100 * accuracy_score(y_valid, y_pred_xgbc), 2)
acc_xgbc

In [None]:
# Ensemble of the tuned models plus XGBoost, combined with VotingClassifier's default voting rule.
tuned_estimators = [
    ('rfc_gs', rfc_gs),
    ('lr_gs', lr_gs),
    ('svc_gs', svc_gs),
    ('knn_gs', knn_gs),
    ('sgdc_gs', sgdc_gs),
    ('gbc_gs', gbc_gs),
    ('xgbc', xgbc),
]
voting_clf_gs = VotingClassifier(estimators=tuned_estimators)

voting_clf_gs.fit(X_train_prepared, y_train)
y_pred_voting_clf_gs = voting_clf_gs.predict(X_valid_prepared)
acc_voting_clf_gs = round(100 * accuracy_score(y_valid, y_pred_voting_clf_gs), 2)
acc_voting_clf_gs

In [None]:
# 5-fold cross-validated accuracy of the tuned ensemble on the training split.
scores_final = cross_val_score(
    estimator=voting_clf_gs,
    X=X_train_prepared,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print('Accuracy scores are:\n', scores_final)
print('Mean accuracy score is: ', scores_final.mean())

In [None]:
# Precision, recall and F1 of the tuned ensemble on the validation split.
for message, metric in (
    ('Precision score is: ', precision_score),
    ('Recall score is: ', recall_score),
    ('F1 score is: ', f1_score),
):
    print(message, round(metric(y_valid, y_pred_voting_clf_gs), 2))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

# Confusion matrix of the tuned ensemble: rows are actual classes, columns are predictions.
class_labels = ["False", "True"]
confusion_matrix_voting_clf_gs = confusion_matrix(y_valid, y_pred_voting_clf_gs)
cm_dataframe_voting_clf_gs = pd.DataFrame(
    confusion_matrix_voting_clf_gs,
    index=class_labels,
    columns=class_labels,
)

heatmap_ax = sns.heatmap(cm_dataframe_voting_clf_gs, annot=True, annot_kws={"size": 18}, fmt="d")
heatmap_ax.set_title("Voting Classifier")
heatmap_ax.set_ylabel('Actual Classes')
heatmap_ax.set_xlabel('Predicted Classes')
plt.show()

In [None]:
# Per-class precision / recall / F1 / support of the tuned ensemble, as a table.
# NOTE: this rebinds report_voting_clf, replacing the baseline ensemble's report.
report_as_dict = classification_report(
    y_valid,
    y_pred_voting_clf_gs,
    target_names=["False", "True"],
    output_dict=True,
)
report_voting_clf = pd.DataFrame(report_as_dict)
report_voting_clf

In [None]:
# ROC curve of the *tuned* ensemble, so it must be built from y_pred_voting_clf_gs
# (the predictions of voting_clf_gs), not from the baseline ensemble's predictions.
false_positive_rate_voting_clf_gs, true_positive_rate_voting_clf_gs, thresholds_voting_clf_gs = roc_curve(y_valid, y_pred_voting_clf_gs)

auc_voting_clf_gs = auc(false_positive_rate_voting_clf_gs, true_positive_rate_voting_clf_gs)

plt.figure(figsize=(8, 8))
plt.plot(false_positive_rate_voting_clf_gs, true_positive_rate_voting_clf_gs, label="AUC = %0.2f"%auc_voting_clf_gs)
# Diagonal reference line of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.legend(loc='lower right')
plt.title("Receiver Operating Characteristic(ROC) for Voting Classifier")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")

In [None]:
# Keep the raw test frame untouched (its PassengerId column is read again when the
# submission is built) and do the feature engineering on an independent deep copy.
test_csv_path = '/kaggle/input/spaceship-titanic/test.csv'
data_test = pd.read_csv(test_csv_path)
data_test_copy = data_test.copy()

In [None]:
# Replace missing names with a placeholder so the surname split that follows never meets NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data_test_copy.Name` is a chained in-place operation, which leaves the frame unchanged
# under pandas Copy-on-Write.
data_test_copy['Name'] = data_test_copy['Name'].fillna('No_name')

In [None]:
# Surname = last whitespace-separated token of each passenger name (same rule as for training).
surname = [full_name.split()[-1] for full_name in data_test_copy['Name']]
data_test_copy['Surname'] = surname

In [None]:
# Total bill across all amenities; skipna=False keeps the total NaN when any component is missing.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data_test_copy['Total Spending'] = data_test_copy[spending_columns].sum(axis=1, skipna=False)

In [None]:
# The identifier and the raw name are not used as model features.
data_test_copy = data_test_copy.drop(columns=['PassengerId', 'Name'])

In [None]:
# Apply the preprocessing fitted on the training split to the test features (no refitting).
test_features = data_test_copy[num_attribs + cat_attribs]
data_test_prepared = preprocess_pipeline.transform(test_features)

In [None]:
# Label the transformed test matrix with exactly the feature names the estimators were
# fitted on. scikit-learn validates feature names at predict time, so taking them from the
# prepared training frame avoids depending on the column order of the test CSV.
data_test_prepared = pd.DataFrame(data_test_prepared, columns=X_train_prepared.columns)

In [None]:
# Predict the Transported label for every test passenger with the tuned voting ensemble.
pred_data_test_prepared = voting_clf_gs.predict(data_test_prepared)
pred_data_test_prepared

In [None]:
# Pair each test passenger id (from the untouched raw frame) with its predicted label.
submission = pd.DataFrame({
    'PassengerId': data_test['PassengerId'],
    'Transported': pred_data_test_prepared,
})
submission

In [None]:
# Write the submission file to the current working directory, without the DataFrame index column.
submission.to_csv("submission.csv", index=False)