In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test splits into DataFrames.
train_path = "/kaggle/input/spaceship-titanic/train.csv"
test_path = "/kaggle/input/spaceship-titanic/test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Display the column labels of the freshly loaded training frame.
train.columns

PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

Destination - The planet the passenger will be debarking to.

Age - The age of the passenger.

VIP - Whether the passenger has paid for special VIP service during the voyage.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

Name - The first and last names of the passenger.

Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
import seaborn as sns
import plotly.express as px

In [None]:
# Class balance of the target column.
px.histogram(data_frame=train, x="Transported")

**We have a fairly balanced dataset**

In [None]:
# Passenger counts per home planet, split by the target.
px.histogram(data_frame=train, x="HomePlanet", color="Transported")

In [None]:
# Passenger counts per CryoSleep value, split by the target.
px.histogram(data_frame=train, x="CryoSleep", color="Transported")

**Most of the passengers with cryosleep True are transported**

In [None]:
# Passenger counts per cabin string, split by the target.
px.histogram(data_frame=train, x="Cabin", color="Transported")

**All the cabins towards the right of the plot seem to transport people to the other dimension. We could analyse this further and try to find a pattern, of course without overfitting.**

In [None]:
# Passenger counts per destination, split by the target.
px.histogram(data_frame=train, x="Destination", color="Transported")

In [None]:
# Passenger counts per VIP flag, split by the target.
px.histogram(data_frame=train, x="VIP", color="Transported")

**We have very little VIP data in our train set; we will check its correlation with Transported to see if the variable is important.**

In [None]:
# RoomService spend plotted against the row index, one trace per target value.
px.line(data_frame=train, y="RoomService", color="Transported")

**Room service seems to be higher for individuals that were not transported**

In [None]:
# FoodCourt spend plotted against the row index, one trace per target value.
px.line(data_frame=train, y="FoodCourt", color="Transported")

**Food court seems to be higher for individuals that were transported**

In [None]:
# ShoppingMall spend plotted against the row index, one trace per target value.
px.line(data_frame=train, y="ShoppingMall", color="Transported")

In [None]:
# Spa spend plotted against the row index, one trace per target value.
px.line(data_frame=train, y="Spa", color="Transported")

**Spa costs for a person who was not transported seem to be much higher than for a person who was**

In [None]:
# VRDeck spend plotted against the row index, one trace per target value.
px.line(data_frame=train, y="VRDeck", color="Transported")

**VRDeck costs for a person who was not transported seem to be much higher than for a person who was**

In [None]:
import matplotlib.pyplot as plt
# Pairwise correlation of the numeric/boolean columns only. `train` still
# contains string columns at this point, and since pandas 2.0 `DataFrame.corr`
# no longer drops them silently -- it raises unless `numeric_only=True`.
corr = train.corr(numeric_only=True)
sns.heatmap(corr)

**For the baseline model we will drop the passenger IDs and names**

In [None]:
# Remove the identifier and free-text name columns from both splits.
id_like_cols = ["PassengerId", "Name"]
train = train.drop(columns=id_like_cols)
test = test.drop(columns=id_like_cols)

In [None]:
# Display the remaining column labels now that PassengerId and Name are dropped.
train.columns

In [None]:
# Frequency of each HomePlanet value in the training frame.
train.loc[:, "HomePlanet"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode each categorical column into a new `transform_<column>` column.
# Values are cast to str first, so missing values become the string "nan" and
# receive their own code. A single encoder is re-fitted on the training column
# and then applied to the matching test column before moving to the next one.
le = LabelEncoder()
for cat_col in ("HomePlanet", "CryoSleep", "Destination", "VIP"):
    encoded_col = "transform_" + cat_col
    train[encoded_col] = le.fit_transform(train[cat_col].astype(str))
    test[encoded_col] = le.transform(test[cat_col].astype(str))



In [None]:
# Drop the raw categorical columns (their encoded copies are kept) and Cabin.
raw_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Cabin"]
train = train.drop(columns=raw_cols)
test = test.drop(columns=raw_cols)

In [None]:
# Separate the target column from the feature matrix.
target_col = "Transported"
y = train[target_col]
train = train.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Models
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

In [None]:
# Baseline model. The target is binary, so the binary 'logloss' metric is used
# instead of 'mlogloss' (XGBoost's multi-class log loss). It is a fixed setting,
# not something to search over, so it lives on the estimator rather than in the grid.
model = XGBClassifier(random_state=5, eval_metric='logloss')

# Hyper-parameter grid: 5 x 4 = 20 candidate combinations.
param_grid = {
    'n_estimators': [10, 25, 50, 75, 100],
    'learning_rate': [0.2, 0.15, 0.1, 0.05],
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
# KFold has no `scoring` parameter; passing one raises TypeError.
kf = KFold(n_splits=5, shuffle=True, random_state=5)

# `scoring` is a GridSearchCV argument: candidates are ranked by CV accuracy.
grid_model = GridSearchCV(model, param_grid, cv=kf, scoring='accuracy')

# Run the search over every candidate on the full training data.
grid_model.fit(train, y)

In [None]:
# Predict on the test features. Recent XGBoost versions return class indices
# (0/1) rather than the original boolean labels, so cast to bool to keep the
# submission column as True/False like the training target.
y_pred = grid_model.predict(test).astype(bool)

# Use the sample submission as a template and overwrite its target column.
# NOTE(review): this relies on test.csv and sample_submission.csv sharing the
# same row order -- confirm.
predictions = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")
predictions["Transported"] = y_pred

In [None]:
# Write the submission file to the working directory, without the index column.
predictions.to_csv(path_or_buf="baseline_submission.csv", index=False)