In [None]:
#Importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import learning_curve
import numpy as np
from sklearn.metrics import accuracy_score
import random
from scipy import stats
import os
from IPython.core.display import HTML

In [None]:
seed=1337
def seed_everything(seed=0):
    """Seed the random sources used in this notebook.

    Sets the PYTHONHASHSEED environment variable and seeds both Python's
    `random` module and NumPy's legacy global generator with the same value.

    Parameters
    ----------
    seed : int, default 0
        Value passed to every seeding call.
    """
    # Note: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only affects child processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(seed)

In [None]:
# Read train.csv from the relative ../input/spaceship-titanic directory into `df`
df = pd.read_csv('../input/spaceship-titanic/train.csv')

In [None]:
# Preview the first five rows of the training data
df.head()

In [None]:
# (rows, columns) of the training data
df.shape

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics produced by describe() for the training data
df.describe()

In [None]:
# Count passengers per HomePlanet (rows with a missing HomePlanet are not
# counted) and list the planets from most to least common.
print("Unique Home Planet values and their counts :")
hp = (
    df.groupby(['HomePlanet'])
    .size()
    .reset_index(name="Counts")
    .sort_values("Counts", ascending=False)
)
print(hp)

In [None]:
# Bar chart of the passenger count per home planet, one colour per planet.
px.bar(
    hp,
    x="HomePlanet",
    y="Counts",
    color="HomePlanet",
    title="Distinct Count of Planet",
)

Observation:
1. Most passengers come from the home planet Earth.

In [None]:
# Count passengers per CryoSleep value (missing values are not counted),
# most common value first.
print("Unique Cryosleep values and their counts :")
cs = (
    df.groupby(['CryoSleep'])
    .size()
    .reset_index(name="Counts")
    .sort_values("Counts", ascending=False)
)
print(cs)

In [None]:
# Bar chart of the passenger count per CryoSleep value.
px.bar(
    cs,
    x="CryoSleep",
    y="Counts",
    color="CryoSleep",
    title="Distinct Count of Cryosleep",
)

In [None]:
# Count passengers per Destination (missing values are not counted),
# most common destination first.
print("Unique Destination values and their counts :")
dt = (
    df.groupby(['Destination'])
    .size()
    .reset_index(name="Counts")
    .sort_values("Counts", ascending=False)
)
print(dt)

In [None]:
# Bar chart of the passenger count per destination.
px.bar(
    dt,
    x="Destination",
    y="Counts",
    color="Destination",
    title="Distinct Count of Destination",
)

Observation:
1. Most passengers were travelling to TRAPIST-1e.

In [None]:
#Visualizing the passengers' age distribution with a histogram (nbins=10)
px.histogram(df, x="Age", title="Passenger's Age Distribution", nbins=10, color_discrete_sequence=["coral"])

Observation
1. Most passengers are between 15 and 34 years old, and there are very few elderly passengers.

In [None]:
# Count VIP vs. non-VIP passengers (missing values are excluded by value_counts).
# The column names are set explicitly instead of renaming the output of
# reset_index(): its default names differ between pandas versions
# ('index'/'VIP' before 2.0, 'VIP'/'count' from 2.0 on), and the old rename map
# also contained a key ('kickType') that never matched any column.
#   'T/F' -> the VIP flag, 'VIP' -> number of passengers with that flag
VIP = df['VIP'].value_counts().rename_axis('T/F').reset_index(name='VIP')
VIP

In [None]:
# Pie chart of the VIP split: slices are sized by the 'VIP' count column and
# labelled by the 'T/F' flag column.
fig = px.pie(
    VIP,
    names='T/F',
    values='VIP',
    title='VIP or not',
    width=500,
    height=400,
)
fig.show()

Observation
1. Very few passengers are VIPs.

In [None]:
# Count passengers per Transported value, most common value first.
print("Unique Transported values and their counts :")
tp = (
    df.groupby(['Transported'])
    .size()
    .reset_index(name="Counts")
    .sort_values("Counts", ascending=False)
)
print(tp)

In [None]:
# Bar chart of the passenger count per Transported value.
px.bar(
    tp,
    x="Transported",
    y="Counts",
    color="Transported",
    title="Distinct Count of Transported",
)

In [None]:
# Same Transported counts shown as shares of a pie.
fig = px.pie(
    tp,
    names='Transported',
    values='Counts',
    title='Transported',
)
fig.show()

In [None]:
# Bar per home planet with `Transported` on the y-axis, coloured by planet.
sns.barplot(
    data=df,
    x='HomePlanet',
    y='Transported',
    hue='HomePlanet',
)

Passengers from Europa have the highest proportion of transported passengers.

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# The columns are selected explicitly because pandas >= 2.0 no longer skips
# non-numeric columns in DataFrame.corr() and raises on the string columns of
# `df`; older versions dropped them silently, so the result there is unchanged.
px.imshow(
    df.select_dtypes(include=['number', 'bool']).corr(),
    text_auto='.2f',
    color_continuous_scale='BuPu',
)

In [None]:
# Number of missing values in each column
df.isnull().sum()

Observation
1. Every column except PassengerId and Transported has some null values.

# Filling Missing Data

In [None]:
# Fill missing values in `df`:
#   - the numeric columns get their column mean,
#   - the categorical columns get their most frequent value.
# `imp` and `imp_m` keep the fitted statistics.
numeric_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
categorical_cols = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']

imp = SimpleImputer(strategy="mean")
numeric_filled = imp.fit_transform(df[numeric_cols])
df[numeric_cols] = pd.DataFrame(numeric_filled)

imp_m = SimpleImputer(strategy="most_frequent")
categorical_filled = imp_m.fit_transform(df[categorical_cols])
df[categorical_cols] = pd.DataFrame(categorical_filled)

# Per-column flag: does the column still contain any missing value?
df.isna().any()

In [None]:
# Feature frame: `df` without the columns listed below; the target is `Transported`.
dropped_cols = ['PassengerId','Name','Destination','Transported','Cabin']
X = df.drop(columns=dropped_cols)
y = df['Transported']
X.head()

In [None]:
# Standardising scaler; it is fitted later on the numeric feature columns.
# (StandardScaler is already imported in the notebook's import cell.)
scaler_s = StandardScaler()

## Splitting Dataset

In [None]:
from sklearn.model_selection import train_test_split

# `splittus` switches between a train/validation split (True) and training on
# the complete data set for the final model (False).
splittus=True

# Target as 0/1 integers
y=df['Transported'].astype(int)
print(df.columns)

# Encode and scale the features in BOTH modes. Previously the no-split path
# trained on the raw `df` (string columns such as Name and Cabin, no one-hot
# encoding, no scaling, `scaler_s` never fitted), which the model cannot fit
# and which does not match the features built for the test set.
X = pd.get_dummies(X,columns=['HomePlanet','CryoSleep','VIP'])

scaled_cols = ['RoomService','Age','FoodCourt', 'ShoppingMall','Spa','VRDeck']
# The scaled array is assigned directly, so values are written by position and
# do not depend on X's index labels.
X[scaled_cols] = scaler_s.fit_transform(X[scaled_cols])

if splittus:
    # 75 % train / 25 % validation, shuffled reproducibly with the notebook seed
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=seed,shuffle=True)
else:
    X_train=X
    y_train=y

## Applying Model

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier evaluated on accuracy; training output is silenced.
model = CatBoostClassifier(
    iterations=1000,
    eval_metric='Accuracy',
    verbose=0,
)

# With a validation split, evaluate on it during training and keep the best
# iteration; otherwise fit on the training data only.
fit_kwargs = {'eval_set': (X_val, y_val), 'use_best_model': True} if splittus else {}
model.fit(X_train, y_train, **fit_kwargs)

# Report what the fitted model ended up using
print(model.get_best_iteration())
print(model.random_seed_)
print(model.learning_rate_)

In [None]:
# Accuracy on the training split, then on the validation split
# (only available when a split was made).
if splittus:
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)

    print(accuracy_score(y_train.values, train_pred))
    print(accuracy_score(y_val.values, val_pred))

# Predicting on the Test Set

In [None]:
# Read test.csv from the relative ../input/spaceship-titanic directory and preview it
test =pd.read_csv("../input/spaceship-titanic/test.csv")
test.head()

In [None]:
# Fill missing values in the test set with the statistics learned from the
# training data: `imp` (column means) and `imp_m` (most frequent values) were
# fitted on `df` in the "Filling Missing Data" section. Fitting fresh imputers
# on the test set, as was done before, fills train and test with different
# values for the same feature.
num_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
cat_cols = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']

# The arrays are assigned directly, so values are written row by row
# regardless of index labels.
test[num_cols] = imp.transform(test[num_cols])
test[cat_cols] = imp_m.transform(test[cat_cols])

# Per-column flag: does the column still contain any missing value?
test.isna().any()

In [None]:
# Build the test feature matrix the same way as the training features.
test_n = test.drop(columns =['PassengerId','Name','Destination','Cabin'])
test_n = pd.get_dummies(test_n,columns=['HomePlanet','CryoSleep','VIP'])

# Apply the scaler that was fitted on the training features. The model was
# trained on standardised values, so feeding it raw test values (as before)
# makes its learned thresholds meaningless. The column order matches the
# order used when the scaler was fitted.
scaled_cols = ['RoomService','Age','FoodCourt', 'ShoppingMall','Spa','VRDeck']
test_n[scaled_cols] = scaler_s.transform(test_n[scaled_cols])

# Same columns in the same order as the matrix the model was fitted on; a
# dummy column that does not occur in the test set is added as all zeros.
test_n = test_n.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predict the target for every test row (the model was trained on 0/1 integer
# labels) and show the shape of the prediction array
y_pred=model.predict(test_n)
y_pred.shape

In [None]:
# Build and write the submission file. The model was trained on 0/1 integer
# labels, so its predictions are converted back to booleans to match the
# True/False values of the original `Transported` column.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": np.asarray(y_pred).ravel().astype(bool)
})
submission.to_csv('titanic spaceship.csv',index=False)

In [None]:
# Display the submission frame that was written to disk
submission