'''Welcome to the year 2912, where your data science skills are needed to solve a cosmic mystery. 
We've received a transmission from four lightyears away and things aren't looking good.
The Spaceship Titanic was an interstellar passenger liner launched a month ago. With almost 13,000 passengers on board, 
the vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime 
anomaly hidden within a dust cloud. Sadly, it met a similar fate as its namesake from 1000 years before. Though the ship stayed intact, almost half of 
the passengers were transported to an alternate dimension!

To help rescue crews and retrieve the lost passengers, you are challenged to predict which passengers were transported by the anomaly using records 
recovered from the spaceship's damaged computer system. Help save them and change history!'''

In [None]:
import pandas as pd
import numpy as np
import sklearn
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
#from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Load the competition data and keep the training passenger ids at hand.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')
pID = train['PassengerId']

# Report how many values are missing in each column of both frames.
for frame in (train, test):
    print(frame.isnull().sum())

In [None]:
# Lots of edits copied from OPAMUSORA's notebook
# Columns handled as categorical; the numeric mean-imputation below skips them.
cats = ['HomePlanet', 'Cabin', 'Destination', 'CryoSleep', 'VIP', 'Name']

# A missing amenity bill is treated as "nothing spent".
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train[spend_cols] = train[spend_cols].fillna(0)
test[spend_cols] = test[spend_cols].fillna(0)

# Fill the remaining non-categorical gaps with the TRAIN column mean in both
# frames.  Imputing train with the per-'Transported' class mean would leak the
# target into the feature, and it cannot be reproduced on test (no labels),
# so train and test would end up imputed inconsistently.
# Only columns present in both frames are considered, which also keeps the
# target column itself out of the loop.
for col in train.columns.intersection(test.columns):
    if col in cats:
        continue
    if not (train[col].isna().any() or test[col].isna().any()):
        continue
    fill_value = train[col].mean()
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# EDA
EDA repurposed from Akshay Nevrekar's work on "Titanic - Machine Learning from Disaster".


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=train, x='Transported', ax=ax)
ax.set_title("Count of Transported")
plt.show()

In [None]:
print(train.columns)

# Working lists of categorical and numeric feature names.
cats = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
num = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Pairwise Pearson correlation (the default method) between the numeric features.
corr_df = train.loc[:, num]
cor = corr_df.corr()
print(cor)

In [None]:
# Heatmap of the numeric-feature correlation matrix.  The mask is all False,
# so every cell of the matrix is drawn.
fig, ax = plt.subplots(figsize=(16, 12))
ax.set_title("Correlation Plot")
palette = sns.diverging_palette(220, 10, as_cmap=True)
nothing_hidden = np.zeros_like(cor, dtype=bool)
sns.heatmap(cor, mask=nothing_hidden, cmap=palette, square=True, ax=ax)
plt.show()

In [None]:
from scipy.stats import chi2_contingency

# Chi-squared test of independence between the target and HomePlanet.
contingency = pd.crosstab(train['Transported'], train['HomePlanet'])
csq = chi2_contingency(contingency)
print("P-value: ", csq[1])

In [None]:
# Chi-squared test of independence between the target and CryoSleep.
contingency = pd.crosstab(train['Transported'], train['CryoSleep'])
csq = chi2_contingency(contingency)
print("P-value: ", csq[1])

In [None]:
# Chi-squared test of independence between the target and the raw Cabin string.
contingency = pd.crosstab(train['Transported'], train['Cabin'])
csq = chi2_contingency(contingency)
print("P-value: ", csq[1])

In [None]:
# Target counts, split by CryoSleep status.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train, x='Transported', hue='CryoSleep', ax=ax)
ax.set_title("Impact of CryoSleep on Transported")
plt.show()

In [None]:
# Target counts, split by home planet.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train, x='Transported', hue='HomePlanet', ax=ax)
ax.set_title("Impact of Home Planet on Transported")
plt.show()

In [None]:
# based on other user, split cabin for more accuracy
# split cabin into deck, num, and side
for frame in (train, test):
    # Fill a missing cabin from the previous row.  `fillna(method='ffill')` is
    # deprecated in recent pandas, so the dedicated method is used.  The
    # trailing bfill covers a gap in the very first row(s), which a forward
    # fill leaves as NaN and which would make the split below fail.
    frame['Cabin'] = frame['Cabin'].ffill().bfill()

    # 'deck/num/side' -> three string columns, split once instead of three times.
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame['Deck'] = cabin_parts[0]
    frame['Num'] = cabin_parts[1]
    frame['Side'] = cabin_parts[2]

In [None]:
# Chi-squared test of independence between the target and each cabin component.
for part in ('Deck', 'Num', 'Side'):
    csq = chi2_contingency(pd.crosstab(train['Transported'], train[part]))
    print(f"{part}  P-value: ", csq[1])

In [None]:
# Rows with no CryoSleep record are set to False in both frames.
for frame in (train, test):
    frame['CryoSleep'] = frame['CryoSleep'].fillna(False)

In [None]:
# 'group' is the part of PassengerId that precedes the first underscore.
train['group'] = [pid.split('_')[0] for pid in train['PassengerId']]
test['group'] = [pid.split('_')[0] for pid in test['PassengerId']]

In [None]:
def _count_repeated_surnames(names):
    """Return 1 plus the number of names whose surname was already seen.

    The surname is the second space-separated token of a name; a name with a
    single token is used as its own surname instead of raising IndexError.

    Args:
        names: iterable of full-name strings belonging to one travel group.

    Returns:
        int: 1 when every surname in the group is distinct, and one more for
        each name that repeats a surname seen earlier in the iteration.
    """
    seen_surnames = set()
    repeats = 0
    for full_name in names:
        tokens = full_name.split(' ')
        surname = tokens[1] if len(tokens) > 1 else tokens[0]
        if surname in seen_surnames:
            repeats += 1
        seen_surnames.add(surname)
    return 1 + repeats


# Build the per-group relatives count for train and test independently.
for frame in (train, test):
    # Fill a missing name from the previous row (bfill covers a leading gap);
    # `fillna(method='ffill')` is deprecated in recent pandas.
    frame['Name'] = frame['Name'].ffill().bfill()

    # Group by the column name rather than a one-element list: with a list,
    # newer pandas yields 1-tuple keys such as ('0001',) while iterating, so a
    # lookup table keyed on their string form never matches `frame['group']`
    # and the mapped feature ends up entirely NaN.
    relatives_by_group = {
        group_id: _count_repeated_surnames(group_names)
        for group_id, group_names in frame.groupby('group')['Name']
    }
    frame['has_relatives'] = frame['group'].map(relatives_by_group)

print(train)

# The helper columns are no longer needed once the feature exists.
del train['Name'], train['group']
del test['Name'], test['group']

In [None]:
# Total spend = the five amenity columns added left to right.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (train, test):
    running_total = frame[spend_columns[0]]
    for amenity in spend_columns[1:]:
        running_total = running_total + frame[amenity]
    frame['ttl_spnd'] = running_total

In [None]:
# A passenger counts as an adult unless the age is below 18; a missing age
# compares False against 18 and therefore stays flagged as adult.
for frame in (train, test):
    frame['Adult'] = ~(frame['Age'] < 18)

In [None]:
# Drop the raw cabin string from both frames and swap it, in the categorical
# column list, for the three components derived from it.
for frame in (train, test):
    frame.drop(columns=['Cabin'], inplace=True)

cats.remove('Cabin')
cats.extend(['Deck', 'Num', 'Side'])

print(cats)

In [None]:
# Integer-encode every categorical column.  Each encoder is fitted on the
# string form of the train and test values together, so both frames share
# one label vocabulary per column.
for column in cats:
    print(column)
    combined_values = np.concatenate((train[column], test[column])).astype(str)
    encoder = LabelEncoder()
    encoder.fit(combined_values)
    train[column] = encoder.transform(train[column].astype(str))
    test[column] = encoder.transform(test[column].astype(str))

In [None]:
# i am not sure why this was included
# t=train['Transported']
# del train['Transported']
# train['Transported']=t

In [None]:
t1 = train  # .drop(columns=cats)

# Restrict the correlation to numeric/boolean columns.  `train` still carries
# the string 'PassengerId' here, and DataFrame.corr() in pandas 2.x raises on
# non-numeric columns instead of silently skipping them as older versions did.
numeric_view = t1.select_dtypes(include=['number', 'bool'])

# Large canvas and font so the annotated cells stay readable.
sns.set_theme(font_scale=3)
plt.figure(figsize=(50, 50))
sns.heatmap(numeric_view.corr(), annot=True)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

print(train.columns)
#cats.remove('VIP')

# Fixed seed so the hold-out split is reproducible.
seed = 4062022

# Target as 0/1 integers; features are everything except the target and the id.
y = train['Transported'].astype(int)
X = train.drop(columns=['Transported', 'PassengerId'])

# Shuffled 75/25 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=seed
)


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Gradient-boosted trees scored on accuracy.  The validation split is passed as
# eval_set and use_best_model=True keeps the best-scoring iteration.
# Previously tried options: random_strength=0.1, cat_features=cats
model = CatBoostClassifier(iterations=1000, eval_metric='Accuracy', verbose=0)
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Fitted settings, for reference.
print(model.get_best_iteration())
print(model.random_seed_)
print(model.learning_rate_)

# Accuracy on the training split first, then on the validation split.
out = model.predict(X_val)
out_t = model.predict(X_train)

train_accuracy = accuracy_score(y_train.values, out_t)
print(train_accuracy)
val_accuracy = accuracy_score(y_val.values, out)
print(val_accuracy)

In [None]:
# The id column is not a model feature; drop it before predicting.
test = test.drop('PassengerId', axis=1)

# Predict the test rows and convert the predictions to booleans.
ans = model.predict(test).astype(bool)

# Write the predictions into the sample submission and save it as the
# submission file.
sub = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
sub['Transported'] = ans
sub.to_csv('./submission.csv', index=False)