In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train_data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [None]:
train_data.head()

In [None]:
def split_index(df):
    """Return a copy of ``df`` with ``PassengerId`` split on ``'_'`` appended as columns.

    The first piece becomes the ``Group`` column and the second becomes
    ``Prog``; any further pieces keep the integer column labels produced by
    ``str.split(expand=True)``. The input frame is not modified.
    """
    id_parts = (
        df['PassengerId']
        .str.split('_', expand=True)
        .rename(columns={0: "Group", 1: "Prog"})
    )
    return pd.concat([df, id_parts], axis=1)

# Add the Group / Prog columns to both frames.
# Gotcha: split_index concatenates onto whatever columns the frame already has,
# so re-running this cell appends a second pair of Group/Prog columns.
train_data = split_index(train_data)
test_data = split_index(test_data)

In [None]:
# Columns whose missing-value counts are reported by print_null_values below.
features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Name']

def print_null_values(dataset,features):
    """Print a one-line missing-value summary for each column in ``features``.

    Each line has the form ``<feature> <non-null count> <null count> <pct>%``.
    Note that the percentage is the share of rows that are *present*
    (non-null), not the share that is missing.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.
    features : iterable of str
        Column names of ``dataset`` to report on.
    """
    row_count = dataset.shape[0]
    for feature in features:
        values = dataset[feature]
        present = values.count()
        missing = values.isna().sum()
        present_pct = (present / row_count) * 100
        print("{0} {1} {2} {3:.2f}%".format(feature, present, missing, present_pct))
    
print_null_values(train_data,features)

In [None]:
print_null_values(test_data,features)

In [None]:
def split_deck(df):
    """Return a copy of ``df`` with ``Cabin`` split on ``'/'`` appended as columns.

    The three pieces are named ``Deck``, ``Num`` and ``Side`` in that order.
    Rows with a missing ``Cabin`` get missing values in the new columns.
    The input frame is not modified.
    """
    cabin_parts = (
        df['Cabin']
        .str.split('/', expand=True)
        .rename(columns={0: "Deck", 1: "Num", 2: "Side"})
    )
    return pd.concat([df, cabin_parts], axis=1)

# Add the Deck / Num / Side columns to both frames.
# Gotcha: re-running this cell appends the three columns a second time.
train_data = split_deck(train_data)
test_data = split_deck(test_data)

In [None]:
# Keep the test PassengerId values; they are used as the id column of the submission file.
test_data_index = test_data['PassengerId']

In [None]:
def split_name(df):
    """Return a copy of ``df`` with ``Name`` split on single spaces appended as columns.

    The first token becomes ``First name`` and the second ``Last name``; any
    additional tokens keep the integer column labels produced by
    ``str.split(expand=True)``. The input frame is not modified.
    """
    name_parts = (
        df['Name']
        .str.split(' ', expand=True)
        .rename(columns={0: "First name", 1: "Last name"})
    )
    return pd.concat([df, name_parts], axis=1)


# Add the "First name" / "Last name" columns to both frames.
# Gotcha: re-running this cell appends the name columns a second time.
train_data = split_name(train_data)
test_data = split_name(test_data)

In [None]:
# Cabin and Name have already been expanded into separate columns, so drop the raw strings.
train_data = train_data.drop(['Cabin','Name'], axis=1)
test_data = test_data.drop(['Cabin', 'Name'], axis=1)
# pop() removes the target column from train_data and returns it.
y_train = train_data.pop('Transported')

In [None]:
# Stack train and test rows for exploratory analysis. ignore_index is not set,
# so each frame keeps its own row labels and labels repeat in the result.
all_data = pd.concat([train_data, test_data], axis = 0)

In [None]:
# Exploration frame: only rows where CryoSleep is known, with boolean flags
# turned into 0/1 so they can enter a correlation matrix.
all_data_cryo = all_data.dropna(subset=['CryoSleep']).copy()
# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form acts on a temporary under pandas Copy-on-Write and
# would leave NaN behind, making the integer cast fail.
all_data_cryo["VIP"] = all_data_cryo["VIP"].fillna(False).astype(int)
all_data_cryo["CryoSleep"] = all_data_cryo["CryoSleep"].astype(int)
# Total spend across the five amenity columns; a missing amount counts as 0.
all_data_cryo["Total Expenses"] = all_data_cryo["RoomService"].fillna(0) + all_data_cryo["FoodCourt"].fillna(0) + all_data_cryo["ShoppingMall"].fillna(0) + all_data_cryo["Spa"].fillna(0) + all_data_cryo["VRDeck"].fillna(0)
cols = ["CryoSleep","VIP","HomePlanet","Destination","Age","Total Expenses","Deck","Side"]
test_corr = all_data_cryo[cols]
# One-hot encode the remaining string columns (HomePlanet, Destination, Deck, Side).
test_corr =  pd.get_dummies(test_corr)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between the encoded exploration columns.
corr = test_corr.corr()
# Boolean mask covering the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(12,12))
sns.heatmap(corr, mask=mask,square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# log10(x + 1) keeps zero spend at 0 and compresses the long right tail.
all_data_cryo["Log Total Expenses"] = np.log10(all_data_cryo["Total Expenses"] + 1.0)
# all_data was concatenated without ignore_index, so row labels repeat; this
# replaces them with a fresh 0..n-1 index (old labels are kept in an 'index' column).
# Gotcha: in-place, so re-running the cell adds yet another index column.
all_data_cryo.reset_index(inplace=True)

sns.histplot(data=all_data_cryo,hue='CryoSleep',x='Log Total Expenses')

In [None]:
# Rule of thumb: predict CryoSleep=1 when log10(total + 1) < 1, i.e. total spend below 9.
test_cryo = all_data_cryo["Log Total Expenses"] < 1
test_cryo = test_cryo.astype(int)

# Fraction (0..1, not a percentage) of rows where the rule matches the recorded CryoSleep.
print("CryoSleep accuracy {0:.2f}".format((all_data_cryo["CryoSleep"] == test_cryo).mean()))

In [None]:
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` acts on
# a temporary under pandas Copy-on-Write and would leave the frame unchanged.
all_data_cryo["VIP"] = all_data_cryo["VIP"].fillna(False)
def calclog_total_expenses(df):
    """Add ``Total Expenses`` and ``Log Total Expenses`` columns to ``df`` in place.

    ``Total Expenses`` is the sum of RoomService, FoodCourt, ShoppingMall, Spa
    and VRDeck, with a missing amount counted as 0. ``Log Total Expenses`` is
    ``log10(Total Expenses + 1)``. The same (mutated) frame is returned.
    """
    spend_columns = ("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")
    running_total = df[spend_columns[0]].fillna(0)
    for name in spend_columns[1:]:
        running_total = running_total + df[name].fillna(0)
    df["Total Expenses"] = running_total
    df["Log Total Expenses"] = np.log10(df["Total Expenses"] + 1.0)
    return df

# Adds "Total Expenses" / "Log Total Expenses" to both frames (the function mutates and returns its argument).
train_data = calclog_total_expenses(train_data)
test_data = calclog_total_expenses(test_data)

In [None]:
# Fill missing CryoSleep with the spending rule checked above: a passenger whose
# log10(total + 1) is below 1 (total spend under 9) is assumed to be in cryosleep.
# The result is assigned back rather than using `df[col].fillna(..., inplace=True)`,
# which acts on a temporary under pandas Copy-on-Write and leaves the NaN in place.
# The explicit bool cast keeps the column out of get_dummies' object-column
# encoding regardless of how fillna handles the object dtype.
train_data["CryoSleep"] = train_data["CryoSleep"].fillna(train_data["Log Total Expenses"] < 1).astype(bool)
test_data["CryoSleep"] = test_data["CryoSleep"].fillna(test_data["Log Total Expenses"] < 1).astype(bool)

In [None]:
test_data.head()

In [None]:
# Fill missing HomePlanet with the constant "Earth". Assign the result back:
# chained `df[col].fillna(..., inplace=True)` is a no-op under pandas Copy-on-Write.
train_data["HomePlanet"] = train_data["HomePlanet"].fillna("Earth")
test_data["HomePlanet"] = test_data["HomePlanet"].fillna("Earth")

In [None]:
# Fill missing Destination with the constant "TRAPPIST-1e". Assign the result back:
# chained `df[col].fillna(..., inplace=True)` is a no-op under pandas Copy-on-Write.
train_data["Destination"] = train_data["Destination"].fillna("TRAPPIST-1e")
test_data["Destination"] = test_data["Destination"].fillna("TRAPPIST-1e")

In [None]:
# Treat a missing VIP flag as False. Assign the result back (chained in-place
# fillna is a no-op under pandas Copy-on-Write) and pin the dtype to bool so the
# column is not picked up by get_dummies' object-column encoding later on.
train_data["VIP"] = train_data["VIP"].fillna(False).astype(bool)
test_data["VIP"] = test_data["VIP"].fillna(False).astype(bool)

In [None]:
# Exploration frame: combined train+test rows that have a recorded Age.
all_data_age = all_data.dropna(subset=['Age']).copy()

In [None]:
# all_data carries repeated row labels (train and test were concatenated without
# ignore_index); give the frame a fresh 0..n-1 index before plotting.
# Gotcha: in-place, so re-running the cell adds another index column.
all_data_age.reset_index(inplace=True)
sns.histplot(data=all_data_age,hue='HomePlanet',x='Age')

In [None]:
# Most frequent Age per HomePlanet. The result is a DataFrame indexed by
# HomePlanet with a single 'Age' column, so a planet's value is read with
# grouped_ages.loc[planet, 'Age'] (grouped_ages[planet] would look up a column).
# A cell holds an array instead of a scalar when several ages tie for the mode.
grouped_ages = all_data_age[['HomePlanet', 'Age']].groupby(['HomePlanet']).agg(pd.Series.mode)
grouped_ages

In [None]:
# Despite the name this is the mode (most frequent Age), not the median.
# Series.mode() returns a Series, which has more than one entry when ages tie.
median_ages = all_data_age['Age'].mode()
median_ages

In [None]:
def impute_age(dataset, planet_ages=None, default_age=None):
    """Fill missing ``Age`` values of ``dataset`` in place.

    A missing age is replaced by the most common age of the passenger's
    ``HomePlanet``; passengers whose planet has no entry fall back to the
    overall most common age.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``HomePlanet`` and ``Age`` columns; modified in place.
    planet_ages : pandas.DataFrame or pandas.Series, optional
        Per-planet ages indexed by planet name (a DataFrame must carry them in
        an ``Age`` column). Defaults to the notebook-level ``grouped_ages``.
    default_age : scalar or array-like, optional
        Fallback age. Defaults to the notebook-level ``median_ages``.

    Returns
    -------
    None
    """
    if planet_ages is None:
        planet_ages = grouped_ages
    if default_age is None:
        default_age = median_ages

    def _first_value(value):
        # mode() results may be a scalar, an array of tied values or a Series;
        # reduce all of them to the first (smallest) value as a plain float.
        return float(np.asarray(value, dtype=float).ravel()[0])

    # The groupby result is a frame indexed by planet with one 'Age' column.
    # The previous `grouped_ages[home]` looked the planet up among the *columns*,
    # always failed, and a bare `except` silently fell back to the global value.
    if isinstance(planet_ages, pd.DataFrame):
        planet_ages = planet_ages['Age']
    age_by_planet = {planet: _first_value(age) for planet, age in planet_ages.items()}
    fallback_age = _first_value(default_age)

    missing = dataset['Age'].isna()
    if not missing.any():
        return
    filled = dataset.loc[missing, 'HomePlanet'].map(age_by_planet).fillna(fallback_age)
    # Assign positionally (ndarray) so repeated index labels cannot break alignment.
    dataset.loc[missing, 'Age'] = filled.to_numpy(dtype=float)
            
# Fill the missing ages of both frames in place (impute_age returns nothing).
impute_age(train_data)
impute_age(test_data)

In [None]:
# Fill missing Deck with the constant "F". Assign the result back:
# chained `df[col].fillna(..., inplace=True)` is a no-op under pandas Copy-on-Write.
train_data["Deck"] = train_data["Deck"].fillna("F")
test_data["Deck"] = test_data["Deck"].fillna("F")

In [None]:
# Fill missing Side with the constant "S". Assign the result back:
# chained `df[col].fillna(..., inplace=True)` is a no-op under pandas Copy-on-Write.
train_data["Side"] = train_data["Side"].fillna("S")
test_data["Side"] = test_data["Side"].fillna("S")

In [None]:
def age_band(dataset,bins = 5):
    """Bucket ``dataset['Age']`` into ``bins`` equal-width bands.

    The band edges come from the minimum and maximum age of ``dataset``
    itself (``pd.cut`` with an integer ``bins``). Bands are labelled
    ``0 .. bins-1`` from youngest to oldest and returned as a categorical
    Series aligned with ``dataset``.
    """
    ages = dataset['Age']
    return pd.cut(ages, bins=bins, labels=range(bins))

train_data['AgeBand'] = age_band(train_data,6).astype('int')
# Reuse the training band edges for the test set. Cutting each frame on its own
# min/max would let the same band number stand for different age ranges in
# train and test whenever their age ranges differ.
age_edges = pd.cut(train_data['Age'], bins=6, retbins=True)[1]
# Open the outer edges so test ages outside the training range still get a band.
age_edges[0], age_edges[-1] = -np.inf, np.inf
test_data['AgeBand'] = pd.cut(test_data['Age'], bins=age_edges, labels=range(6)).astype('int')

In [None]:
# Re-attach the target so it can be aggregated per (Last name, Group) pair.
train_data_surname = pd.concat([train_data,y_train], axis = 1)
# For every (Last name, Group) pair: number of training rows and their mean Transported rate.
# NOTE(review): family_info later writes these rates back onto the same training
# rows, so each training row's feature includes its own label (target leakage);
# consider an out-of-fold encoding.
surname_map = train_data_surname.groupby(['Last name','Group'])['Transported'].agg(['count', 'mean'])

In [None]:
# Smoothing weight: the global mean counts as m pseudo-observations per group.
m=3

# Overall Transported rate of the training set (the "survived" naming is kept as-is).
mean_survived = y_train.mean()
counts = surname_map['count']
means = surname_map['mean']

# Smoothed per-(Last name, Group) rate: group rates are pulled toward the
# global mean, more strongly the fewer rows the group has.
smooth = (counts * means + m * mean_survived) / (counts + m)

def family_info(dataset, smoothed_rates=None, default_rate=None):
    """Add a ``FamilySurvived`` column to ``dataset`` in place.

    Each row receives the smoothed rate stored for its
    ``('Last name', 'Group')`` pair; rows whose pair is unknown (including
    rows with a missing last name) receive ``default_rate``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``Last name`` and ``Group`` columns; modified in place.
    smoothed_rates : pandas.Series, optional
        Rates indexed by a unique two-level (last name, group) index.
        Defaults to the notebook-level ``smooth``.
    default_rate : float, optional
        Value for pairs without an entry. Defaults to the notebook-level
        ``mean_survived``.

    Returns
    -------
    None
    """
    if smoothed_rates is None:
        smoothed_rates = smooth
    if default_rate is None:
        default_rate = mean_survived

    keys = ['Last name', 'Group']
    lookup = smoothed_rates.rename_axis(keys).rename('FamilySurvived').reset_index()
    # A left merge keeps one output row per input row, in the original order,
    # and leaves NaN where the pair is unknown. This replaces a per-row lookup
    # wrapped in a bare `except`, which would also have hidden unrelated errors.
    matched = dataset[keys].merge(lookup, how='left', on=keys)
    # Assign positionally so the frame's own index labels are irrelevant.
    dataset['FamilySurvived'] = matched['FamilySurvived'].fillna(default_rate).to_numpy()
        
# Add the FamilySurvived feature to both frames in place.
family_info(train_data) 
family_info(test_data) 

In [None]:
# Columns kept for modelling; everything else (ids, names, Num, ...) is discarded in the next cell.
interesting_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Deck', 'Side', 'Total Expenses', 'Log Total Expenses', 'AgeBand', 'FamilySurvived']

In [None]:
# Keep only the modelling columns. The explicit copy makes each result an
# independent frame, so the assignments below do not write into a slice.
train_data = train_data[interesting_cols].copy()
test_data = test_data[interesting_cols].copy()

# A missing amenity amount is treated as no spend. The filled columns are
# assigned back instead of calling `df[col].fillna(0, inplace=True)`, which
# acts on a temporary under pandas Copy-on-Write and leaves the NaN in place.
spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
train_data[spend_cols] = train_data[spend_cols].fillna(0)
test_data[spend_cols] = test_data[spend_cols].fillna(0)

# Sanity check: every selected column of the test frame should now be fully populated.
print_null_values(test_data,interesting_cols)

In [None]:
# One-hot encode the string columns. The two frames are encoded independently,
# so the test frame is re-aligned to the training columns: a category that
# occurs only in train becomes an all-zero column, one that occurs only in
# test is dropped, and the column order matches what the model is fitted on.
train_data = pd.get_dummies(train_data)
test_data =  pd.get_dummies(test_data).reindex(columns=train_data.columns, fill_value=0)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# One scaler object, re-fitted for each column in the cells below; after every
# fit_transform it only holds the range of the column it was last fitted on.
mms = MinMaxScaler()

In [None]:
# Fit the min/max on the training ages only, then apply the same scaling to the test ages.
train_data['Age'] = mms.fit_transform(train_data["Age"].values.reshape(-1, 1))
test_data['Age'] = mms.transform(test_data["Age"].values.reshape(-1, 1))

In [None]:
# Drop the raw total; the "Log Total Expenses" column stays in the frames.
train_data = train_data.drop(['Total Expenses'], axis=1)
test_data = test_data.drop(['Total Expenses'], axis=1)

In [None]:
# Convert the two boolean flags to 0/1 integers.
train_data["VIP"] = train_data["VIP"].astype(int)
test_data["VIP"] = test_data["VIP"].astype(int)

train_data["CryoSleep"] = train_data["CryoSleep"].astype(int)
test_data["CryoSleep"] = test_data["CryoSleep"].astype(int)

In [None]:
# Min-max scale each spending column separately: the scaler is re-fitted on the
# training values of one column, then the same scaling is applied to that
# column of the test frame before moving on to the next column.
for spend_col in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_data[spend_col] = mms.fit_transform(train_data[spend_col].values.reshape(-1, 1))
    test_data[spend_col] = mms.transform(test_data[spend_col].values.reshape(-1, 1))

train_data

In [None]:
# Work on copies so the feature selection below does not alter train_data / test_data.
X_train = train_data.copy()
X_test = test_data.copy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Minimum random-forest importance a feature needs in order to be kept.
treshold = 0.01


# Fixed seed: without it the importances, and therefore the set of selected
# columns fed to the SVM, can differ from one run of the notebook to the next.
forest = RandomForestClassifier(n_estimators = 500, random_state = 42)
forest.fit(X_train,y_train)

importances = forest.feature_importances_
# Feature positions ordered from most to least important.
indexes = np.argsort(importances)[::-1]

# Names of the features that pass the threshold, most important first.
columns = []


for i,f in enumerate(indexes):
    importance = importances[f]
    if importance >= treshold:
        column = X_train.columns[f]
        columns.append(column)
        print("{0:2} {1:30} {2:.2f}".format(i+1,column,importances[f]))

In [None]:
# Keep only the features selected by the random-forest importance threshold.
X_train = X_train[columns]
X_test = X_test[columns]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import time

# Candidate values, reused for both C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]

# Two sub-grids: a linear kernel tuned over C, and an RBF kernel tuned over C x gamma.
param_grid = [{'C': param_range, 
               'kernel': ['linear']},
              {'C': param_range, 
               'gamma': param_range, 
               'kernel': ['rbf']}]

model = SVC()

# 10-fold cross-validated search scored by accuracy; refit=True retrains the
# best configuration on all of X_train so `gs` can be used for prediction.
gs = GridSearchCV(estimator=model, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  refit=True,
                  cv=10,
                  n_jobs=-1)



start = time.time()
gs = gs.fit(X_train, y_train)
# Wall-clock duration of the whole search, in seconds.
print(f'Time: {time.time() - start}')

# best_score_ is the mean cross-validated accuracy of the best parameter set.
print("Model accuracy {0:.2f}%".format(gs.best_score_ * 100))
print(gs.best_params_)

In [None]:
# Predict with the refitted best estimator found by the grid search.
predictions = gs.predict(X_test)

In [None]:
# Pair the saved test PassengerId values with the boolean predictions and write
# them to my_submission.csv in the current working directory (no index column).
output = pd.DataFrame({'PassengerId': test_data_index, 'Transported': predictions.astype(bool)})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")