In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# read the train and test CSV files from the relative input directory
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

**Filling Null Values**

In [None]:
# preview the first rows of the training data as loaded
train_df.head()

In [None]:
# summary of the training frame: column dtypes and non-null counts
train_df.info()

In [None]:
# count the missing values per column: train set printed first, test set shown as cell output
print(train_df.isna().sum(), '\n')
test_df.isna().sum()

In [None]:
# filling null values for columns containing categorical variables
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

for frame in (train_df, test_df):
    for col in cat_cols:
        # Series.mode() returns a Series (ties are possible), not a scalar. Passing that
        # Series to fillna aligns it on the index, so only a null at row label 0 could
        # ever be filled. Take the first mode as a scalar instead.
        modes = frame[col].mode()
        if modes.empty:
            # column is entirely null: there is no most-frequent value to fill with
            continue
        # assign the result back instead of calling fillna(..., inplace = True) on a
        # column selection, which does not update the frame under pandas Copy-on-Write
        frame[col] = frame[col].fillna(modes.iloc[0])

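Filling the null values in the categorical columns is important because *pd.get_dummies* (which is used next) ignores missing values by default: a row with a null gets 0 in every dummy column, which makes it indistinguishable from the dropped baseline category. This can lead to misleading results.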

In [None]:
# converting categorical variables into dummy variables for train set
# (drop_first = True leaves out the first level of every variable)
train_dummy_sources = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported']
home_train_new, cryo_train_new, dest_train_new, vip_train_new, trans_train_new = [
    pd.get_dummies(train_df[source], drop_first = True) for source in train_dummy_sources
]

# append the dummy columns to the right of the existing columns
train_df = pd.concat(
    [train_df, home_train_new, cryo_train_new, dest_train_new, vip_train_new, trans_train_new],
    axis = 1,
)

# positions are counted on the concatenated frame: dropping 9 of them and renaming the
# remainder to 12 labels means that frame is expected to hold 21 columns
# NOTE(review): which original / dummy columns sit at these positions depends on the
# column order of the CSV and on the category levels present in the data; positions 14
# and 17 look like they discard one dummy level each - verify that this is intended
train_drop_positions = [1, 2, 3, 4, 6, 12, 13, 14, 17]
train_df = train_df.drop(train_df.columns[train_drop_positions], axis = 1)

# the surviving dummy columns are relabelled with the name of the variable they encode
train_df.columns = [
    'PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported',
]

In [None]:
# converting categorical variables into dummy variables for test set
# (drop_first = True leaves out the first level of every variable)
test_dummy_sources = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
home_test_new, cryo_test_new, dest_test_new, vip_test_new = [
    pd.get_dummies(test_df[source], drop_first = True) for source in test_dummy_sources
]

# append the dummy columns to the right of the existing columns
test_df = pd.concat(
    [test_df, home_test_new, cryo_test_new, dest_test_new, vip_test_new],
    axis = 1,
)

# positions are counted on the concatenated frame: dropping 8 of them and renaming the
# remainder to 11 labels means that frame is expected to hold 19 columns
# NOTE(review): which original / dummy columns sit at these positions depends on the
# column order of the CSV and on the category levels present in the data; positions 13
# and 16 look like they discard one dummy level each - verify that this is intended
test_drop_positions = [1, 2, 3, 4, 6, 12, 13, 16]
test_df = test_df.drop(test_df.columns[test_drop_positions], axis = 1)

# the surviving dummy columns are relabelled with the name of the variable they encode
test_df.columns = [
    'PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
]

In the above two cells, we created dataframes with dummy variables for the columns HomePlanet, CryoSleep, Destination, VIP, and Transported. We then concatenated them to their respective dataframes and deleted the extra columns that were created, along with the columns Name and Cabin. Finally, we renamed the columns, as some of them had the same name.

In [None]:
# filling null values of numeric variables with median scores for both train and test sets
cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in cols:
    # Series.median() skips nulls by default, so it matches np.nanmedian here.
    # The filled column is assigned back to the frame: fillna(..., inplace = True) on a
    # column selection is chained assignment and leaves the frame untouched when pandas
    # Copy-on-Write is active.
    train_df[col] = train_df[col].fillna(train_df[col].median())
    test_df[col] = test_df[col].fillna(test_df[col].median())

No null values remain in either dataset now.

In [None]:
# preview the training frame after encoding and imputation
train_df.head()

In [None]:
# preview the test frame after encoding and imputation
test_df.head()

We plot a correlation heatmap to check the relationships between the numeric variables and the target.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# numeric features plus the encoded target, used for the correlation heatmap
df_num = train_df[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported']]
dataplot = sns.heatmap(df_num.corr(), annot = True)
# plt.show must be called; the bare name `plt.show` only echoes the function object
plt.show()

We can see that most of the features are non-correlated, as their correlation coefficients lie very close to zero.

**Removing Outliers**

We first make boxplots of the columns containing numeric variables, to get a rough idea of the outliers that are present.

In [None]:
# numeric feature columns; one boxplot is drawn per feature, split by the Transported value
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for feature in num_cols:
    sns.boxplot(data = train_df, x = 'Transported', y = feature)
    plt.show()

In [None]:
# horizontal boxplots of every column of df_num on a single set of axes
sns.boxplot(orient = 'h', data = df_num)
# tilt the x tick labels by 45 degrees
plt.xticks(rotation = 45)
plt.show()

We now run a loop that sets a column-wise condition on the training set and creates a new dataframe containing only the outliers. This dataframe is then used as a filter to create the final, clean training set.

In [None]:
# the outliers are removed with an IQR-style fence; note that the fence is built from the
# 10th and 90th percentiles (not the quartiles), widened by 1.5 times their spread
# a row is an outlier as soon as ANY numeric column falls outside its fence, so the
# per-column conditions are accumulated instead of being overwritten on every iteration
outlier_mask = pd.Series(False, index = train_df.index)

for col in num_cols:
    low_q = train_df[col].quantile(0.10)
    high_q = train_df[col].quantile(0.90)
    spread = high_q - low_q
    lower_bound = low_q - (1.5*spread)
    upper_bound = high_q + (1.5*spread)

    outlier_mask |= (train_df[col] < lower_bound) | (train_df[col] > upper_bound)

temp_train = train_df[outlier_mask]         # dataframe containing the outliers of all numeric columns

# keep only the rows that were never flagged; a boolean mask is the direct form of the
# anti-join that was previously done with an outer merge and the _merge indicator
train_df = train_df[~outlier_mask].reset_index(drop = True)

The datasets are now clean. They can now be used for modelling purposes.

**Modelling**

In [None]:
# importing required libraries for modelling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# splitting the dataframes into train and test sets
# the features are cast to float explicitly: depending on the pandas version the dummy
# columns may be boolean, and mixing those with float columns yields an object array
X = train_df.drop(['Transported', 'PassengerId'], axis = 1).to_numpy(dtype = float)
# targets stay one-dimensional (n_samples,); scikit-learn estimators expect that shape
# and only tolerate a column vector with a warning
y = train_df['Transported'].to_numpy()

# 70/30 hold-out split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

# preparing the test data for prediction (same float cast as the training features)
test_data = test_df.drop('PassengerId', axis = 1).astype(float)

In [None]:
# creating a common function to run all models
def common(model):
    """Cross-validate, fit and evaluate ``model``, then predict the submission data.

    Uses the notebook-level variables ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``test_data``. Prints the 10-fold cross-validation scores on the training
    split, their mean, and the accuracy on the hold-out split.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    The labels predicted by the fitted model for ``test_data.values``.
    """
    # estimators expect 1-D targets; flatten in case a column vector was prepared
    train_labels = np.ravel(y_train)
    holdout_labels = np.ravel(y_test)

    cv = cross_val_score(model, X_train, train_labels, cv = 10)
    print('Cross Validation Score: {}'.format(cv))
    print('Mean Cross Validation Score: {}'.format(cv.mean()))

    # fitting the classifier to the training data
    model.fit(X_train, train_labels)

    # predicting the labels of the test set: y_pred
    y_pred = model.predict(X_test)

    # calculating testing accuracy of model (the former model.score call was dropped:
    # its result was discarded, so it only repeated the prediction work)
    accuracy = accuracy_score(holdout_labels, y_pred)
    print('Accuracy: ', accuracy)

    # creating submission predictions
    predictions = model.predict(test_data.values)

    return predictions

In [None]:
# k-nearest neighbours classifier using 50 neighbours
# NOTE(review): no feature scaling is applied anywhere in the visible notebook; distance-based
# models are sensitive to column scale - confirm this is intended
knn_pred = common(KNeighborsClassifier(n_neighbors = 50))

In [None]:
# logistic regression with default hyperparameters
# NOTE(review): warnings are silenced at the top of the notebook, so a solver convergence
# warning would not be visible here - TODO confirm the fit converges
logreg_pred = common(LogisticRegression())

In [None]:
# XGBoost classifier with default hyperparameters
xgb_pred = common(XGBClassifier())

In [None]:
# random forest with 400 trees; the seed is fixed (same value as the train/test split)
# so that the reported scores and predictions are reproducible across runs
rf_pred = common(RandomForestClassifier(n_estimators = 400, random_state = 42))

We can see that among the above models, LogisticRegression offers the best accuracy and CV score. Hence, we use that for creating our final predictions.

In [None]:
# preparing the submission csv file
# the predictions carry the encoding of the Transported dummy column (0/1 or booleans,
# depending on the pandas version), so they are cast to False / True in a single step
# instead of two chained replace calls that leave an object-typed column
df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': np.asarray(logreg_pred).astype(bool),
})

df.to_csv('submission.csv', index = False)