In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test splits.
train_path = '/kaggle/input/spaceship-titanic/train.csv'
test_path = '/kaggle/input/spaceship-titanic/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

Destination - The planet the passenger will be debarking to.

Age - The age of the passenger.

VIP - Whether the passenger has paid for special VIP service during the voyage.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

Name - The first and last names of the passenger.

Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics of the training frame.
train_df.describe()

In [None]:
# Number of missing values per column in the training frame.
train_df.isna().sum()

In [None]:
# Class balance of the target column.
transported_counts = train_df['Transported'].value_counts()
transported_counts.plot(kind='bar', color=['green','red'])

In [None]:
# Share of passengers per destination.
destination_counts = train_df['Destination'].value_counts()
destination_counts.plot(kind='pie')

In [None]:
# Share of passengers per home planet.
home_planet_counts = train_df['HomePlanet'].value_counts()
home_planet_counts.plot(kind='pie')

In [None]:
# Distribution of passenger ages.
train_df['Age'].plot(kind='hist')

In [None]:
# Share of VIP passengers that were transported, plus the VIP/Transported counts.
import seaborn as sns

is_vip = train_df['VIP'] == True
is_transported = train_df['Transported'] == True
vip = int(is_vip.sum())
vip_transported = int((is_vip & is_transported).sum())
print('VIP transported:',round((vip_transported / vip)*100,3),'%')
sns.countplot(x=train_df['VIP'], hue=train_df['Transported'])

In [None]:
# Transported vs. not transported, split by CryoSleep status.
sns.countplot(data=train_df, x='CryoSleep', hue='Transported')

### Filling NA values

- Missing values in numerical columns are filled with:
    - the mean for age
    - 0 for the pricing (amenity billing) columns
- Missing values in categorical columns are filled with the most frequently occurring value

In [None]:
# pricing: a missing bill is treated as "nothing spent".
# The filled columns are assigned back to the frame: calling
# fillna(..., inplace=True) on a column selection is deprecated and does not
# modify the parent frame when pandas Copy-on-Write is enabled.
pricing_cols = ['RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck']
train_df[pricing_cols] = train_df[pricing_cols].fillna(0)
test_df[pricing_cols] = test_df[pricing_cols].fillna(0)

# age: both frames are imputed with the TRAINING mean so the test set is
# preprocessed with the same statistic the models are trained on.
age_mean = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(age_mean)
test_df['Age'] = test_df['Age'].fillna(age_mean)

def fill_with_most_occuring(df, categories):
    """Fill missing values in each listed column with that column's most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; its columns are updated in place.
    categories : iterable of str
        Names of the columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for category in categories:
        counts = df[category].value_counts()
        if counts.empty:
            # No observed value exists to impute with; leave the column as is.
            continue
        top = counts.index[0]
        # Assign the result back instead of fillna(inplace=True) on a column
        # selection, which is deprecated and does not update ``df`` when
        # pandas Copy-on-Write is enabled.
        df[category] = df[category].fillna(top)

    return df

# Impute the categorical columns of both frames with their most frequent value.
category_cols = ['HomePlanet','CryoSleep','Destination','VIP']
train_df, test_df = (
    fill_with_most_occuring(frame, category_cols)
    for frame in (train_df, test_df)
)

In [None]:
# Number of missing values per column in the training frame.
train_df.isna().sum()

In [None]:
# Number of missing values per column in the test frame.
test_df.isna().sum()

In [None]:
# handling Cabin column -- which is of the form deck/num/side

def split_cabin(df):
    """Replace the ``Cabin`` column (``deck/num/side``) with ``Deck``, ``Num`` and ``Side``.

    Missing ``Num`` values become 0; missing ``Deck`` / ``Side`` values are
    filled with the most frequent value of the respective column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Cabin`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without ``Cabin`` and with the three new columns.
    """
    df[['Deck','Num','Side']] = df['Cabin'].str.split('/', expand=True)
    # Convert the cabin number from str to float, then fill missing numbers with 0.
    # The filled column is assigned back: fillna(inplace=True) on a column
    # selection is deprecated and does not update ``df`` under Copy-on-Write.
    df['Num'] = df['Num'].astype(float).fillna(0)
    for category in ['Deck','Side']: # fill with most occurring
        counts = df[category].value_counts()
        if counts.empty:
            # No observed value exists to impute with; leave the column as is.
            continue
        df[category] = df[category].fillna(counts.index[0])
    df.drop('Cabin', axis=1, inplace=True)
    return df

# Expand Cabin into Deck / Num / Side on both frames.
train_df, test_df = (split_cabin(frame) for frame in (train_df, test_df))

In [None]:
# dropping unnecessary cols
# (the test frame keeps PassengerId here; only Name is removed from it)
train_df.drop(columns=['Name', 'PassengerId'], inplace=True)
test_df.drop(columns=['Name'], inplace=True)

In [None]:
# Transported vs. not transported, split by cabin side.
sns.countplot(data=train_df, x='Side', hue='Transported')

In [None]:
# Transported vs. not transported, split by cabin deck.
sns.countplot(data=train_df, x='Deck', hue='Transported')

In [None]:
# Preview the first rows of the test frame.
test_df.head()

### Label Encoding

In [None]:
# Names of the object / bool typed columns of the training frame.
non_numerical_categories = (
    train_df.select_dtypes(include=['object','bool']).columns.tolist()
)
print(non_numerical_categories)

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(train_df, test_df, categories):
    """Label-encode the given columns of both frames with encoders fitted on train.

    For every column in ``categories`` an encoder is fitted on the training
    column, its classes are printed, and the column is replaced by its encoded
    values in ``train_df`` and ``test_df``. Returns both frames as a tuple.
    """
    for column in categories:
        encoder = LabelEncoder().fit(train_df[column])
        print(encoder.classes_)
        for frame in (train_df, test_df):
            frame[column] = encoder.transform(frame[column])

    return train_df, test_df

# Columns that are label-encoded in both frames.
label_encoded_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
train_df, test_df = encode_labels(train_df, test_df, label_encoded_cols)

#### Classes per column (the index of each class is its encoded value)

HomePlanet: ['Earth' 'Europa' 'Mars']

CryoSleep: [False  True]

Destination: ['55 Cancri e' 'PSO J318.5-22' 'TRAPPIST-1e']

VIP: [False  True]

Deck: ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']

Side: ['P' 'S']

In [None]:
# Turn the boolean target into 1 / 0.
transported_codes = {True: 1, False: 0}
train_df['Transported'] = train_df['Transported'].map(transported_codes)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
import matplotlib.pyplot as plt

# Annotated correlation matrix of the training frame.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(train_df.corr(), annot=True, ax=ax)

In [None]:
# train + test scaling
#
# Every numerical column is divided by its maximum in the TRAINING frame, and
# the same divisor is applied to the test frame, so a given raw value maps to
# the same scaled value in both. The maxima are captured before the training
# frame is rescaled, which is why both frames are handled in this one cell.
numerical_cols = ['Age','RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck']
numerical_max = train_df[numerical_cols].max()
train_df[numerical_cols] = train_df[numerical_cols] / numerical_max
test_df[numerical_cols] = test_df[numerical_cols] / numerical_max

In [None]:
# Keep the test ids for the submission files, then build the model inputs.
test_passengerIds = test_df['PassengerId'].values
y = train_df['Transported'].values
feature_cols = [col for col in train_df.columns if col != 'Transported']
X = train_df[feature_cols].values
# Select the test features by name so their column order matches the
# training features exactly (a missing column raises a KeyError).
X_test = test_df[feature_cols].values
# X_train / y_train do not exist yet at this point (they are created by the
# train/validation split below), so report the full arrays here.
print(X.shape, y.shape)
print(X_test.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation (fixed seed).
split = train_test_split(X, y, test_size=0.2, random_state=1357)
X_train, X_val, y_train, y_val = split

for features, target in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, target.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# The random forest is seeded (same seed as the train/validation split) so
# that its scores and predictions are reproducible across runs.
rfc = RandomForestClassifier(random_state=1357)
svc = SVC(gamma='auto')
knn = KNeighborsClassifier()

# Fit every model on the training split.
rfc.fit(X_train,y_train)
svc.fit(X_train,y_train)
knn.fit(X_train,y_train)

# Predictions on the held-out validation split.
y_val_pred_rfc = rfc.predict(X_val)
y_val_pred_svc = svc.predict(X_val)
y_val_pred_knn = knn.predict(X_val)

In [None]:
from sklearn.metrics import accuracy_score

# Training and validation accuracy for each fitted model.
evaluated_models = (
    ("Random Forests", rfc, y_val_pred_rfc),
    ("Support Vector Machines", svc, y_val_pred_svc),
    ("K-Nearest Neighbors", knn, y_val_pred_knn),
)
for title, model, val_pred in evaluated_models:
    print(title)
    print(f'training accuracy: {accuracy_score(y_train, model.predict(X_train))}')
    print(f"validation accuracy: {accuracy_score(y_val, val_pred)}")

In [None]:
# predictions of every model on the test features
y_test_pred_rfc, y_test_pred_svc, y_test_pred_knn = (
    model.predict(X_test) for model in (rfc, svc, knn)
)

In [None]:
# submission df


def _make_submission(predictions):
    """Pair the test passenger ids with one model's predictions, cast to bool."""
    frame = pd.DataFrame({'PassengerId': test_passengerIds, 'Transported': predictions})
    frame['Transported'] = frame['Transported'].astype(bool)
    return frame


# Each file carries its own model's predictions. Previously the SVC and KNN
# frames had their 'Transported' column overwritten with the random-forest
# predictions, so all three files were identical.
submission_df = _make_submission(y_test_pred_rfc)
submission_df.to_csv('submission_rfc.csv', index=False)

submission_df2 = _make_submission(y_test_pred_svc)
submission_df2.to_csv('submission_svc.csv', index=False)

submission_df3 = _make_submission(y_test_pred_knn)
submission_df3.to_csv('submission_knn.csv', index=False)
