In [None]:
import matplotlib.pyplot as plt
import zipfile
import pandas as pd
import numpy as np
import seaborn as sb
import tensorflow as tf
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test splits, then display their shapes.
train, test = (
    pd.read_csv('../input/spaceship-titanic/train.csv'),
    pd.read_csv('../input/spaceship-titanic/test.csv'),
)
(train.shape, test.shape)

In [None]:
# Print a per-column summary of `train` (dtype and non-null count) to spot missing values.
train.info()

# Imputation of null Values:
For imputation, we will try to fill the null values by relating them to other available features, rather than simply filling every null entry with the mean or mode, wherever possible, to get better results.

In [None]:
# Sanity check: does the data follow the common-sense expectation that a VIP
# passenger's expenditure is generally higher than a non-VIP passenger's?

# Every column from 'RoomService' through 'VRDeck' (label slice, both ends inclusive).
col = train.loc[:, 'RoomService':'VRDeck'].columns.tolist()
train.groupby('VIP')[col].mean()

### Passengers in CryoSleep are confined to their cabins and kept in suspended animation during the whole voyage, so they won't be able to spend on the services available onboard.

In [None]:
# Mean of each expenditure column in `col`, grouped by the CryoSleep flag.
train.groupby('CryoSleep')[col].mean()

### Two important observations:
* #### As expected, VIP passengers tend to spend more than non-VIP passengers, so the imputation has to respect this relation.
* #### If a person is in CryoSleep, his/her expenditure is equal to zero.

In [None]:
# Passengers flagged as being in CryoSleep get every expenditure column set to zero.
for frame in (train, test):
  frame.loc[frame['CryoSleep'] == True, col] = 0.0

In [None]:
# Fill missing expenditure values with the mean of the passenger's own VIP group.
# Each frame uses its own group means; rows whose VIP flag is itself missing
# match neither group and are left untouched here.
for c in col:
  for val in [True, False]:
    for frame in (train, test):
      in_group = frame['VIP'] == val
      group_mean = frame.loc[in_group, c].mean()
      frame.loc[in_group, c] = frame.loc[in_group, c].fillna(group_mean)

In [None]:
# Count of passengers per VIP flag, split by HomePlanet.
sb.countplot(data=train, x='VIP', hue='HomePlanet')

In [None]:
# Shape of the subset of rows whose Destination is 'TRAPPIST-1e' (first element is the row count).
train[train['Destination']=='TRAPPIST-1e'].shape

Observation to fill the HomePlanet:
* If a person is not a VIP, the probability of coming from Earth is high; otherwise he/she is most likely from Europa.

Observation to fill the Destination:
* Around 6000 people have 'TRAPPIST-1e' as their destination, so we will impute the null values with it as well.

In [None]:
def dest_filler(data):
  """Fill missing 'Destination' entries with 'TRAPPIST-1e'.

  The column is overwritten on the frame that was passed in, and that same
  frame is returned.
  """
  destination = data['Destination']
  data['Destination'] = destination.where(destination.notna(), 'TRAPPIST-1e')
  return data

# Apply the destination fill to both splits.
train, test = dest_filler(train), dest_filler(test)

In [None]:
def planet_Filler(data):
  """Fill missing 'HomePlanet' entries based on the VIP flag.

  Non-VIP rows get 'Earth' and VIP rows get 'Europa'. Rows whose VIP flag is
  neither True nor False (e.g. missing) are left unchanged. The frame is
  modified in place and returned.
  """
  for is_vip, planet in ((False, 'Earth'), (True, 'Europa')):
    rows = data['VIP'] == is_vip
    data.loc[rows, 'HomePlanet'] = data.loc[rows, 'HomePlanet'].fillna(planet)
  return data

# Apply the home-planet fill to both splits.
train, test = planet_Filler(train), planet_Filler(test)

In [None]:
# Box plot of the Age column, used to look for outliers before imputing.
sb.boxplot(train['Age'])

So, there are some outliers in the data. Before imputing the Age column, we will exclude the outliers and then impute with the mean of the remaining values.

In [None]:
def age_filler(data):
  """Fill missing 'Age' values with the mean of the ages below 61.

  Ages of 61 and above are excluded only from the mean computation; they stay
  in the data. The frame is modified in place and returned.
  """
  ages = data['Age']
  typical_age = ages[ages < 61].mean()
  data['Age'] = ages.fillna(typical_age)
  return data

# Apply the age fill to both splits.
train, test = age_filler(train), age_filler(test)

In [None]:
# Preview the first rows of `train` after the imputations above.
train.head()

In [None]:
def seperator_PassengerId(data):
  """Split 'PassengerId' at its first underscore into two integer columns.

  The part before the underscore is stored as 'RoomNo' and the part after it
  as 'PassengerNo'; both are added to the frame that was passed in. The
  returned frame is a copy without 'PassengerId' and 'Name'.
  """
  parts = data["PassengerId"].str.split("_", n = 1, expand = True)
  for position, new_column in enumerate(("RoomNo", "PassengerNo")):
    data[new_column] = parts[position].astype(int)
  return data.drop(columns=['PassengerId', 'Name'])

# Split the passenger id on both splits.
train, test = seperator_PassengerId(train), seperator_PassengerId(test)

In [None]:
# Replace PassengerNo with the number of rows that share the passenger's RoomNo.
#
# This is done in one vectorized step per frame. The previous row-by-row loop
# used chained assignment (`frame['PassengerNo'][i] = ...`), which does not write
# back to the frame under pandas Copy-on-Write, and it rescanned the whole
# column for every row.
train['PassengerNo'] = train['RoomNo'].map(train['RoomNo'].value_counts())
test['PassengerNo'] = test['RoomNo'].map(test['RoomNo'].value_counts())

#### Now let's plot the number of people sharing a room in each VIP category (the RoomNo column itself is dropped later, when the feature matrix is built).

In [None]:
# RoomNo is deliberately left in `train` here: this plot does not need it removed,
# and it is dropped later when the feature matrix is built. The earlier bare
# `train.drop(['RoomNo'], axis=1)` discarded its result and so had no effect.
sb.countplot(data=train, x = 'PassengerNo', hue='VIP')
plt.show()

#### It is clear that VIP passengers generally do not share rooms.

In [None]:
# Bar chart of the number of remaining null values per column in `train`.
train.isnull().sum().plot.bar()
plt.show()

In [None]:
# Count of passengers per Transported outcome, split by the CryoSleep flag.
sb.countplot(data=train, x='Transported', hue='CryoSleep')

### Here, we can observe that a person in CryoSleep is most likely to be transported, and vice versa.

#### Naive filler, in case we were unable to address some imputation cases above.

In [None]:
# Null counts per column in `train`, checked right before the naive fallback imputation.
train.isnull().sum().plot.bar()

We will fill the 'CryoSleep' and 'VIP' columns with their mode values.
Note:
The naive imputation is run on all columns so that the few cases missed above are handled now, and we can train our model without any errors.

In [None]:
# Fallback imputation for whatever nulls the targeted fills above left behind.
# Object and bool columns get their mode; all other columns get their mean.
# Each frame is filled from its own statistics.
#
# The loop variable is named `column` so it does not overwrite the `col` list of
# expenditure columns defined earlier; clobbering it would make re-running the
# earlier imputation cells act on the wrong column.
for column in train.columns:
  if column == 'Transported':
    # The target exists only in `train` and must not be imputed.
    continue
  if train[column].dtype=='object' or train[column].dtype=='bool':
    train[column] = train[column].fillna(train[column].mode()[0])
    test[column] = test[column].fillna(test[column].mode()[0])
  else:
    train[column] = train[column].fillna(train[column].mean())
    test[column] = test[column].fillna(test[column].mean())

To avoid train-test contamination, it is best practice to take the mean or mode used for filling from the train data when filling the train data, and likewise from the test data when filling the test data.

### From the entries of the Cabin column it is clearly visible that each one is a combination of three values, so let's separate them and make three features out of it.

In [None]:
def seperator_Cabin(data):
  """Split 'Cabin' on '/' into three feature columns.

  The first piece is stored as 'F1', the second as the integer column 'F2',
  and the third as 'F3'; all three are added to the frame that was passed in.
  The returned frame is a copy without 'Cabin'. The integer cast requires
  every cabin value to be present and well-formed.
  """
  pieces = data["Cabin"].str.split("/", n = 2, expand = True)
  first, second, third = pieces[0], pieces[1], pieces[2]
  data["F1"] = first
  data["F2"] = second.astype(int)
  data["F3"] = third
  return data.drop(columns=['Cabin'])

# Split the cabin string on both splits.
train, test = seperator_Cabin(train), seperator_Cabin(test)

In [None]:
# Preview two rows to confirm the new F1 / F2 / F3 columns.
train.head(2)

## Label Encoding

In [None]:
# Label-encode every object/bool feature column. The encoder is fitted on the
# union of the values seen in train and test, so a category that appears only in
# test does not make `transform` fail.
#
# The loop variable is named `column` so it does not overwrite the `col` list of
# expenditure columns defined earlier in the notebook.
for column in train.columns:
  if column == 'Transported':
    # The target is converted separately and is absent from `test`.
    continue
  if train[column].dtype=='object' or train[column].dtype=='bool':
    le = LabelEncoder()
    le.fit(list(set(train[column]) | set(test[column])))
    train[column] = le.transform(train[column])
    test[column] = le.transform(test[column])

# Adding Feature:
#### LeasureBill - Sum of an individual's bills across all the services used on board.



In [None]:
# Total on-board spend per passenger: the sum of the five service columns.
for frame in (train, test):
  frame['LeasureBill'] = (
      frame['RoomService']
      + frame['FoodCourt']
      + frame['ShoppingMall']
      + frame['Spa']
      + frame['VRDeck']
  )

#### Checking whether there are some highly correlated features in the finally prepared data for training.

In [None]:
# Boolean heatmap marking feature pairs whose correlation exceeds 0.8.
fig, ax = plt.subplots(figsize=(15,15))
highly_correlated = train.corr() > 0.8
sb.heatmap(highly_correlated, annot=True, ax=ax)
plt.show()

In [None]:
features = train.drop(['Transported', 'RoomNo'], axis=1)
target = train.Transported.astype(int)


# Split first, so the validation rows stay unseen by every fitted preprocessing step.
X_train, X_val, Y_train, Y_val = train_test_split(features, target, test_size=0.1, random_state=10)


# The features are on different scales, so standardizing them may help the
# training process run smoothly. The scaler is fitted on the training split only;
# fitting it on all rows before splitting would leak the validation rows' mean
# and variance into training and make the validation scores optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

## Training Different Models:
* `Logistic Regression`
* `XGBClassifier`
* `SVM` with a `Radial Basis Function kernel` to learn a complex non-linear function for the classification task.

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier with scikit-learn's default settings.
model = LogisticRegression()
# The header now names the model that is actually trained here
# (it previously printed 'Support Vector Machine:').
print('Logistic Regression:')
model.fit(X_train, Y_train)

print('Train Accuracy : ',metrics.accuracy_score(Y_train, model.predict(X_train)))
print('Validation Accuracy : ',metrics.accuracy_score(Y_val, model.predict(X_val)))

In [None]:
import xgboost as xgb

# `eval_metric` and `early_stopping_rounds` are set on the estimator rather than
# passed to `fit()`: the `fit()` keywords were deprecated in xgboost 1.6 and
# removed in 2.0, where passing them raises a TypeError.
model_xgb = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=8, gamma=5,
                              eval_metric='logloss',
                              early_stopping_rounds=10)

# Training stops once validation logloss has not improved for 10 rounds.
# NOTE(review): the same validation split also drives early stopping, so the
# validation accuracy printed below is somewhat optimistic.
model_xgb.fit(X_train, Y_train,
              verbose=False,
              eval_set=[(X_val, Y_val)])

print('XGBoost Classifier:')
print('Training Accuracy : ',metrics.accuracy_score(Y_train, model_xgb.predict(X_train)))
print('Validation Accuracy : ',metrics.accuracy_score(Y_val, model_xgb.predict(X_val)))

In [None]:
from sklearn import svm

# Support vector classifier with an RBF kernel, to capture non-linear boundaries.
model_svm = svm.SVC(kernel="rbf")
model_svm.fit(X_train, Y_train)

train_pred = model_svm.predict(X_train)
val_pred = model_svm.predict(X_val)

print('Support Vector Machine:')
print('Training Accuracy : ', metrics.accuracy_score(Y_train, train_pred))
print('Validation Accuracy : ', metrics.accuracy_score(Y_val, val_pred))

## Evaluation Arguments:
* #### SVM and Logistic Regression perform quite similarly on the validation data.
* #### XGBClassifier gives higher performance than SVM and Logistic Regression, but it seems to be overfitting the training data.

# Neural Network based model:
What motivates me to train a simple neural network model is that, while using `svm`, I observed that it performs similarly to *logistic regression* with either **`kernel='linear'` or `'poly'`**, but performs better with **`kernel='rbf'`**.

<br>Hence, I would like to give it a try to train a neural network on this.<br><br>
PS: My current high score comes from this model.

In [None]:
import tensorflow as tf
from tensorflow import keras
# `layers` comes from the same `tensorflow.keras` package as `keras.Sequential`,
# rather than from a separately installed `keras`, so the two cannot be mismatched.
from tensorflow.keras import layers

# Seed TensorFlow so weight initialization and shuffling are repeatable.
tf.random.set_seed(10)

# One hidden layer with a sigmoid output for binary classification. The input
# width is read from the training matrix instead of being hard-coded, so the
# model stays valid if the feature set changes.
model_nn = keras.Sequential([
      layers.Dense(64, activation='relu',input_shape=[X_train.shape[1]]),
      layers.Dense(1, activation='sigmoid')
])

model_nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy']
)

In [None]:
# Train for 10 epochs with mini-batches of 32 samples. The validation split is
# evaluated after each epoch, and the per-epoch metrics are kept in `history`.
history = model_nn.fit(X_train, Y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_data = (X_val, Y_val))

In [None]:
# Plot the training and validation curves, one figure per metric pair.
history_df = pd.DataFrame(history.history)
for curves in (['loss','val_loss'], ['binary_accuracy','val_binary_accuracy']):
  history_df[curves].plot()
plt.show()

### Predictions for the test data.

In [None]:
# Build the test matrix without reassigning `test`, so this cell can be re-run
# safely. Overwriting `test` with the dropped frame made a second run fail with a
# KeyError, because 'RoomNo' was already gone.
test_features = test.drop(['RoomNo'], axis=1)
# Reuse the scaler fitted on the training data; it is not refitted on test.
test_set = scaler.transform(test_features)

In [None]:
# Write the XGBoost predictions using the sample submission as a template.
ss = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
xgb_labels = model_xgb.predict(test_set)
ss['Transported'] = xgb_labels.astype(bool)
ss.to_csv('Submission_xgb.csv', index=False)
ss.head(3)

In [None]:
# The network outputs one sigmoid probability per row with shape (n_rows, 1).
# Threshold at 0.5 and flatten to 1-D, so the column assignment does not depend
# on pandas accepting a single-column 2-D array.
nn_probabilities = model_nn.predict(test_set)
ss['Transported'] = (nn_probabilities > 0.5).ravel()
ss.to_csv('Submission_nn.csv', index=False)
ss.head(3)

In [None]:
# Write the SVM predictions, reusing the same submission frame.
svm_labels = model_svm.predict(test_set)
ss['Transported'] = svm_labels.astype(bool)
ss.to_csv('Submission_svm.csv', index=False)
ss.head(3)