In [None]:
import pandas as pd
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Labelled training data. The last column is split off here and is read
# again later in the notebook as the target `y`.
data = pd.read_csv('../input/spaceship-titanic/train.csv')

# Independent variables only. `.copy()` makes this an independent frame, so
# adding the marker column below does not write into a slice of `data`
# (which would raise SettingWithCopyWarning).
data_ind_var = data.iloc[:, :-1].copy()
# Marker column: 1 = row comes from the training file.
data_ind_var['train'] = 1
data_ind_var

In [None]:
# Number of distinct values in each column of `data`, last column excluded.
feature_columns = data.columns[:-1]
counts = data[feature_columns].nunique()
counts

In [None]:
# Load the test file and append the marker column (0 = row comes from the
# test file), mirroring the `train` = 1 marker on the training features.
data_test = pd.read_csv('../input/spaceship-titanic/test.csv').assign(train=0)
data_test

In [None]:
# Stack the training features on top of the test rows so that the feature
# engineering below is applied to both at once. The `train` column keeps
# track of which file each row came from.
frames_to_stack = [data_ind_var, data_test]
data_combined = pd.concat(frames_to_stack)
data_combined

In [None]:
# Process


In [None]:
# Derive new features from the two composite columns:
#   PassengerId split on '_' -> group, group_count
#   Cabin       split on '/' -> deck, cabin_number, cabin_side
# Each column is split once and the pieces are picked out by position.
passenger_id_parts = data_combined['PassengerId'].str.split('_')
cabin_parts = data_combined['Cabin'].str.split('/')

for position, column in enumerate(['group', 'group_count']):
    data_combined[column] = passenger_id_parts.str[position]
for position, column in enumerate(['deck', 'cabin_number', 'cabin_side']):
    data_combined[column] = cabin_parts.str[position]
data_combined

In [None]:
# PassengerId and Cabin have been split into separate features above;
# they are removed together with Name.
data_combined = data_combined.drop(columns=['PassengerId', 'Cabin', 'Name'])
data_combined

In [None]:
# Convert group and group_count (string pieces of PassengerId) to int
convert_dict = {'group': int,
                'group_count': int}

# `astype` returns a new DataFrame; without assigning it back the conversion
# is silently discarded and both columns stay strings.
data_combined = data_combined.astype(convert_dict)
data_combined

In [None]:
# Move train column to the end.
# The column is selected by name rather than by a hard-coded position, so the
# reordering stays correct if columns are added or removed upstream.
columns_list = list(data_combined.columns)
data_combined = data_combined[[name for name in columns_list if name != 'train'] + ['train']]
data_combined

In [None]:
# Split the combined frame back into training and test features using the
# `train` marker, then discard the marker. `drop` is chained (not in-place) so
# each result is a new frame instead of a mutated slice of `data_combined`,
# which avoids SettingWithCopyWarning.
X_train = data_combined[data_combined['train'] == 1].drop(columns=['train'])
X_test = data_combined[data_combined['train'] == 0].drop(columns=['train'])
X_test

In [None]:
# Convert all categorical variables to dummy (one-hot) columns; every other
# column is passed through unchanged. The result is an array, not a DataFrame.
# NOTE(review): the original comment says null values are ignored; how NaN is
# encoded depends on the installed scikit-learn version - confirm.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'cabin_side']
A = make_column_transformer(
    (OneHotEncoder(categories='auto', drop='first'), categorical_columns),
    remainder="passthrough")

# Learn the categories from the training rows only ...
X_train = A.fit_transform(X_train)

# ... and reuse that fitted encoding for the test rows. Refitting on the test
# set could produce a different number or order of dummy columns than the
# model was trained on.
X_test = A.transform(X_test)
X_test

In [None]:
# # Frequency Encoding
# # 'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'cabin_side'
# hp_map = X_train['HomePlanet'].value_counts().to_dict()
# X_train['HomePlanet'] = X_train['HomePlanet'].map(hp_map)
# cs_map = X_train['CryoSleep'].value_counts().to_dict()
# X_train['CryoSleep'] = X_train['CryoSleep'].map(cs_map)
# ds_map = X_train['Destination'].value_counts().to_dict()
# X_train['Destination'] = X_train['Destination'].map(ds_map)
# vip_map = X_train['VIP'].value_counts().to_dict()
# X_train['VIP'] = X_train['VIP'].map(vip_map)
# deck_map = X_train['deck'].value_counts().to_dict()
# X_train['deck'] = X_train['deck'].map(deck_map)
# cabin_map = X_train['cabin_side'].value_counts().to_dict()
# X_train['cabin_side'] = X_train['cabin_side'].map(cabin_map)
# X_train

In [None]:
# # 'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'cabin_side'
# # Frequency Encoding - https://medium.com/analytics-vidhya/how-to-handle-categorical-features-ab65c3cf498e
# hp_map = X_test['HomePlanet'].value_counts().to_dict()
# X_test['HomePlanet'] = X_test['HomePlanet'].map(hp_map)
# cs_map = X_test['CryoSleep'].value_counts().to_dict()
# X_test['CryoSleep'] = X_test['CryoSleep'].map(cs_map)
# ds_map = X_test['Destination'].value_counts().to_dict()
# X_test['Destination'] = X_test['Destination'].map(ds_map)
# vip_map = X_test['VIP'].value_counts().to_dict()
# X_test['VIP'] = X_test['VIP'].map(vip_map)
# deck_map = X_test['deck'].value_counts().to_dict()
# X_test['deck'] = X_test['deck'].map(deck_map)
# cabin_map = X_test['cabin_side'].value_counts().to_dict()
# X_test['cabin_side'] = X_test['cabin_side'].map(cabin_map)
# X_test

In [None]:
# # Label Encoding
# from sklearn.preprocessing import LabelEncoder
# label_cols = ["HomePlanet", "CryoSleep","deck", "Destination" ,"VIP", "cabin_side"]
# def label_encoder(train,test,columns):
#     for col in columns:
#         train[col] = train[col].astype(str)
#         test[col] = test[col].astype(str)
#         train[col] = LabelEncoder().fit_transform(train[col])
#         test[col] =  LabelEncoder().fit_transform(test[col])
#     return train, test

# X_train ,X_test = label_encoder(X_train,X_test ,label_cols)
# X_train

In [None]:
# Filling in missing values: every NaN is replaced by the most frequent value
# of its column.
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Fit on the training rows only. Assigning through [:, :] writes the imputed
# values back into the existing array, keeping its dtype.
X_train = np.array(X_train)
X_train[:, :] = imputer.fit_transform(X_train)
X_train_df = pd.DataFrame(X_train)

# Fill the test rows with the statistics learned from the training rows.
# Refitting the imputer on the test set would fill test rows with
# test-derived values instead of the ones used during training.
X_test = np.array(X_test)
X_test[:, :] = imputer.transform(X_test)
X_test_df = pd.DataFrame(X_test)
X_test_df

In [None]:
# Cast every column to float so that .corr() in the next step has numeric
# columns to work with - see
# https://stackoverflow.com/questions/22481271/python-pandas-empty-correlation-matrix
numeric_dtype = 'float'
X_train_df = X_train_df.astype(numeric_dtype)
X_test_df = X_test_df.astype(numeric_dtype)
X_test_df.dtypes

In [None]:
# Variance inflation factors: the diagonal of the inverse of the feature
# correlation matrix. .corr() needs a DataFrame, not a bare array.
correlation = X_train_df.corr()
inv_corr_matrix = pd.DataFrame(
    np.linalg.inv(correlation),
    index=X_train_df.columns,
    columns=X_train_df.columns,
)
vif_coefficients = np.diag(inv_corr_matrix.to_numpy())
vif_coefficients

In [None]:
# Positions of the columns whose VIF exceeds 5. The same positions are later
# applied to the test data, i.e. the test data is assumed to show the same
# multicollinearity as the training data.
mutlicollinear_column_indices = [
    position
    for position, vif in enumerate(vif_coefficients)
    if vif > 5
]
# Disabled in the original: remove last (train) index from the list
# mutlicollinear_column_indices.pop()
mutlicollinear_column_indices

In [None]:
# Keep only the non-collinear variables: remove the flagged column positions
# from both the training and the test frame.
# (The original note marks this step as "not needed".)
collinear_train_labels = X_train_df.columns[mutlicollinear_column_indices]
X_train_df = X_train_df.drop(columns=collinear_train_labels)

collinear_test_labels = X_test_df.columns[mutlicollinear_column_indices]
X_test_df = X_test_df.drop(columns=collinear_test_labels)
X_test_df

In [None]:
# Feature scaling
# Not needed for a random forest - https://stackoverflow.com/questions/8961586/do-i-need-to-normalize-or-scale-data-for-randomforest-r-package

In [None]:
# Target vector: the last column of the training file. Multiplying by 1
# converts the boolean values to integers.
target_column = data.columns[-1]
y = data[target_column].to_numpy() * 1
y

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 500 trees. A fixed random_state makes the fitted model,
# and therefore the predictions and the submission file, reproducible across
# kernel restarts.
classifier = RandomForestClassifier(n_estimators = 500, random_state = 42)
classifier.fit(X_train_df, y)

In [None]:
# Predicted class for every row of the test features.
y_pred = classifier.predict(X=X_test_df)
y_pred

In [None]:
# R2 and adjusted R2 of the predictions on X_train (in-sample: the same rows
# the classifier was fitted on).
import sklearn.metrics as metrics

y_pred_train = classifier.predict(X_train_df)

N = y.shape[0]            # number of observations
# Number of predictors, taken from the data actually fed to the model rather
# than a hard-coded constant, so it stays correct when columns are dropped
# or added upstream.
p = X_train_df.shape[1]
r2 = metrics.r2_score(y, y_pred_train)
# Adjusted R2 = 1 - (1 - R2) * (N - 1) / (N - p - 1)
a = (1 - r2)
b = (N - 1) / (N - p - 1)
adj_rsquared = (1 - (a * b))
print("R2 : " , r2)
print("Adjusted-R2 : " , adj_rsquared)
N


In [None]:
# Confusion matrix of the in-sample predictions: counts of correct and
# incorrect predictions per class.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y, y_pred=y_pred_train)
cm

In [None]:
# The submission needs PassengerId and Transported. PassengerId was dropped
# from the combined frame earlier, so the raw test file is read again.
# NOTE: `id` shadows the Python builtin; the name is kept because the next
# cell reads it.
data_test = pd.read_csv('../input/spaceship-titanic/test.csv')
id = data_test['PassengerId'].to_numpy()
id

In [None]:
# Assemble the submission frame column by column. Building it from a dict
# keeps each column's own dtype; stacking ids and predictions into a single
# ndarray first relies on implicit dtype promotion and would turn every
# prediction into True if the ids were ever a fixed-width string array.
output = pd.DataFrame({'PassengerId': id, 'Transported': y_pred})
# convert pandas column datatype - https://www.geeksforgeeks.org/convert-the-data-type-of-pandas-column-to-int/
convert_dict = {'PassengerId': str, 'Transported': bool
                 }

output = output.astype(convert_dict)
# print(output.dtypes)
output

In [None]:
# Write the submission file without the DataFrame index.
submission_path = 'submission.csv'
output.to_csv(submission_path, index=False)
# Original note: "Change n to 500?!"