In [None]:
import pandas as pd
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
data = pd.read_csv('../input/spaceship-titanic/train.csv')
# Every column except the last one (the last column is used as the target `y`).
# .copy() makes this an independent frame, so adding the flag column below
# does not write into a slice of `data` (which raises SettingWithCopyWarning).
data_ind_var = data.iloc[:, :-1].copy()
# Flag used later to split the combined frame back into train and test rows.
data_ind_var['train'] = 1
data_ind_var

In [None]:
# Number of distinct values in each feature column (the last column is left out).
feature_columns = data.columns[:-1]
counts = data[feature_columns].nunique()
counts

In [None]:
# Load the test set and flag its rows with train = 0.
data_test = pd.read_csv('../input/spaceship-titanic/test.csv').assign(train=0)
data_test

In [None]:
# Stack train and test rows so every preprocessing step is applied to both.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the test
# rows repeat index labels already used by the train rows.
data_combined = pd.concat([data_ind_var, data_test], ignore_index=True)
data_combined


In [None]:
# Process

In [None]:
# Split PassengerId on '_' into group / group_count,
# and Cabin on '/' into deck / cabin_number / cabin_side.
passenger_id_parts = data_combined.PassengerId.str.split('_')
cabin_parts = data_combined.Cabin.str.split('/')

data_combined['group'] = passenger_id_parts.str[0]
data_combined['group_count'] = passenger_id_parts.str[1]
data_combined['deck'] = cabin_parts.str[0]
data_combined['cabin_number'] = cabin_parts.str[1]
data_combined['cabin_side'] = cabin_parts.str[2]
data_combined

In [None]:
# PassengerId and Cabin have been split into separate columns above; drop them
# together with Name.
data_combined = data_combined.drop(columns=['PassengerId', 'Cabin', 'Name'])
data_combined

In [None]:
# Convert group and group count to int.
# astype() returns a new frame instead of modifying the existing one, so the
# result has to be assigned back for the conversion to take effect.
convert_dict = {'group': int,
                'group_count': int}

data_combined = data_combined.astype(convert_dict)
data_combined

In [None]:
# Move the train flag column to the end.
# The column is selected by name rather than by position, so this stays correct
# if columns are added or removed upstream, and re-running the cell is a no-op.
columns_list = list(data_combined.columns)
reordered_columns = [col for col in columns_list if col != 'train'] + ['train']
data_combined = data_combined[reordered_columns]
data_combined

In [None]:
# One-hot encode all categorical columns in a single pass; every other column
# is passed through unchanged. Creates an array.
# NOTE(review): the original note says null values are ignored by the encoder -
# confirm for the installed scikit-learn version.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'cabin_side']
one_hot = OneHotEncoder(categories='auto', drop='first')
A = make_column_transformer((one_hot, categorical_columns), remainder="passthrough")

X = A.fit_transform(data_combined)
X

In [None]:
# Fill missing values with the most frequent value of each column.
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Fit and apply in one call; the result is assigned into the existing X.
X[:, :] = imputer.fit_transform(X)
# Wrap the array in a DataFrame for the correlation step that follows.
X_df = pd.DataFrame(X)
X_df

In [None]:
# Cast every column to float so that corr() has numeric data to work with -
# https://stackoverflow.com/questions/22481271/python-pandas-empty-correlation-matrix
X_df = X_df.astype(float)
X_df.dtypes

In [None]:
# Variance inflation factors: the diagonal of the inverse correlation matrix.
# corr() needs a DataFrame with numeric columns, which is why the dummy
# variables were created and cast to float beforehand -
# https://stackoverflow.com/questions/22481271/python-pandas-empty-correlation-matrix
correlation = X_df.corr()
inverse = np.linalg.inv(correlation)
inv_corr_matrix = pd.DataFrame(inverse, index=X_df.columns, columns=X_df.columns)
vif_coefficients = np.diag(inv_corr_matrix.to_numpy())
vif_coefficients

In [None]:
# Positions of the columns whose VIF is above the threshold.
# The last VIF entry belongs to the train flag column, which has to be kept for
# the train/test split, so it is left out of the scan. (Popping the last list
# element instead would discard a real feature index whenever the flag's own
# VIF is not above the threshold, and raise IndexError on an empty list.)
VIF_THRESHOLD = 5
mutlicollinear_column_indices = [
    i for i, vif in enumerate(vif_coefficients[:-1]) if vif > VIF_THRESHOLD
]
mutlicollinear_column_indices

In [None]:
# Keep only the columns that were not flagged as multicollinear.
collinear_columns = X_df.columns[mutlicollinear_column_indices]
X_df = X_df.drop(columns=collinear_columns)
X_df

In [None]:
# Split the combined frame back into train and test rows using the flag
# column (1.0 = train, 0.0 = test), then remove the flag from both parts.
# The flag is the last column of X_df, so it is looked up by position instead
# of by the hard-coded label 29.
train_flag_col = X_df.columns[-1]
is_train = X_df[train_flag_col] == 1.0
is_test = X_df[train_flag_col] == 0.0
# drop() returns new frames here; dropping in place on a filtered slice of X_df
# raises SettingWithCopyWarning.
X_train = X_df.loc[is_train].drop(columns=[train_flag_col])
X_test = X_df.loc[is_test].drop(columns=[train_flag_col])
X_test

In [None]:
# Feature scaling for Logistic Regression
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Learn the scaling statistics from the training rows only ...
X_train = sc_X.fit_transform(X_train)
# ... and reuse them for the test rows. Re-fitting on X_test would scale the two
# sets differently, so the model would be applied to features on a different
# scale than the one it was trained on.
X_test = sc_X.transform(X_test)
X_train

In [None]:
# Target: the last column of the training data, as an array.
target_column = data.iloc[:, -1]
# Multiplying by 1 turns the boolean labels into ints.
y = target_column.values * 1
y

In [None]:
# Fit a logistic regression with default settings on the scaled training rows.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression().fit(X_train, y)
classifier

In [None]:
# Predicted label for every test row.
y_pred = classifier.predict(X=X_test)
y_pred

In [None]:
# R2 and adjusted R2 of the predictions on the training rows.
# NOTE(review): r2_score is a regression metric; for this classifier an
# accuracy-style metric may be more informative - confirm intent.
import sklearn.metrics as metrics

# Prediction on X_train
y_pred_train = classifier.predict(X_train)

N = y.shape[0]        # number of training rows
p = X_train.shape[1]  # number of predictors, read from the data instead of hard-coded
r2 = metrics.r2_score(y, y_pred_train)
# adjusted R2 = 1 - (1 - R2) * (N - 1) / (N - p - 1)
a = (1 - r2)
b = (N - 1) / (N - p - 1)
adj_rsquared = (1 - (a * b))
print("R2 : " , r2)
print("Adjusted-R2 : " , adj_rsquared)
N

In [None]:
# Confusion matrix of the training-row predictions against the true labels;
# used to read off the number of correct predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y, y_pred=y_pred_train)
cm

In [None]:
# The submission needs PassengerId and Transported: reload the test file and
# take its PassengerId column as an array.
# NOTE: `id` shadows the Python builtin; the name is kept because the next cell
# reads it.
data_test = pd.read_csv('../input/spaceship-titanic/test.csv')
id = data_test['PassengerId'].to_numpy()
id

In [None]:
# Build the submission frame column by column. Stacking the ids and the
# predictions into one array (np.vstack) forces both into a single object-dtype
# array, so the prediction column loses its own dtype.
# The predictions are mapped back to booleans because the training label was
# boolean before it was converted to int for fitting.
# NOTE(review): presumably the submission expects True/False like the label
# column of train.csv - confirm against the sample submission.
output = pd.DataFrame({'PassengerId': id,
                       'Transported': y_pred.astype(bool)})
# convert pandas column datatype - https://www.geeksforgeeks.org/convert-the-data-type-of-pandas-column-to-int/
convert_dict = {'PassengerId': str
                 }

output = output.astype(convert_dict)
output

In [None]:
# Write the submission file without the index column.
output.to_csv(path_or_buf='myfile.csv', index=False)
