In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
#import sklearn.preprocessing.OneHotEncoder #to encode categorical variables
#import sklearn.linear_model.LogisticRegression #to perform logistic regression
#import sklearn.model_selection.train_test_split #to split our data to perform training and testing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
from sklearn.linear_model import LogisticRegression #to perform logistic regression
from sklearn.model_selection import train_test_split #to split our data to perform training and testing

In [3]:
X = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
X = X.drop(columns=["Name"]) #as name should have no bearing on a person's survival. If it does, something is wrong with this competition
X

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [4]:
#checking to see categorical vs numerical cols
X.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object

In [5]:
#checking for nulls
X.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [6]:
#converting bool to 0 or 1
X["VIP"] = (X["VIP"] == True).astype(float)
X["CryoSleep"] = (X["CryoSleep"] == True).astype(float)

In [7]:
y = X["Transported"] #this is what we are looking for
X = X.drop(columns=["Transported"]) #remove y from X

In [8]:
#do training and test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [9]:
# getting all preprocessors
from sklearn.preprocessing import OrdinalEncoder, StandardScaler #to encode categorical variables
from sklearn.impute import SimpleImputer #to impute numerical values
from sklearn.compose import ColumnTransformer #to perform ops of columns seperately
from sklearn.pipeline import Pipeline #this ties the entire thing together to define a pipeline for all processing steps

In [10]:
#scaleing should be done after imputation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
#encode first then impute
#setting unknown value to be an arbitrary one
#very ineffecient
categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=9999)),
    ('imputer', SimpleImputer(strategy='constant'))
])

In [11]:
#splitting the categorical and numerical column names into different lists
numeric_columns = X.select_dtypes(np.float64).columns
categorical_columns = X.select_dtypes(object).columns
numeric_columns, categorical_columns

(Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
        'Spa', 'VRDeck'],
       dtype='object'),
 Index(['PassengerId', 'HomePlanet', 'Cabin', 'Destination'], dtype='object'))

In [12]:
#specifying a preprocessor
preprocessor = ColumnTransformer(
transformers = [
    ('numeric',numeric_transformer,numeric_columns),
    ('categorical', categorical_transformer, categorical_columns)
])

In [13]:
#defining the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor', LogisticRegression(max_iter=500))
])

In [14]:
logistic_reg_model = pipeline.fit(X_train,y_train)
print(logistic_reg_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck'],
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                  unknown_value=9999)),
                                    

In [15]:
#checking how well we did
from sklearn.metrics import r2_score
predictions = logistic_reg_model.predict(X_test)
predictions

array([ True, False, False, ..., False,  True, False])

In [16]:
X_test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
614,0642_02,Europa,0.0,C/25/S,55 Cancri e,36.0,0.0,0.0,4967.0,82.0,1517.0,575.0
441,0472_01,Earth,0.0,F/87/S,PSO J318.5-22,37.0,0.0,0.0,10.0,544.0,9.0,299.0
2030,2171_01,Earth,0.0,F/433/P,TRAPPIST-1e,24.0,0.0,388.0,0.0,7.0,409.0,0.0
4870,5196_01,Earth,1.0,G/839/P,TRAPPIST-1e,43.0,0.0,0.0,0.0,0.0,0.0,0.0
281,0313_01,Earth,0.0,F/68/P,PSO J318.5-22,60.0,0.0,7.0,540.0,1.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1316,1393_02,Mars,0.0,F/270/S,TRAPPIST-1e,19.0,0.0,2058.0,0.0,224.0,298.0,19.0
3493,3753_01,Earth,0.0,F/776/P,55 Cancri e,55.0,0.0,0.0,519.0,131.0,63.0,3.0
3089,3329_01,Earth,1.0,G/544/P,TRAPPIST-1e,55.0,0.0,0.0,0.0,0.0,0.0,0.0
4619,4923_03,Mars,0.0,F/935/S,TRAPPIST-1e,24.0,0.0,261.0,0.0,780.0,0.0,0.0


In [17]:
from sklearn.metrics import classification_report
classification_report(y_test,predictions)

'              precision    recall  f1-score   support\n\n       False       0.68      0.86      0.76       853\n        True       0.82      0.61      0.70       886\n\n    accuracy                           0.73      1739\n   macro avg       0.75      0.74      0.73      1739\nweighted avg       0.75      0.73      0.73      1739\n'

In [18]:
#getting actual test data
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [19]:
#making predictions on actual test data
predictions_final = logistic_reg_model.predict(test_data)

In [20]:
test_data["Transported"] = predictions_final
submissons_final = test_data[["PassengerId","Transported"]]

In [21]:
submissons_final.to_csv("submission.csv",index=False)