In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
#getting our data
training_data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [4]:
#splitting our training data
X = training_data.loc[:, training_data.columns != "Transported"]
y = training_data["Transported"]

In [6]:
#splitting further into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [9]:
# getting all preprocessors
from sklearn.preprocessing import OrdinalEncoder, StandardScaler #to encode categorical variables
from sklearn.impute import SimpleImputer #to impute numerical values
from sklearn.compose import ColumnTransformer #to perform ops of columns seperately
from sklearn.pipeline import Pipeline #this ties the entire thing together to define a pipeline for all processing steps

In [10]:
#scaleing should be done after imputation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
#encode first then impute
#setting unknown value to be an arbitrary one
#very ineffecient
categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=9999)),
    ('imputer', SimpleImputer(strategy='constant'))
])

In [11]:
#splitting the categorical and numerical column names into different lists
numeric_columns = X_train.select_dtypes(np.float64).columns
categorical_columns = X_train.select_dtypes(object).columns
numeric_columns, categorical_columns

(Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object'),
 Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',
        'Name'],
       dtype='object'))

In [12]:
#specifying a preprocessor
preprocessor = ColumnTransformer(
transformers = [
    ('numeric',numeric_transformer,numeric_columns),
    ('categorical', categorical_transformer, categorical_columns)
])

In [13]:
#the pipeline including preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#defining the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor', LinearDiscriminantAnalysis())
])


In [29]:
#fitting our model
linDiscModel = pipeline.fit(X_train,y_train)
print(linDiscModel)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                  unknown_value=9999)),
                                                                  ('i

In [30]:
#transforming our test set to be normalized as per our training set
normalized_X_test = linDiscModel.transform(X_test)

figure out how to proceed with the normalized test set

In [32]:
# getting predictions and testing accuracy
from sklearn.metrics import accuracy_score
predictions = linDiscModel.predict(X_test)
accuracy_score(y_test, predictions)

0.7388803680981595

In [35]:
# getting cross validation accuracy score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(linDiscModel, X_train, y_train, cv=kfold, scoring='accuracy')
results.mean()

0.7174984875983061

In [44]:
# applying our model to real data
final_predictions = linDiscModel.predict(test_data)
final_predictions

array([ True, False,  True, ...,  True, False,  True])

In [48]:
test_data["Transported"] = final_predictions
submissions_final = test_data[["PassengerId","Transported"]]
submissions_final

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [49]:
submissions_final.to_csv("submission.csv", index=False)

__notebook_source__.ipynb  submission.csv
