In [89]:
%matplotlib inline

In [260]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC

# Spaceship Titanic
### Preprocessing and linear SVM pipeline

In [211]:
# Load the labelled training data (it includes the Transported target column).
spaceship_data_raw = pd.read_csv(filepath_or_buffer="data/train.csv")

In [212]:
# Preview the raw frame; the rendered table elides the middle rows.
spaceship_data_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [213]:
# Count missing values per column to see which features need imputation.
spaceship_data_raw.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [214]:
# Column names as a plain Python list, used to pick the feature subsets.
list(spaceship_data_raw.columns)

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Transported']

In [215]:
# Model inputs: every column except the target.
passenger_features = spaceship_data_raw.drop(columns=["Transported"])

In [216]:
# Target column: True/False flag with no missing values in the training data.
passenger_labels = spaceship_data_raw["Transported"]

In [217]:
# Hold out a fixed 1000 rows for validation; the seed keeps the split repeatable.
(
    passenger_features_train,
    passenger_features_val,
    passenger_labels_train,
    passenger_labels_val,
) = train_test_split(
    passenger_features,
    passenger_labels,
    test_size=1000,
    random_state=42,
)

In [218]:
# Sanity-check the split: shapes of train/validation features and labels.
(
    passenger_features_train.shape,
    passenger_features_val.shape,
    passenger_labels_train.shape,
    passenger_labels_val.shape,
)

((7693, 13), (1000, 13), (7693,), (1000,))

In [237]:
# Continuous columns (age and the five spending amounts).
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Low-cardinality columns that are treated as categories.
categorical_features = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

# The remaining columns (PassengerId, Cabin, Name) appear in neither list and are ignored.

In [249]:
# Numeric branch: fill gaps with the column mean, then rescale with MinMaxScaler.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# drop="first" omits one indicator column per feature.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(drop="first")),
    ]
)

In [250]:
# Route each column group through its own transformer.
# remainder="drop" (the default, spelled out here) discards columns in neither group;
# verbose_feature_names_out=False keeps output names free of the transformer-name prefix.
preprocessor = ColumnTransformer(
    [
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [251]:
# Fit on the training split only, so the generated feature names can be inspected.
preprocessor.fit(X=passenger_features_train)

In [252]:
# Names of the columns produced by the fitted preprocessor (numeric first, then one-hot indicators).
preprocessor.get_feature_names_out()

array(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_True',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'VIP_True'],
      dtype=object)

In [303]:
# Number of model input features after preprocessing.
preprocessor.get_feature_names_out().size

12

In [289]:
# End-to-end model: preprocessing followed by a linear SVM classifier.
# max_iter is raised to 10_000 to give the solver more room to converge.
# random_state is pinned (same seed as the train/validation split) so that fits and the
# later grid search are repeatable; LinearSVC uses it when the dual solver shuffles the data.
pipe_lin_svm = Pipeline(
    [
        ("preprocess", preprocessor),
        ("lin_svm", LinearSVC(max_iter=10_000, random_state=42)),
    ]
)

In [290]:
# Train the baseline pipeline (default C) on the training split.
pipe_lin_svm.fit(
    X=passenger_features_train,
    y=passenger_labels_train,
)

In [291]:
# Baseline accuracy on the data the model was trained on.
pipe_lin_svm.score(
    X=passenger_features_train,
    y=passenger_labels_train,
)

0.7838294553490186

In [292]:
# Baseline accuracy on the held-out validation rows.
pipe_lin_svm.score(
    X=passenger_features_val,
    y=passenger_labels_val,
)

0.757

In [293]:
# Search the SVM regularisation parameter C over a coarse grid (steps of 1000x),
# using GridSearchCV's default cross-validation and scoring.
grid_search = GridSearchCV(
    estimator=pipe_lin_svm,
    param_grid={"lin_svm__C": [1e-3, 1, 1e3, 1e6]},
)

In [294]:
# Run the search on the training split only; the validation rows stay untouched.
grid_search.fit(
    X=passenger_features_train,
    y=passenger_labels_train,
)



In [295]:
# Keep the pipeline that the grid search selected as best; it is used for scoring and prediction below.
best_model = grid_search.best_estimator_

In [296]:
# Show which value of C the grid search selected.
grid_search.best_params_

{'lin_svm__C': 1000.0}

In [297]:
# Accuracy of the tuned pipeline on the training split.
best_model.score(
    X=passenger_features_train,
    y=passenger_labels_train,
)

0.7943585077343039

In [298]:
# Accuracy of the tuned pipeline on the held-out validation rows.
best_model.score(
    X=passenger_features_val,
    y=passenger_labels_val,
)

0.759

In [299]:
# Load the test data that predictions will be generated for.
spaceship_test_raw = pd.read_csv(filepath_or_buffer="data/test.csv")

In [305]:
# Check the test frame's dimensions; it has one column fewer than the training frame (13 vs. 14).
spaceship_test_raw.shape

(4277, 13)

In [306]:
# Predict with the tuned pipeline; the raw frame is passed as-is because the
# pipeline's preprocessing step selects the columns it needs.
predictions = best_model.predict(X=spaceship_test_raw)

In [316]:
# Assemble the submission table: one row per test passenger with its predicted label.
submission = pd.DataFrame(
    {
        "PassengerId": spaceship_test_raw.PassengerId,
        "Transported": predictions,
    }
)

In [317]:
# Use PassengerId as the index so it is written as the first column of the CSV.
# The guard keeps this cell idempotent: calling set_index a second time, after the
# column has already been moved into the index, would raise a KeyError.
if submission.index.name != "PassengerId":
    submission = submission.set_index("PassengerId")
submission

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [318]:
# Write the submission file; index=True (the default) emits the PassengerId index as the first column.
submission.to_csv("pipe_1.csv", index=True)