In [1]:
# Record the interpreter version this notebook was run with.
!python --version

Python 3.10.12


Test notebook to try out Amazon SageMaker Studio. Let's get a submission into the Spaceship Titanic Kaggle competition.

Let's get the basic data loaded.

In [2]:
!pip install kaggle



Let's get the Kaggle token installed. Manually move the `kaggle.json` file into the `.kaggle` folder in your home directory (`~/.kaggle/kaggle.json`).

Let's download the data.

In [3]:
# Download the competition archive with the Kaggle CLI. If a newer local copy
# already exists the CLI skips the download (use --force to re-download).
!kaggle competitions download -c spaceship-titanic

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


Now to unzip it.

In [4]:
# Extract the competition archive so that train.csv exists on a fresh kernel.
# Done with the standard library (instead of a commented-out shell call) so the
# cell can always be run: it is a no-op once the CSV has been extracted.
import zipfile
from pathlib import Path

_archive = Path('spaceship-titanic.zip')
if not Path('train.csv').exists() and _archive.exists():
    with zipfile.ZipFile(_archive) as _zf:
        _zf.extractall('.')

Cool. Let's get the basic data into a dataframe.

In [5]:
import pandas as pd
import numpy as np
# Load the training set from the working directory into a DataFrame.
tdata = pd.read_csv('train.csv')

In [6]:
# Preview the first rows to check that the columns were parsed as expected.
tdata.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Let's get some basic summary statistics going.

In [7]:
# include='all' summarises the non-numeric columns (unique/top/freq) as well
# as the numeric ones (mean/std/quantiles).
tdata.describe(include = 'all')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
count,8693,8492,8476,8494,8511,8514.0,8490,8512.0,8510.0,8485.0,8510.0,8505.0,8493,8693
unique,8693,3,2,6560,3,,2,,,,,,8473,2
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,,False,,,,,,Gollux Reedall,True
freq,1,4602,5439,8,5915,,8291,,,,,,2,4378
mean,,,,,,28.82793,,224.687617,458.077203,173.729169,311.138778,304.854791,,
std,,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
min,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,
25%,,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
50%,,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
75%,,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,


Let's check for missing values.

In [8]:
# Count the missing values in each column.
tdata.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [9]:
# Inspect the dtype pandas inferred for each column.
tdata.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

OK, so we now need to convert some of these `object` columns to boolean to ensure they play nicely with fastai.

In [10]:
# Cast the flag columns to plain bool, treating missing values as False.
# A direct astype('bool') on an object column maps NaN to True, so go through
# the nullable 'boolean' dtype and fill the gaps before the final cast.
for _flag in ('VIP', 'CryoSleep'):
    tdata[_flag] = tdata[_flag].astype('boolean').fillna(False).astype('bool')

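Note: The intent is to treat missing (NA) values as False (0). Be aware that calling `astype('bool')` directly on a column that contains NaN converts the NaN to True, so the missing values have to be filled with False before the cast.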

In [11]:
# Re-count missing values per column after the VIP/CryoSleep cast.
tdata.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep         0
Cabin           199
Destination     182
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [12]:
# Confirm that VIP and CryoSleep are now bool columns.
tdata.dtypes

PassengerId      object
HomePlanet       object
CryoSleep          bool
Cabin            object
Destination      object
Age             float64
VIP                bool
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

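We originally needed to turn all the bool columns into uint8 to work around an error in the fastai code. The workaround below is kept for reference but is currently commented out, so the bool columns stay bool.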

In [13]:
# # workaround for fastai/pytorch bug where bool is treated as object and thus erroring out.
# for n in tdata:
#     if pd.api.types.is_bool_dtype(tdata[n]):
#         tdata[n] = tdata[n].astype('uint8')

In [14]:
# Check the dtypes again; with the uint8 workaround commented out, the bool
# columns are expected to remain bool.
tdata.dtypes

PassengerId      object
HomePlanet       object
CryoSleep          bool
Cabin            object
Destination      object
Age             float64
VIP                bool
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

Cool. Let's get a basic linear regression model going.

In [18]:
# Columns that are neither model features nor the target.
not_used_vars = ['Name', 'PassengerId']
# Identifier columns plus the target must never end up in the feature lists.
_non_features = set(not_used_vars) | {'Transported'}

# Categorical features: object/bool columns minus identifiers and target.
# Filtering (instead of list.remove) keeps the frame's column order and does
# not raise if one of the excluded columns is absent from the frame.
cat_vars = [c for c in tdata.select_dtypes(['object', 'bool']).columns
            if c not in _non_features]
print(cat_vars)

# Continuous features: the int/float columns.
cont_vars = list(tdata.select_dtypes(['int', 'float']).columns)
print(cont_vars)

print(not_used_vars)

# Features + unused columns; the target is not counted here, so this is one
# less than the number of columns in the frame.
total_columns = len(cont_vars) + len(cat_vars) + len(not_used_vars)
print(total_columns)

# Explicit sanity check instead of comparing the printed count with the shape
# by eye: every column must be a feature, an unused column, or the target.
_unassigned = set(tdata.columns) - set(cat_vars) - set(cont_vars) - _non_features
assert not _unassigned, f"columns not assigned to any group: {sorted(_unassigned)}"
tdata.shape

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
['Name', 'PassengerId']
13


(8693, 14)

In [19]:
# Dependent (target) variable. It is listed explicitly rather than derived
# from the bool columns, because VIP and CryoSleep are bool as well and would
# otherwise be picked up as targets.
dep_vars = ['Transported']
print(dep_vars)

['Transported']


In [20]:
# !pip install fastai
# !conda install -c fastchan fastai

In [34]:
# fastai v2 exposes the tabular API used below (TabularPandas, Categorify,
# FillMissing, Normalize, RandomSplitter, RegressionBlock, tensor) through
# `fastai.tabular.all`, which is the documented entry point. Importing from
# the bare `fastai.tabular` package is the v1 convention.
from fastai.tabular.all import *

Let's get all the basic column names into variables. This will come in handy later.

Let's get a basic TabularPandas model going.

In [None]:
# for n in tdata:
#     if pd.api.types.is_object_dtype(tdata[n]):
#         tdata[n] = tdata[n].astype('category')

Critical gotcha here: make sure you don't pass the procs in as strings! You need to pass in the actual objects (e.g. `Categorify`, not `'Categorify'`).

In [43]:
# Preprocessing steps handed to TabularPandas. These must be the proc objects
# themselves, not their names as strings.
procs = [Categorify, FillMissing, Normalize]

In [23]:
# Hold out 20% of the rows for validation. The fixed seed makes the random
# split reproducible across kernel restarts and re-runs.
splits = RandomSplitter(valid_pct=0.2, seed=42)(tdata)

In [44]:
# Wrap the training frame in a TabularPandas object: apply the procs, tag the
# categorical / continuous / target columns and attach the train/valid split.
# NOTE(review): 'Transported' is a bool column but is modelled with
# RegressionBlock -- confirm this is intended rather than a classification block.
to = TabularPandas(
    tdata,
    procs=procs,
    cat_names=cat_vars,
    cont_names=cont_vars,
    y_names=dep_vars,
    y_block=RegressionBlock(),
    splits=splits,
    do_setup=True,
)
# Show a sample of the processed rows.
to.show()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age_na,RoomService_na,FoodCourt_na,ShoppingMall_na,Spa_na,VRDeck_na,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
4312,Europa,False,C/144/P,TRAPPIST-1e,False,False,False,False,False,False,False,27.0,267.0,2802.0,0.0,54.0,77.0,True
969,Mars,False,F/211/P,55 Cancri e,False,False,False,False,False,False,False,3.0,0.0,0.0,0.0,0.0,0.0,True
4084,Europa,False,C/137/P,55 Cancri e,False,False,False,False,False,False,False,54.0,0.0,1832.0,0.0,0.0,1768.0,False
1735,Earth,False,G/291/S,TRAPPIST-1e,False,True,False,False,False,False,False,27.0,181.0,0.0,2.0,0.0,663.0,False
8470,Europa,False,C/335/S,TRAPPIST-1e,False,False,False,False,False,False,False,25.0,6899.0,265.0,0.0,2234.0,2090.0,False
7084,Earth,False,F/1442/S,TRAPPIST-1e,False,False,False,False,False,False,False,35.0,4.0,53.0,0.0,0.0,844.0,False
5693,Mars,False,E/385/P,PSO J318.5-22,False,False,False,False,False,False,False,26.0,2042.0,0.0,2387.0,10.0,0.0,True
1045,Mars,False,E/79/S,TRAPPIST-1e,True,False,False,False,False,False,False,38.0,936.0,0.0,363.0,0.0,0.0,False
2660,Earth,False,G/465/P,#na#,False,False,False,False,False,False,False,21.0,33.0,126.0,0.0,461.0,0.0,False
5134,Earth,False,G/880/P,TRAPPIST-1e,False,False,False,False,False,False,False,20.0,0.0,9.000031,82.0,14.0,704.0,False


In [25]:
# Peek at the first two rows of the feature columns.
to.xs.head(2)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
4312,Europa,False,C/144/P,TRAPPIST-1e,False,27.0,267.0,2802.0,0.0,54.0,77.0
969,Mars,False,F/211/P,55 Cancri e,False,3.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Categorical feature columns of the TabularPandas object.
to.cats

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP
4312,Europa,False,C/144/P,TRAPPIST-1e,False
969,Mars,False,F/211/P,55 Cancri e,False
4084,Europa,False,C/137/P,55 Cancri e,False
1735,Earth,False,G/291/S,TRAPPIST-1e,False
8470,Europa,False,C/335/S,TRAPPIST-1e,False
...,...,...,...,...,...
3015,Earth,False,G/532/P,55 Cancri e,False
4505,Earth,False,F/904/S,TRAPPIST-1e,False
1233,Europa,False,E/94/S,55 Cancri e,False
4470,Mars,False,F/895/S,TRAPPIST-1e,False


In [27]:
# Continuous feature columns of the TabularPandas object.
to.conts

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
4312,27.0,267.0,2802.0,0.0,54.0,77.0
969,3.0,0.0,0.0,0.0,0.0,0.0
4084,54.0,0.0,1832.0,0.0,0.0,1768.0
1735,,181.0,0.0,2.0,0.0,663.0
8470,25.0,6899.0,265.0,0.0,2234.0,2090.0
...,...,...,...,...,...,...
3015,8.0,0.0,0.0,0.0,0.0,0.0
4505,22.0,0.0,0.0,0.0,778.0,0.0
1233,30.0,278.0,3300.0,0.0,8145.0,2519.0
4470,36.0,850.0,,454.0,0.0,0.0


In [28]:
# All feature columns: the categorical columns followed by the continuous ones.
to.xs

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
4312,Europa,False,C/144/P,TRAPPIST-1e,False,27.0,267.0,2802.0,0.0,54.0,77.0
969,Mars,False,F/211/P,55 Cancri e,False,3.0,0.0,0.0,0.0,0.0,0.0
4084,Europa,False,C/137/P,55 Cancri e,False,54.0,0.0,1832.0,0.0,0.0,1768.0
1735,Earth,False,G/291/S,TRAPPIST-1e,False,,181.0,0.0,2.0,0.0,663.0
8470,Europa,False,C/335/S,TRAPPIST-1e,False,25.0,6899.0,265.0,0.0,2234.0,2090.0
...,...,...,...,...,...,...,...,...,...,...,...
3015,Earth,False,G/532/P,55 Cancri e,False,8.0,0.0,0.0,0.0,0.0,0.0
4505,Earth,False,F/904/S,TRAPPIST-1e,False,22.0,0.0,0.0,0.0,778.0,0.0
1233,Europa,False,E/94/S,55 Cancri e,False,30.0,278.0,3300.0,0.0,8145.0,2519.0
4470,Mars,False,F/895/S,TRAPPIST-1e,False,36.0,850.0,,454.0,0.0,0.0


Let's create a dataloader.

In [29]:
# Check the dtype of each categorical column before building the dataloaders.
to.cats.dtypes

HomePlanet     category
CryoSleep          bool
Cabin          category
Destination    category
VIP                bool
dtype: object

So! The following cell was just there to help resolve the error I got using fastai. The critical mistake was not passing in the proc objects - I kept passing in strings!

In [46]:
# Check that the categorical columns can be converted to a tensor.
tensor(to.cats)

tensor([[   2,    1,  664,  ...,    1,    1,    1],
        [   3,    1, 3445,  ...,    1,    1,    1],
        [   2,    1,  654,  ...,    1,    1,    1],
        ...,
        [   2,    1, 2175,  ...,    1,    1,    1],
        [   3,    1, 4401,  ...,    1,    1,    1],
        [   1,    1, 4067,  ...,    1,    1,    1]], dtype=torch.int16)

In [47]:
# Build the DataLoaders from the TabularPandas object, passing the current
# directory as the path.
dls = to.dataloaders(path = '.')
# Display a sample batch to eyeball the inputs and the target.
dls.show_batch()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age_na,RoomService_na,FoodCourt_na,ShoppingMall_na,Spa_na,VRDeck_na,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Earth,False,F/1752/S,55 Cancri e,False,False,False,False,False,False,False,33.0,1006.000003,-9e-06,1357.000018,45.00001,5.999999,0.0
1,Earth,False,F/1243/P,TRAPPIST-1e,False,False,False,False,False,False,False,24.0,2e-06,-9e-06,-6e-06,14.0,630.999988,0.0
2,Earth,False,F/799/S,TRAPPIST-1e,False,False,False,False,False,False,False,55.000001,2e-06,252.000001,0.999996,-9.417345e-07,552.999998,1.0
3,Mars,False,F/1663/P,TRAPPIST-1e,False,False,False,False,False,False,False,29.0,1543.999955,0.999988,205.999999,-9.417345e-07,-1.2e-05,0.0
4,Earth,False,F/701/P,TRAPPIST-1e,False,False,False,False,False,False,False,19.0,90.000003,-9e-06,-6e-06,74.0,594.99999,0.0
5,Earth,False,G/109/P,TRAPPIST-1e,False,False,False,False,False,False,False,0.999999,2e-06,-9e-06,-6e-06,-9.417345e-07,-1.2e-05,0.0
6,Mars,False,D/287/P,TRAPPIST-1e,False,False,False,False,False,False,False,17.0,3146.000067,-9e-06,87.000001,135.0,-1.2e-05,0.0
7,Earth,False,F/1319/P,55 Cancri e,False,False,False,False,False,False,False,54.0,800.999997,4.999977,-6e-06,-9.417345e-07,-1.2e-05,1.0
8,Earth,False,E/492/S,PSO J318.5-22,False,False,False,False,False,False,False,21.0,2e-06,434.0,-6e-06,75.0,134.999997,1.0
9,Europa,False,C/32/P,55 Cancri e,True,False,False,False,False,False,False,27.0,1.000008,12804.000417,-6e-06,9.999993,51.999998,1.0
