In [1]:
!python --version

Python 3.8.17


Test notebook to try out the Amazon SageMaker Studio. Lets get a submission to the Spaceship Titatic Kaggle competition.

Lets get the basic data loaded

In [2]:
!pip install kaggle



Lets get the Kaggle token installed. Manually move the kaggle jason file to the root .kaggle folder.

Lets download the data

In [3]:
!kaggle competitions download -c spaceship-titanic

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


Now to unzip it.

In [4]:
# !unzip ./spaceship-titanic.zip

Cool. Lets get the basic data into a dataframe.

In [5]:
import pandas as pd
import numpy as np
tdata = pd.read_csv('train.csv')

In [6]:
tdata.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Lets get some basic summary statistics going.

In [7]:
tdata.describe(include = 'all')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
count,8693,8492,8476,8494,8511,8514.0,8490,8512.0,8510.0,8485.0,8510.0,8505.0,8493,8693
unique,8693,3,2,6560,3,,2,,,,,,8473,2
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,,False,,,,,,Gollux Reedall,True
freq,1,4602,5439,8,5915,,8291,,,,,,2,4378
mean,,,,,,28.82793,,224.687617,458.077203,173.729169,311.138778,304.854791,,
std,,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
min,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,
25%,,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
50%,,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
75%,,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,


Lets check for missing values

In [8]:
tdata.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [9]:
tdata.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

Ok so we now need to convert some of these 'objects' to boolean to ensure it plays nicely with FastAI.

In [10]:
tdata['VIP']=tdata['VIP'].astype('bool')
tdata['CryoSleep']=tdata['CryoSleep'].astype('bool')

Note: We are basically making the assumption that NA values are 0.

In [11]:
tdata.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep         0
Cabin           199
Destination     182
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [12]:
tdata.dtypes

PassengerId      object
HomePlanet       object
CryoSleep          bool
Cabin            object
Destination      object
Age             float64
VIP                bool
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

We basically now need to turns all the bool into uint8 to fix an error with FastAi code.

In [13]:
# workaround for fastai/pytorch bug where bool is treated as object and thus erroring out.
for n in tdata:
    if pd.api.types.is_bool_dtype(tdata[n]):
        tdata[n] = tdata[n].astype('uint8')

In [14]:
tdata.dtypes

PassengerId      object
HomePlanet       object
CryoSleep         uint8
Cabin            object
Destination      object
Age             float64
VIP               uint8
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported       uint8
dtype: object

Cool. Lets get a basic linear regression model going.

In [15]:
cat_vars = list(tdata.select_dtypes(['object','uint8']).columns)
cat_vars.remove('Name')
cat_vars.remove('PassengerId')
cat_vars.remove('Transported')
print(cat_vars)

cont_vars = list(tdata.select_dtypes(['int','float']).columns)
print(cont_vars)

not_used_vars = ['Name','PassengerId']
print(not_used_vars)

total_columns = len(cont_vars)+len(cat_vars)+len(not_used_vars)
print(total_columns)
tdata.shape

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
['Name', 'PassengerId']
13


(8693, 14)

In [16]:
tdata['Transported'].dtype
# dep_vars = list(tdata.select_dtypes('bool').columns)
dep_vars = ['Transported']
print(dep_vars)

['Transported']


In [28]:
# !pip install fastai
!conda install -c fastchan fastai

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/linux-64::h5py==3.9.0=nompi_py38hc4a6f91_101
  - conda-forge/linux-64::hdf5==1.14.1=nompi_h4f84152_100
  - conda-forge/linux-64::krb5==1.21.1=h659d440_0
  - conda-forge/linux-64::libcups==2.3.3=h4637d8d_4
  - conda-forge/linux-64::libcurl==8.2.1=hca28451_0
  - conda-forge/linux-64::libedit==3.1.20191231=he28a2e2_2
  - conda-forge/linux-64::libpq==15.3=hfc447b1_2
  - conda-forge/linux-64::matplotlib==3.7.2=py38h578d9bd_0
  - conda-forge/linux-64::pyqt==5.15.9=py38hffdaa6c_4
  - conda-forge/linux-64::qt-main==5.15.8=h7fe3ca9_15
  - conda-forge/linux-64::tensorflow==2.12.1=cpu_py38h66f0ec1_0
  - conda-forge/linux-64::tensorflow-base==2.12.1=cpu_py38h92f4423_0
  - conda-forge/linux-64::tensorflow-estimator==2.12.1=cpu_py38h27d0da5_0
done

## Package Plan ##

  environ

In [18]:
from fastai.tabular.all import *

Lets get all the basic column names into variables. This will come in handy later.

Lets get a basic tabular pandas model going.

In [19]:
# for n in tdata:
#     if pd.api.types.is_object_dtype(tdata[n]):
#         tdata[n] = tdata[n].astype('category')

In [20]:
procs = ['Categorify', 'FillMissing', 'Normalize']

In [21]:
splits = RandomSplitter(valid_pct=0.2)(tdata)

In [22]:
to = TabularPandas(tdata,splits = splits, do_setup=True, procs = procs, 
                   cat_names = cat_vars, cont_names = cont_vars, 
                   y_names = dep_vars, y_block = RegressionBlock())
to.show()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
2747,Earth,1,G/482/P,TRAPPIST-1e,0,22.0,0.0,0.0,0.0,,0.0,0
50,Earth,0,G/6/S,TRAPPIST-1e,0,,4.0,0.0,2.0,4683.0,0.0,0
7628,Earth,0,G/1317/P,TRAPPIST-1e,0,20.0,0.0,142.0,0.0,3528.0,92.0,0
5614,Mars,0,F/1139/S,TRAPPIST-1e,0,43.0,1086.0,4.0,202.0,401.0,0.0,0
4928,Europa,1,B/202/S,TRAPPIST-1e,0,16.0,0.0,0.0,0.0,0.0,0.0,1
7834,Earth,1,G/1363/P,55 Cancri e,0,25.0,0.0,0.0,0.0,0.0,0.0,1
5673,Europa,0,B/232/S,TRAPPIST-1e,0,36.0,0.0,1828.0,9058.0,1.0,2.0,1
3841,Earth,0,E/255/P,TRAPPIST-1e,0,25.0,0.0,38.0,0.0,847.0,112.0,0
5746,Earth,0,G/991/S,TRAPPIST-1e,0,1.0,0.0,0.0,0.0,0.0,0.0,1
6780,Europa,1,C/263/S,PSO J318.5-22,0,32.0,0.0,0.0,0.0,0.0,0.0,1


In [23]:
to.xs.iloc[:2]

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
2747,Earth,1,G/482/P,TRAPPIST-1e,0,22.0,0.0,0.0,0.0,,0.0
50,Earth,0,G/6/S,TRAPPIST-1e,0,,4.0,0.0,2.0,4683.0,0.0


In [24]:
to.cats

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP
2747,Earth,1,G/482/P,TRAPPIST-1e,0
50,Earth,0,G/6/S,TRAPPIST-1e,0
7628,Earth,0,G/1317/P,TRAPPIST-1e,0
5614,Mars,0,F/1139/S,TRAPPIST-1e,0
4928,Europa,1,B/202/S,TRAPPIST-1e,0
...,...,...,...,...,...
4815,Earth,0,F/1046/P,TRAPPIST-1e,0
3219,Earth,0,F/712/P,TRAPPIST-1e,0
19,Earth,0,G/0/P,TRAPPIST-1e,0
684,Earth,1,G/109/P,PSO J318.5-22,0


In [25]:
to.conts

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
2747,22.0,0.0,0.0,0.0,,0.0
50,,4.0,0.0,2.0,4683.0,0.0
7628,20.0,0.0,142.0,0.0,3528.0,92.0
5614,43.0,1086.0,4.0,202.0,401.0,0.0
4928,16.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
4815,35.0,0.0,116.0,51.0,0.0,701.0
3219,51.0,90.0,0.0,0.0,881.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0
684,38.0,0.0,0.0,0.0,0.0,0.0


In [26]:
to.xs

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
2747,Earth,1,G/482/P,TRAPPIST-1e,0,22.0,0.0,0.0,0.0,,0.0
50,Earth,0,G/6/S,TRAPPIST-1e,0,,4.0,0.0,2.0,4683.0,0.0
7628,Earth,0,G/1317/P,TRAPPIST-1e,0,20.0,0.0,142.0,0.0,3528.0,92.0
5614,Mars,0,F/1139/S,TRAPPIST-1e,0,43.0,1086.0,4.0,202.0,401.0,0.0
4928,Europa,1,B/202/S,TRAPPIST-1e,0,16.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4815,Earth,0,F/1046/P,TRAPPIST-1e,0,35.0,0.0,116.0,51.0,0.0,701.0
3219,Earth,0,F/712/P,TRAPPIST-1e,0,51.0,90.0,0.0,0.0,881.0,0.0
19,Earth,0,G/0/P,TRAPPIST-1e,0,0.0,0.0,0.0,0.0,0.0,0.0
684,Earth,1,G/109/P,PSO J318.5-22,0,38.0,0.0,0.0,0.0,0.0,0.0


Lets create a dataloader

In [27]:
dls = to.dataloaders(path = '.')
dls.show_batch()

Could not do one pass in your dataloader, there is something wrong in it. Please see the stack trace below:


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.