# Tabular models

In [1]:
# Star import (fastai convention). Names used further down such as `pd`, `np`,
# `untar_data`, `URLs`, `TabularList`, `tabular_learner` and `to_np` are not
# imported anywhere else in the visible notebook, so they come from here.
from fastai.tabular import *
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Cells below with two bare expressions therefore show two outputs.
InteractiveShell.ast_node_interactivity = "all"

Tabular data should be in a Pandas `DataFrame`.

In [2]:
# Resolve the local directory of the Adult sample dataset, then read its CSV
# into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
adult_csv = path/'adult.csv'
df = pd.read_csv(adult_csv)

In [3]:
# Show the directory returned by `untar_data` (machine-specific location).
path

PosixPath('/projects/da_workspace/szspace/.fastai/data/adult_sample')

In [3]:
# Peek at the first two rows and the overall (rows, columns) shape. Both values
# are displayed because `ast_node_interactivity` is set to "all" above.
df.head(2)
df.shape

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k


(32561, 15)

In [4]:
# Column the model is trained to predict.
dep_var = 'salary'

# Columns handed to fastai as categorical variables.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]

# Columns handed to fastai as continuous variables.
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

# Preprocessing steps passed as `procs` when the training data is built.
# NOTE(review): the batches shown later carry a seventh categorical column,
# `education-num_na`, presumably added by FillMissing -- confirm.
procs = [
    FillMissing,
    Categorify,
    Normalize,
]

In [5]:
# Build an unlabeled TabularList from a copy of rows 800-999 (positional slice).
# No `procs` are passed here. These are the same rows that are later used as
# the validation split, so the "test" set overlaps validation.
test_rows = df.iloc[800:1000].copy()
test = TabularList.from_df(
    test_rows,
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
)

In [6]:
# Display the first two items of the test list (raw, unprocessed values).
test[:2]

TabularList (2 items)
age                               45
workclass                    Private
fnlwgt                         96975
education               Some-college
education-num                    NaN
marital-status              Divorced
occupation         Handlers-cleaners
relationship               Unmarried
race                           White
sex                           Female
capital-gain                       0
capital-loss                       0
hours-per-week                    40
native-country         United-States
salary                          <50k
Name: 800, dtype: object,age                                46
workclass                Self-emp-inc
fnlwgt                         192779
education                 Prof-school
education-num                     NaN
marital-status     Married-civ-spouse
occupation             Prof-specialty
relationship                  Husband
race                            White
sex                              Male
capital-gain      

In [7]:
# Assemble the DataBunch step by step:
#   1. wrap the full DataFrame with the column roles and preprocessing steps,
#   2. send indices 800-999 to the validation split,
#   3. take labels from the `dep_var` column,
#   4. attach the unlabeled `test` list as the test set.
valid_idx = list(range(800, 1000))
data = (
    TabularList.from_df(
        df,
        path=path,
        cat_names=cat_names,
        cont_names=cont_names,
        procs=procs,
    )
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var)
    .add_test(test)
    .databunch()
)

In [8]:
# Count distinct category values in the rows labelled 0..800. `.loc` slicing is
# label-based and inclusive, so this slice also contains row 800, which is the
# first index sent to the validation split.
# NOTE(review): the model's first two embeddings have 10 and 17 rows, so the
# full training split presumably holds more categories than this slice -- confirm.
df.loc[:800,:].workclass.nunique() # recorded run: 8 distinct values in this slice (not 9)
df.loc[:800,:].education.nunique() # recorded run: 15 distinct values in this slice

8

15

In [9]:
# Show five processed rows from a batch; continuous columns appear normalized
# and an `education-num_na` indicator column is present.
data.show_batch(rows=5)

workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,target
Self-emp-inc,Some-college,Married-civ-spouse,Sales,Husband,White,False,0.3968,1.2445,-0.0312,>=50k
Private,HS-grad,Never-married,Adm-clerical,Not-in-family,White,False,-0.9959,-0.4404,-0.4224,<50k
Local-gov,Some-college,Never-married,Prof-specialty,Not-in-family,White,False,0.2502,0.6172,-0.0312,<50k
Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,False,0.9098,0.0184,-0.0312,>=50k
Self-emp-not-inc,Some-college,Never-married,Farming-fishing,Own-child,White,False,-1.1425,0.4464,-0.0312,<50k


In [44]:
# Display the validation dataset (the 200 items selected by index 800-999).
data.valid_ds

LabelList (200 items)
x: TabularList
workclass  Private; education  Some-college; marital-status  Divorced; occupation  Handlers-cleaners; relationship  Unmarried; race  White; education-num_na True; age 0.4701; fnlwgt -0.8793; education-num -0.0312; ,workclass  Self-emp-inc; education  Prof-school; marital-status  Married-civ-spouse; occupation  Prof-specialty; relationship  Husband; race  White; education-num_na True; age 0.5434; fnlwgt 0.0290; education-num -0.0312; ,workclass  Private; education  Assoc-voc; marital-status  Divorced; occupation #na#; relationship  Not-in-family; race  White; education-num_na True; age -0.1896; fnlwgt 1.7704; education-num -0.0312; ,workclass  Federal-gov; education  Bachelors; marital-status  Never-married; occupation  Tech-support; relationship  Not-in-family; race  White; education-num_na True; age -0.9959; fnlwgt -1.3242; education-num -0.0312; ,workclass  Private; education  Bachelors; marital-status  Married-civ-spouse; occupation #na#; relatio

In [42]:
# Take the first batch from the test DataLoader and print the first three rows
# of the categorical codes, the continuous values and the labels.
# NOTE(review): the test items were added without labels, so `y` is presumably
# a placeholder (all zeros in the recorded run) -- confirm.
(cat_x, cont_x), y = next(iter(data.test_dl))
for batch_part in (cat_x, cont_x, y):
    print(to_np(batch_part[:3]))

[[ 5 16  1  7  5  5  2]
 [ 6 15  3 11  1  5  2]
 [ 5  9  1  0  2  5  2]]
[[ 0.470051 -0.879302 -0.031209]
 [ 0.54335   0.029003 -0.031209]
 [-0.189641  1.770412 -0.031209]]
[0 0 0]


In [20]:
# Take the first batch from the training DataLoader and print the first three
# rows of the categorical codes, the continuous values and the labels.
# This rebinds `cat_x`, `cont_x` and `y`, which later cells read.
(cat_x, cont_x), y = next(iter(data.train_dl))
for batch_part in (cat_x, cont_x, y):
    print(to_np(batch_part[:3]))

[[ 5 16  1 11  2  5  1]
 [ 7 10  5 13  2  5  1]
 [ 5 16  5 15  2  5  1]]
[[ 1.936033  0.50116  -0.031209]
 [ 0.396752 -0.655895  1.142229]
 [-1.215828  1.141801 -0.031209]]
[1 1 0]


In [32]:
# List the distinct integer codes found in the first two categorical columns of
# the current batch. `cat_x` is whatever batch was unpacked last; in file order
# that is the training batch.
# NOTE(review): columns 0 and 1 presumably follow `cat_names` order
# (workclass, education) -- confirm.
np.unique(to_np(cat_x[:,0]))
np.unique(to_np(cat_x[:,1]))

array([1, 2, 3, 5, 6, 7, 8])

array([ 1,  2,  3,  7,  9, 10, 11, 12, 13, 15, 16])

In [12]:
# Create a tabular learner on `data` with hidden linear layers of 200 and 100
# units (see the model printout below), tracking accuracy as the metric.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [13]:
# Print the model architecture: one embedding per categorical column, batch
# norm over the continuous inputs, then the linear layers.
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(8, 5)
    (3): Embedding(16, 8)
    (4): Embedding(7, 5)
    (5): Embedding(6, 4)
    (6): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=42, out_features=200, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=200, out_features=100, bias=True)
    (4): ReLU(inplace)
    (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=100, out_features=2, bias=True)
  )
)

In [14]:
# Per-layer summary with output shapes, parameter counts and trainable flags.
learn.summary()

TabularModel
Layer (type)         Output Shape         Param #    Trainable 
Embedding            [6]                  60         True      
______________________________________________________________________
Embedding            [8]                  136        True      
______________________________________________________________________
Embedding            [5]                  40         True      
______________________________________________________________________
Embedding            [8]                  128        True      
______________________________________________________________________
Embedding            [5]                  35         True      
______________________________________________________________________
Embedding            [4]                  24         True      
______________________________________________________________________
Embedding            [3]                  9          True      
_________________________________________________

In [34]:
# Train for one epoch.
# NOTE(review): the second argument (1e-2) is presumably the learning rate -- confirm.
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.368859,0.381024,0.815,00:06


## Inference

In [35]:
# Take the first row of the DataFrame as the inference example. Index 0 is
# outside the 800-999 validation range, so this row was part of the training data.
row = df.iloc[0]

In [46]:
# Display the raw example row that will be passed to `learn.predict`.
row

age                                49
workclass                     Private
fnlwgt                         101320
education                  Assoc-acdm
education-num                      12
marital-status     Married-civ-spouse
occupation                        NaN
relationship                     Wife
race                            White
sex                            Female
capital-gain                        0
capital-loss                     1902
hours-per-week                     40
native-country          United-States
salary                          >=50k
Name: 0, dtype: object

In [39]:
# Distinct label values in `y`, the label tensor from the most recently
# unpacked batch (the training batch in file order). This depends on which
# batch cell ran last, so re-run the cells in order before trusting it.
y.unique()

tensor([0, 1], device='cuda:0')

In [36]:
# Predict for the single raw row. The recorded result is a 3-tuple: predicted
# category, its class index, and the per-class probabilities.
learn.predict(row)

(Category >=50k, tensor(1), tensor([0.3838, 0.6162]))