Setup

In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
#Download kernel data
!mkdir data
!kaggle datasets download neuromusic/avocado-prices -p ./data

In [1]:
#Libraries
import pandas as pd
import numpy as np
import re as re

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

In [2]:
df_avocado = pd.read_csv('data/avocado.csv')

In [3]:
df_avocado.drop(list(df_avocado)[0], axis = 1, inplace = True)
df_avocado.drop(labels = 'year', axis = 1, inplace = True)

#convert to snake case
df_avocado.columns = [i.replace(" ", "_") for i in df_avocado.columns]
df_avocado.columns = [re.sub("(?<=[a-z])(?=[A-Z])", "_", i) for i in df_avocado.columns]
df_avocado.columns = [i.lower() for i in df_avocado.columns]

EDA

In [None]:
df_avocado.head(5)

In [None]:
df_avocado.dtypes

In [4]:
df_avocado['avo_type_sum'] = df_avocado['4046'] + df_avocado['4225'] + df_avocado['4770']

In [None]:
#Sometimes the total volume differs

In [5]:
total_test = df_avocado['avo_type_sum'] + df_avocado['total_bags'] == df_avocado['total_volume']

In [None]:
len(total_test)

In [None]:
sum(total_test)

In [6]:
#Add seasonal variables
add_datepart(df_avocado, 'date', drop = True)

In [7]:
#convert to snake case
df_avocado.columns = [i.replace(" ", "_") for i in df_avocado.columns]
df_avocado.columns = [re.sub("(?<=[a-z])(?=[A-Z])", "_", i) for i in df_avocado.columns]
df_avocado.columns = [i.lower() for i in df_avocado.columns]


Modelling

In [None]:
df_avocado.columns

In [8]:
#Convert variables to treat as categories for embeddings
factor_vars = [
    'region'
]

ordered_factors = ['year', 'month', 'week', 'day']

for f in factor_vars:
    df_avocado[f] = df_avocado[f].astype('category')
    
for f in ordered_factors:
    df_avocado[f] = df_avocado[f].astype('category').cat.as_ordered()

In [None]:
#Look at dims
[(c, len(df_avocado[c].cat.categories)) for c in factor_vars]

In [9]:
#Convert type to bool
df_avocado['type'] = df_avocado['type'] == 'conventional'

In [10]:
#Convert all non-cat variables to float
cat_vars = (factor_vars + ordered_factors)

for f in df_avocado.columns:
    if(np.logical_not(np.isin(f, cat_vars))):
        df_avocado[f] = df_avocado[f].fillna(0).astype('float32')

In [None]:
#save to disk
df_avocado.to_feather('data/model_data')

In [11]:
#Create data split
train_idx = list(range(0, int(len(df_avocado) * .9)))
val_idx = list(range(int(len(df_avocado) * .8) + 1, int(len(df_avocado) * .9)))
test_idx = list(range(int(len(df_avocado) * .9) + 1, int(len(df_avocado))))

df_train = df_avocado.copy().iloc[train_idx]
# df_val = df_avocado.copy().iloc[val_idx]
df_test = df_avocado.copy().iloc[test_idx]

In [12]:
train, y, nas, mapper = proc_df(df_train, y_fld = 'average_price', do_scale=True)
test, _, nas, mapper =  proc_df(df_test, y_fld = 'average_price', do_scale=True, mapper=mapper, na_dict=nas)

In [13]:
yl = np.log(y)
train.head(5)

Unnamed: 0,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,type,region,...,day,dayofweek,dayofyear,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start,elapsed
0,-0.240181,-0.243302,-0.214377,-0.224003,-0.242829,-0.241757,-0.228827,-0.185359,0.894256,1,...,27,0.0,1.719896,-0.176466,-0.195228,-0.072552,-0.102573,-0.072552,-0.072127,-0.568405
1,-0.24276,-0.243575,-0.222134,-0.223913,-0.242049,-0.240731,-0.22881,-0.185359,0.894256,1,...,20,0.0,1.654468,-0.176466,-0.195228,-0.072552,-0.102573,-0.072552,-0.072127,-0.589759
2,-0.225307,-0.243484,-0.171152,-0.223274,-0.243361,-0.242473,-0.228788,-0.185359,0.894256,1,...,13,0.0,1.58904,-0.176466,-0.195228,-0.072552,-0.102573,-0.072552,-0.072127,-0.611112
3,-0.236115,-0.24323,-0.20053,-0.223786,-0.245614,-0.245489,-0.228669,-0.185359,0.894256,1,...,6,0.0,1.523612,-0.176466,-0.195228,-0.072552,-0.102573,-0.072552,-0.072127,-0.632466
4,-0.243817,-0.243374,-0.222767,-0.223758,-0.245254,-0.245095,-0.22842,-0.185359,0.894256,1,...,29,0.0,1.458183,-0.176466,-0.195228,-0.072552,-0.102573,-0.072552,-0.072127,-0.65382


In [14]:
model_data = ColumnarModelData.from_data_frame('data/',  val_idx, train, yl.astype(np.float32),
                                  cat_flds = (factor_vars + ordered_factors), bs=128, test_df=test) 

In [15]:
cat_sz = [(c, len(df_avocado[c].cat.categories)) for c in cat_vars]
cat_sz

[('region', 54), ('year', 4), ('month', 12), ('week', 53), ('day', 31)]

In [16]:
emb_szs = [(c, min(50, (c+2)//2)) for _,c in cat_sz]
emb_szs

[(54, 28), (4, 3), (12, 7), (53, 27), (31, 16)]

In [None]:
train.dtypes

In [21]:
max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)
# len(train.columns)-len(cat_vars)
m = model_data.get_learner(emb_szs, len(train.columns)-len(cat_vars),
                   0.04, 1, [10,5], [0.01,0.01], y_range=y_range)

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generic/THCTensorCopy.c:20

In [22]:
m

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(54, 28)
    (1): Embedding(4, 3)
    (2): Embedding(12, 7)
    (3): Embedding(53, 27)
    (4): Embedding(31, 16)
  )
  (lins): ModuleList(
    (0): Linear(in_features=100, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=5, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True)
    (1): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True)
  )
  (outp): Linear(in_features=5, out_features=1, bias=True)
  (emb_drop): Dropout(p=0.04)
  (drops): ModuleList(
    (0): Dropout(p=0.01)
    (1): Dropout(p=0.1)
  )
  (bn): BatchNorm1d(19, eps=1e-05, momentum=0.1, affine=True)
)

In [23]:
m.lr_find()

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/csrc/generic/serialization.cpp:38