# Modeling

## Setup

In [2]:
%matplotlib inline

In [27]:
from pathlib import Path
import pandas as pd
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix

In [4]:
root_path = Path.cwd().parent
interim_data_path = root_path/"data/interim"

## Load data

In [5]:
df = pd.read_parquet(interim_data_path/"train.parquet")

In [6]:
df.shape

(463715, 91)

## Specify types

In [7]:
df.year = df.year.astype("category")

In [8]:
df.dtypes

year       category
feat_0      float64
feat_1      float64
feat_2      float64
feat_3      float64
             ...   
feat_85     float64
feat_86     float64
feat_87     float64
feat_88     float64
feat_89     float64
Length: 91, dtype: object

## Preprocessing

In [9]:
RANDOM_STATE=1337
N_JOBS=-1

In [10]:
cont_feat, cat_feat = cont_cat_split(df, dep_var="year")

In [11]:
splits = RandomSplitter(valid_pct=0.2, seed=RANDOM_STATE)(range_of(df))

In [12]:
to = TabularPandas(df=df, procs=[Normalize], cat_names=cat_feat, cont_names=cont_feat, y_names="year", splits=splits)

In [13]:
len(to.train)

370972

In [14]:
len(to.valid)

92743

In [15]:
dls = to.dataloaders(bs=1024)

In [16]:
X_train, y_train = to.train.xs, to.train.y
X_valid, y_valid = to.valid.xs, to.valid.y

### Random Forrest

Apply best practices from fastai book, i.e. setting max_samples to *200.000* if dealing with more than 200.000 samples

In [17]:
rf_model = RandomForestClassifier(n_estimators=10, 
                                  max_samples=200_000, 
                                  max_features=0.5, 
                                  min_samples_leaf=4, 
                                  oob_score=True,
                                  n_jobs=N_JOBS, 
                                  random_state=RANDOM_STATE)

In [18]:
rf_model.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(max_features=0.5, max_samples=200000, min_samples_leaf=4,
                       n_estimators=10, n_jobs=-1, oob_score=True,
                       random_state=1337)

In [19]:
y_pred_valid = rf_model.predict(X_valid)

In [24]:
accuracy_score(y_valid, y_pred_valid)

0.08816837928468994

In [None]:
con

ValueError: The number of FixedLocator locations (59), usually from a call to set_ticks, does not match the number of ticklabels (89).

### Neural Net

In [None]:
learn = tabular_learner(dls, metrics=accuracy)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(1)

In [None]:
learn.show_results()