# December TPS with Fastai 

This notebook builds on https://www.kaggle.com/casati8/kaggle-tps-dec2021-fastai to demonstrate how the fastai library can be used in this competition.

In [None]:
from fastai.tabular.all import * 
from os import sys

In [None]:
# Competition data directory; the three CSV files are read straight from it.
# Path's `/` operator is used so this cell does not depend on `os` happening to be
# re-exported by the fastai star import.
path = Path('../input/tabular-playground-series-dec-2021')
train = pd.read_csv(path / 'train.csv')
test = pd.read_csv(path / 'test.csv')
sub = pd.read_csv(path / 'sample_submission.csv')

# Total number of missing cells in the train and test frames.
# (`isnull` is an alias of `isna`, so one check per frame is enough.)
train.isna().sum().sum(), test.isna().sum().sum()

In [None]:
# Name of the target column that the model is trained to predict.
dep_var = 'Cover_Type'
# Display the distinct target values that occur in the training data.
pd.unique(train[dep_var])

Reduce the memory usage for training

In [None]:
def _frame_mbytes(frame):
    """Return the total reported by `frame.memory_usage()`, divided by 1024**2."""
    return frame.memory_usage().sum() / 1024**2


# Measure the footprint of `train` on both sides of the df_shrink call.
memory_usage_before = _frame_mbytes(train)
train = df_shrink(train)
memory_usage_after = _frame_mbytes(train)

print('Memory usage (MByte) before the shrinking:', memory_usage_before, ' , after the shrinking: ', memory_usage_after)

Feature engineering

In [None]:
def _pack_indicator_columns(df, features):
    """Fold the `features` columns of `df` into one int64 value per row.

    The i-th column (in the order given) contributes `value * 2**i`. The arithmetic
    is done in int64 regardless of the columns' own dtype, so narrow integer columns
    (e.g. int8) cannot overflow or depend on NumPy's scalar promotion rules.
    """
    weights = np.left_shift(np.int64(1), np.arange(len(features), dtype=np.int64))
    return df[features].to_numpy(dtype=np.int64) @ weights


def pre_process(df):
    """Add engineered features to `df` in place.

    Steps, in order:
      * shift `Aspect` by one 360 step: values below 0 get +360, values above 359
        get -360;
      * clip every column whose name starts with "Hillshade" to [0, 255] and add
        `hillshade_mean`, `hillshade_amp` (max - min) and `hillshade_count`
        (row-wise sum) over those columns;
      * add `hydrology_elevation` = Vertical_Distance_To_Hydrology - Elevation;
      * for the "Soil_Type*" and "Wilderness_Area*" column groups, add a row-wise
        sum (`soiltype_count`, `wilderness_count`) and a bit-packed int64 label
        (`soiltype_label`, `wilderness_label`), then drop the source columns.

    The function mutates `df` and returns None. Calling it again on an already
    processed frame leaves the existing count/label columns untouched instead of
    overwriting them with zeros.
    """
    df.loc[df["Aspect"] < 0, "Aspect"] += 360
    df.loc[df["Aspect"] > 359, "Aspect"] -= 360

    # Engineered columns are lower-case ("hillshade_*"), so this case-sensitive
    # prefix match only ever picks up the raw Hillshade columns.
    hill_features = [c for c in df.columns if c.startswith("Hillshade")]
    for col in hill_features:
        df[col] = df[col].clip(lower=0, upper=255)

    hillshade = df[hill_features]
    df["hillshade_mean"] = hillshade.mean(axis=1)
    df["hillshade_amp"] = hillshade.max(axis=1) - hillshade.min(axis=1)
    df["hillshade_count"] = hillshade.sum(axis=1)

    df["hydrology_elevation"] = df["Vertical_Distance_To_Hydrology"] - df["Elevation"]

    # (column-name prefix of the indicator group, stem of the derived column names)
    groups = (("Soil_Type", "soiltype"), ("Wilderness_Area", "wilderness"))

    pending = []
    for prefix, stem in groups:
        features = [c for c in df.columns if c.startswith(prefix)]
        if not features and stem + "_label" in df.columns:
            # Source columns were already folded into the label by an earlier call;
            # recomputing from nothing would reset the label and count to 0.
            continue
        df[stem + "_count"] = df[features].sum(axis=1)
        pending.append((stem, features))

    # Labels are added after both counts so the resulting column order matches
    # count, count, label, label.
    for stem, features in pending:
        df[stem + "_label"] = _pack_indicator_columns(df, features)
        df.drop(columns=features, inplace=True)
    

In [None]:
# pre_process mutates its argument, so both frames are updated in place.
for frame in (train, test):
    pre_process(frame)

Treat wilderness area and soil type labels as categorical and all other features as continuous

In [None]:
cat_vars = ['wilderness_label','soiltype_label']
# Every remaining column except the row id and the target is treated as continuous.
# Filtering train.columns (instead of taking a set difference) keeps the column
# order identical across kernel restarts; set iteration order for strings is not.
excluded = set(cat_vars) | {'Id', dep_var}
cont_vars = [col for col in train.columns if col not in excluded]
cont_vars, cat_vars

Train several models and average their predictions

In [None]:
# One accumulator column per class label 1..7; the submission cell takes the argmax
# over these columns and adds 1 to recover the label.
cols = [dep_var+'_'+str(i+1) for i in range(7)]
# Start from float zeros directly. Building an empty frame and calling fillna(0)
# produces object-dtype columns and relies on pandas' deprecated downcasting.
pred = pd.DataFrame(0.0, index=sub.index, columns=cols)
# One row of final metrics per trained model.
log = pd.DataFrame(columns=['Loss Train','Loss Validation','Accuracy'])

# Stop when the validation loss has not improved for 10 epochs, and keep the
# weights of the epoch with the best validation loss under the name 'fastai'.
cbs = [ EarlyStoppingCallback(monitor='valid_loss', min_delta=0.0, patience=10),
        SaveModelCallback(monitor='valid_loss', comp=None, min_delta=0.0, 
                          fname='fastai', every_epoch=False, 
                          with_opt=False, reset_on_fit=True)] 
# Number of models in the ensemble; each contributes 1/n of the averaged prediction.
n=3
for i in range(n):
    # Seed per model: runs become repeatable while every ensemble member still gets
    # its own train/validation split and weight initialisation.
    set_seed(i)
    splits = RandomSplitter(valid_pct=0.05)(range_of(train))
    to = TabularPandas(
        train,
        y_names=dep_var,
        y_block = CategoryBlock,
        cont_names = cont_vars,
        cat_names = cat_vars,
        procs = [Categorify,Normalize],
        splits=splits)
    
    loaders = to.dataloaders(bs=1024*8)
    learn = tabular_learner(loaders, metrics=accuracy,layers=[128, 64, 64, 16],cbs=cbs)
    print ('Start learning - Iter',i)
    learn.fit_one_cycle(50,0.01,wd=0.2)
    # Entries 1..3 of the recorder's log are stored under the three `log` columns.
    # NOTE(review): presumably these are the metrics of the last epoch that ran,
    # not necessarily of the best checkpoint -- confirm against fastai's Recorder.
    loss = list(learn.recorder.log[1:4])
    log.loc[len(log)] = loss
    dl = learn.dls.test_dl(test[cont_vars+cat_vars])
    nn_preds, _ = learn.get_preds(dl=dl) 
    # The accumulator and the later argmax+1 both assume exactly len(cols) output
    # columns; fail with a clear message rather than an opaque broadcasting error.
    if nn_preds.shape[1] != len(cols):
        raise ValueError(
            'model produced %d class columns, expected %d' % (nn_preds.shape[1], len(cols)))
    pred[cols]+= nn_preds.numpy()/n

log.describe()

Review the learning rate

In [None]:
# Run the learning-rate finder on `learn`, i.e. the learner left over from the
# final iteration of the training loop.
learn.lr_find()

Submission

In [None]:
# Pick, for every test row, the column of `pred` with the largest averaged value.
# Column positions are 0-based, so add 1 to obtain the value written to the file.
best_column = pred.to_numpy().argmax(axis=1)
sub[dep_var] = best_column + 1
sub.to_csv("first_attempt.csv", index=False)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the submission frame.
sub.info()