In [None]:
from sklearn.ensemble import RandomForestRegressor
from fastai.metrics import accuracy,rmse
from rfpimp import *
import math
import torch


def r_mse(x,y):
  """Return the root-mean-squared error between `x` and `y`, rounded to 6 decimals.

  Both arguments are converted with `torch.tensor` and handed to the `rmse`
  metric imported from `fastai.metrics`; the result is unwrapped to a Python
  number with `.item()` before rounding.
  """
  predicted=torch.tensor(x)
  actual=torch.tensor(y)
  error=rmse(predicted,actual).item()
  return round(error,6)

class Randomforest():
  """Small wrapper that builds, fits and scores a `RandomForestRegressor`.

  NOTE: `model(x, y)` rebinds `self.model` (to the estimator) and `self.fit`
  (to whatever `estimator.fit` returns) on the instance. After the first call
  the `model` method is therefore hidden by the estimator attribute, so use a
  fresh instance to train again. The attribute names are kept because the
  other methods (and subclasses) read them.
  """
  def __init__(self,**kwargs):
    """Store the forest hyper-parameters.

    All of `n_estimators`, `min_samples_leaf`, `max_samples`, `max_features`,
    `n_jobs` and `oob_score` must be passed as keyword arguments; a missing
    one raises `KeyError`.
    """
    self.n_estimators=kwargs['n_estimators']
    self.min_samples_leaf=kwargs['min_samples_leaf']
    self.max_samples=kwargs['max_samples']
    self.max_features=kwargs['max_features']
    self.n_jobs=kwargs['n_jobs']
    self.oob_score=kwargs['oob_score']

  def model(self,x,y):
    """Create the regressor from the stored hyper-parameters, fit it on `x`, `y` and return the fit result."""
    self.model=RandomForestRegressor(n_estimators=self.n_estimators,min_samples_leaf=self.min_samples_leaf
                               ,max_samples=self.max_samples,max_features=self.max_features
                               ,n_jobs=self.n_jobs,oob_score=self.oob_score)
    self.fit=self.model.fit(x,y)
    return self.fit
  
  def model_rmse(self,x,y):
    """Print the RMSE of the fitted model on `x`, `y` and remember it in `self.root_mse`.

    Returns None (the value is only printed and stored).
    """
    self.root_mse=r_mse(self.fit.predict(x),y)
    # The value is a *root* mean squared error, so label it as such.
    print('rmse:  ',self.root_mse)
  
  def model_accuracy(self,x,y):
    """Print `1 - RMSE` of the fitted model on `x`, `y`.

    The score is computed from the data passed in rather than from the value
    cached by the last `model_rmse` call, so it is correct for any dataset and
    does not require `model_rmse` to have been called first.
    Returns None.
    """
    score=1-r_mse(self.fit.predict(x),y)
    print('accuracy:  ',score)
  
  def importance(self,x,y):
    """Compute feature importances with `importances` and plot them with `plot_importances`."""
    imp=importances(self.model,x,y)
    return plot_importances(imp)
  
  def prediction(self,x):
    """Return the estimator's predictions for `x`."""
    return self.model.predict(x)


import xgboost as xgb
from xgboost import plot_importance
class xgbreg(Randomforest):
  """`Randomforest`-style wrapper around `xgb.XGBRegressor`.

  `model_rmse`, `model_accuracy` and `prediction` are inherited unchanged from
  `Randomforest`; only construction, fitting and the importance plot differ.
  """
  def __init__(self,**kwargs):
    """Store the booster hyper-parameters.

    All of `n_estimators`, `max_depth`, `learning_rate`, `n_jobs`,
    `early_stopping_rounds` and `subsample` must be passed as keyword
    arguments; a missing one raises `KeyError`.
    """
    self.n_estimators =kwargs['n_estimators'] 
    self.max_depth=kwargs['max_depth'] 
    self.learning_rate=kwargs['learning_rate'] 
    self.n_jobs=kwargs['n_jobs']
    self.early_stopping_rounds=kwargs['early_stopping_rounds']
    self.subsample=kwargs['subsample']

  def model(self,x,y,eval_set=None):
    """Create the regressor, fit it on `x`, `y` and return the fit result.

    `eval_set` is optional and only forwarded to `fit` when given, so the
    default call behaves exactly as before. It exists so a validation set can
    be supplied when `early_stopping_rounds` is set.
    As in the base class, this rebinds `self.model` and `self.fit`.
    """
    self.model=xgb.XGBRegressor(n_estimators=self.n_estimators,learning_rate =self.learning_rate,
                    max_depth =self.max_depth,n_jobs =self.n_jobs,
                    early_stopping_rounds=self.early_stopping_rounds,subsample=self.subsample)
    fit_kwargs={} if eval_set is None else {'eval_set':eval_set}
    self.fit=self.model.fit(x,y,**fit_kwargs)
    return self.fit

  def importance(self,x=None,y=None):
    """Plot the fitted model's feature importance with `plot_importance`.

    `x` and `y` are accepted only so the signature stays compatible with
    `Randomforest.importance(x, y)`; they are not used.
    """
    return plot_importance(self.model)


class NN():
  """Tabular neural-network learner built from a tabular data object `to`.

  `to` must provide `dataloaders()` and `train.y` (presumably a fastai
  `TabularPandas` - confirm against the caller).
  """
  def __init__(self,to,layers=None):
    """Build dataloaders, the target range, callbacks and the learner.

    layers: hidden layer sizes; defaults to `[200, 100]` when omitted.
    """
    self.dls=to.dataloaders()
    # A fresh list per instance instead of a shared mutable default argument.
    self.layers=[200,100] if layers is None else layers
    # floor (not int) so a negative fractional minimum is still inside the range;
    # for non-negative targets this matches the previous int() truncation.
    y_min,y_max=to.train.y.min(),to.train.y.max()
    self.y_range=(math.floor(y_min),math.floor(y_max)+1)
    self.tc = tabular_config(ps=[0.001, 0.01], embed_p=0.04)
    # Save the model under 'nnlearner' whenever the monitored '_rmse' value decreases.
    self.callbacks=[SaveModelCallback(monitor='_rmse',comp=np.less,fname='nnlearner')]
    self.learn=tabular_learner(self.dls,y_range=self.y_range,layers=self.layers,config=self.tc,
                              loss_func=F.mse_loss,metrics=rmse)  

  def train(self,lr,epochs=50):
    """Run `fit_one_cycle` for `epochs` epochs (50 by default) at learning rate `lr` with the stored callbacks."""
    return self.learn.fit_one_cycle(epochs,lr,cbs=self.callbacks)

  def predict(self):   
    """Return the first element of `learn.get_preds()`."""
    return self.learn.get_preds()[0]

  def importance(self):
    """Build a `PermutationImportance` for the learner (which computes and plots on construction)."""
    return PermutationImportance(self.learn)
  

def implement(m,to):
  """Fit `m` on the training split of `to`, report scores, and return validation predictions.

  `m` must offer `model`, `model_rmse`, `model_accuracy` and `prediction`;
  `to` must offer `train`/`valid`, each with `xs` and a `y` supporting `ravel()`.
  Scores are printed for the training split first, then the validation split.
  """
  train_split=('train>>>',to.train.xs,to.train.y.ravel())
  valid_split=('valid>>>',to.valid.xs,to.valid.y.ravel())

  _,fit_xs,fit_y=train_split
  m.model(fit_xs,fit_y)

  for banner,features,targets in (train_split,valid_split):
    print(banner)
    m.model_rmse(features,targets)
    m.model_accuracy(features,targets)

  return m.prediction(valid_split[1])

class PermutationImportance():
  "Calculate and plot the permutation importance"
  def __init__(self, learn:"Learner", df=None, bs=None):
    "Initialize with a learner and, optionally, a test dataframe and a batch size; computes and plots immediately"
    self.learn = learn
    self.df = df
    bs = bs if bs is not None else learn.dls.bs
    # With no dataframe given, use the learner's dataloader at index 1.
    self.dl = learn.dls.test_dl(self.df, bs=bs) if self.df is not None else learn.dls[1]
    # Match `_na` as a suffix: a substring test would also drop features such as `first_name`.
    self.x_names = learn.dls.x_names.filter(lambda x: not x.endswith('_na'))
    self.na = learn.dls.x_names.filter(lambda x: x.endswith('_na'))
    self.y = learn.dls.y_names
    self.results = self.calc_feat_importance()
    self.plot_importance(self.ord_dic_to_df(self.results))

  def measure_col(self, name:str):
    "Return the validation metric after shuffling column `name` together with its `_na` companion, if one exists"
    cols = [name]
    na_name = f'{name}_na'
    # Shuffle the missing-value indicator with the same permutation so rows stay consistent.
    if na_name in self.na: cols.append(na_name)
    items = self.dl.items
    # Back up each column separately (as a copy) so dtypes are preserved on restore
    # and the backup cannot be altered by the writes below.
    saved = {c: items[c].values.copy() for c in cols}
    perm = np.random.permutation(len(items))
    try:
      for c in cols: items[c] = saved[c][perm]
      # Index 1 of `validate`'s result is used as the metric (index 0 is presumably the loss - confirm).
      metric = self.learn.validate(dl=self.dl)[1]
    finally:
      # Always put the original data back, even if validation raises.
      for c in cols: items[c] = saved[c]
    return metric

  def calc_feat_importance(self):
    "Calculates permutation importance by shuffling a column on a percentage scale"
    print('Getting base error')
    base_error = self.learn.validate(dl=self.dl)[1]
    self.importance = {}
    pbar = progress_bar(self.x_names)
    print('Calculating Permutation Importance')
    for col in pbar:
      # Relative change versus the unshuffled metric; this can be adjusted.
      # NOTE(review): for an error-type metric a more important feature yields a more negative value - confirm the intended sign.
      self.importance[col] = (base_error-self.measure_col(col))/base_error
    return OrderedDict(sorted(self.importance.items(), key=lambda kv: kv[1], reverse=True))

  def ord_dic_to_df(self, dict:OrderedDict):
    "Convert the importance mapping into a two-column dataframe (`feature`, `importance`)"
    return pd.DataFrame([[k, v] for k, v in dict.items()], columns=['feature', 'importance'])

  def plot_importance(self, df:pd.DataFrame, limit=20, asc=False, **kwargs):
    "Plot importance with an optional limit to how many variables shown"
    df_copy = df.copy()
    # Keep labels readable by truncating feature names to 25 characters.
    df_copy['feature'] = df_copy['feature'].str.slice(0,25)
    # Select the first `limit` rows in the requested order, then reverse the order for the horizontal bars.
    df_copy = df_copy.sort_values(by='importance', ascending=asc)[:limit].sort_values(by='importance', ascending=not(asc))
    # `sort_columns` is intentionally not passed: newer pandas no longer accepts it.
    ax = df_copy.plot.barh(x='feature', y='importance', **kwargs)
    for p in ax.patches:
      ax.annotate(f'{p.get_width():.4f}', ((p.get_width() * 1.005), p.get_y()  * 1.005))