In [None]:
#| default_exp regress_ml

# Train ML model

In [1]:
#|hide
from nbdev.showdoc import *

In [4]:
#|hide
#|export
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from ML_projects import const_vals as CONST
# from ML_projects import rf_reg as rf_reg


import warnings
# Suppress every warning from this point on: 'ignore' with no category/module filter matches all of them.
# NOTE(review): this cell is tagged `#|export`, so the filter presumably also lands in the exported module — confirm that is intended.
warnings.filterwarnings('ignore')

In [5]:
#|export 
class TrainRegression():
    "Load a CSV file, split it into train/test sets and look up the requested model with its tuning parameters."

    def __init__(self,
                 df_path : str , # path to dataframe to be used to train. File should be CSV file
                 requested_model : str , # model to train. Options : 'RFR' 'XGB' 'SVR' 'RIDGE' 'KNEIGHBORS' 'GRADIENT_BOOST' 'ADA'
                 ground_truth_col : str , # name of the column with true data to train
                 test_size : float , # size of data to be used for test (forwarded to `train_test_split`)
                 hyper_method : str , # hyperparameter tuning method. accepts : 'randomized' 'bayesian' , 'bayesian continous'
                 columns_to_remove : list[str]=None , # columns not to use for training the model. These columns will be removed. `None` keeps every column.
                 random_state : int = 42 , # seed forwarded to `train_test_split`
                 ):
        self.df_path = df_path
        # `_match_models_` looks the model up under this attribute name; without
        # this assignment every construction failed with AttributeError.
        self.model_str = requested_model
        self.columns_to_remove = columns_to_remove
        self.ground_truth_col = ground_truth_col
        self.test_size = test_size
        # Stored only; no method visible in this class reads it yet.
        self.hyper_method = hyper_method
        self.random_state = random_state

        # load data and get train test data
        self.x_train, self.x_test, self.y_train, self.y_test = self._load_df_split_data()

        # create initial model and match the params
        self.model , self.params = self._match_models_()

    def _match_models_(self):
        "Return the `CONST.algorithm_to_model` / `CONST.algorithm_to_params` entries stored under `self.model_str`."
        self.model = CONST.algorithm_to_model[self.model_str]
        self.params = CONST.algorithm_to_params[self.model_str]

        return self.model , self.params

    def _load_df_split_data(self):
        "Read the CSV at `self.df_path`, drop the unwanted columns and return `(x_train, x_test, y_train, y_test)`."
        # load dataframe
        self.df = pd.read_csv(self.df_path)
        if self.columns_to_remove is not None:
            self.df = self.df.drop(self.columns_to_remove, axis=1)

        # features are every remaining column except the target;
        # the target is extracted with `.values`
        self.x = self.df.drop(self.ground_truth_col, axis=1)
        self.y = self.df[self.ground_truth_col].values

        # split data to train and test
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=self.test_size, random_state=self.random_state)

        return self.x_train, self.x_test, self.y_train, self.y_test
       



 




In [None]:
# Build a trainer on the resampled Sentinel-2 table.
# `requested_model` is a required argument of TrainRegression; the call
# previously omitted it and raised TypeError.
# NOTE(review): 'RFR' is taken from the options listed in the class signature — confirm it is a key of CONST.algorithm_to_model.
# NOTE(review): df_path is an absolute, machine-specific path; point it at a location that exists on your machine.
instance = TrainRegression(
    df_path=r"D:\git\ML_projects\nbs\data\resampled_sen2.csv",
    requested_model='RFR',
    ground_truth_col="TOC",
    test_size=0.25,
    columns_to_remove=['Unnamed: 0.1', 'Unnamed: 0', 'Lon', 'Lat', 'clay', 'silt', 'sand', 'NI'],
    hyper_method='bayesian',
)
# Disabled: TrainRegression, as defined in this notebook, sets no
# `the_best_model` / `the_best_params` attributes.
# test = instance.the_best_model
# print(instance.the_best_params)


In [None]:
# `test` is never assigned anywhere in the notebook (its only assignment is
# commented out), so printing it raised NameError. Show the model that the
# trainer actually stores instead.
print(instance.model)