In [1]:
#| default_exp regress_ml

# Train ML model

In [2]:
#|hide
from nbdev.showdoc import *

In [29]:
#|hide
#|export
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


In [69]:
#|export

class TrainRegression():
    """Load a CSV dataset and split it into train/test features and targets.

    On construction the CSV at `df_path` is read, the columns listed in
    `columns_to_remove` (if any) are dropped, `ground_truth_col` is separated
    out as the target, and the rows are split with `train_test_split`.

    The results are stored on the instance as `df` (loaded frame after column
    removal), `x` / `y` (features / target values) and
    `x_train`, `x_test`, `y_train`, `y_test`.
    """

    def __init__(self,
                 df_path: str,  # path to the CSV file holding the dataframe used for training
                 ground_truth_col: str,  # name of the column with the true values to train against
                 test_size: float,  # size of the test split; forwarded to `train_test_split`
                 columns_to_remove: list[str] = None,  # columns not used for training the model; they are dropped after loading
                 random_state: int = 42,  # seed forwarded to `train_test_split`; the default keeps the previously hard-coded value
                 ):
        self.df_path = df_path
        self.columns_to_remove = columns_to_remove
        self.ground_truth_col = ground_truth_col
        self.test_size = test_size
        self.random_state = random_state
        self.x_train, self.x_test, self.y_train, self.y_test = self._load_df_split_data()

    def _load_df_split_data(self):
        """Read the CSV, drop the excluded columns and split into train/test.

        Every call re-reads the file at `self.df_path` and overwrites `df`,
        `x`, `y` and the four split attributes on the instance.

        Returns the tuple `(x_train, x_test, y_train, y_test)`. The features
        come from a DataFrame and the targets from `.values` of the
        ground-truth column.
        """
        # Load the dataframe.
        self.df = pd.read_csv(self.df_path)

        # Identity check on purpose: `!= None` is evaluated element-wise for
        # NumPy arrays / pandas Index objects, which makes the `if` ambiguous
        # and raises ValueError when such a sequence is passed.
        if self.columns_to_remove is not None:
            self.df = self.df.drop(columns=self.columns_to_remove)

        # Separate the features (x) from the target (y).
        self.x = self.df.drop(columns=self.ground_truth_col)
        self.y = self.df[self.ground_truth_col].values

        # Split the data into train and test subsets.
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=self.test_size, random_state=self.random_state)

        return self.x_train, self.x_test, self.y_train, self.y_test



In [71]:
instance = TrainRegression(
    # NOTE(review): absolute, machine-specific path; this cell only runs where
    # the file exists at exactly this location. Consider a path relative to a
    # configurable data directory.
    df_path=r"D:\git\ML_projects\nbs\data\resampled_sen2.csv",
    ground_truth_col="TOC",
    test_size=0.25,
    columns_to_remove=['Unnamed: 0.1', 'Unnamed: 0', 'Lon', 'Lat', 'clay', 'silt', 'sand', 'NI'],
)

# The constructor has already loaded the CSV and stored the split on the
# instance, so read it from there instead of calling the private loader a
# second time (which would re-read the file and redo the same split).
x_train, x_test, y_train, y_test = (
    instance.x_train,
    instance.x_test,
    instance.y_train,
    instance.y_test,
)


In [74]:
# Display the held-out target values produced by the split above.
y_test

array([0.29, 0.21, 0.15, 0.94, 0.13, 0.53, 0.1 , 0.8 , 0.76, 4.82, 0.35,
       0.32, 1.64, 0.66, 0.33, 0.33, 0.92, 0.33, 0.15, 0.17, 0.17, 0.13,
       0.38, 0.41, 0.48, 0.25, 0.04, 0.57, 0.97, 0.54, 0.57, 0.14, 0.72,
       0.32, 0.47, 0.19, 1.58, 0.04, 0.06, 0.38, 0.5 , 1.59, 0.23, 0.34,
       0.23, 0.45, 0.54, 0.17, 0.06, 0.34, 0.23, 1.24, 0.27])