<a href="https://colab.research.google.com/github/raffieeey/MasterResearchAutoML/blob/master/tpot/freedom/03A_TPOT_Regres.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install numpy scipy scikit-learn pandas joblib
!pip install deap update_checker tqdm stopit
!pip install xgboost
!pip install dask[delayed] dask-ml
!pip install scikit-mdr skrebate
!pip install tpot
!pip install pmlb
!pip install pytictoc



In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
import math
import numpy as np

class MetricsCalc:

    def __init__(self, truth, predictions):
        self.truth = pd.DataFrame(truth)
        self.predictions = predictions
        self._type = 'regression'

    def mae(self):
        return float(mean_absolute_error(self.truth, self.predictions))

    def mse(self):
        return float(mean_squared_error(self.truth, self.predictions))

    def msle(self):
      try:
        return float(mean_squared_log_error(self.truth, self.predictions))
      except:
        return np.nan
    
    def rmse(self):
        return math.sqrt(self.mse())

    def rmsle(self):
        return math.sqrt(self.msle())

    def r2(self):
        return float(r2_score(self.truth, self.predictions))

class CheckNegativeValue:
  def __init__(self, checknegative):
    self._is_negative = "neg" if np.min(checknegative) < 1 else "pos"
  
  def MLMetrics(self):
    if self._is_negative == "neg" :
      return ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
    elif self._is_negative == "pos" :
      return ['neg_mean_squared_log_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

In [6]:
from pmlb import classification_dataset_names, regression_dataset_names,fetch_data
from tpot import TPOTClassifier, TPOTRegressor
from sklearn.model_selection import train_test_split
from pytictoc import TicToc
import pandas as pd

df_result = pd.DataFrame(columns=["dataset_name", "metrics_recorded", "MAE", "MSE","MSLE", "RMSE", "RMSLE", "r2"])

preds = {}
cm_preds = {}
top_algo = {}
t = TicToc()


sel_clss_dtst = ['adult','agaricus-lepiota', 'churn', 'nursery', 'satimage','texture']
sel_rgrs_dtst = ['294_satellite_image','218_house_8L', '227_cpu_small', '503_wind', '344_mv','215_2dplanes']

ix = 0

for rgrs_dtst in sel_rgrs_dtst:
  for metrics_recorded  in ['neg_mean_squared_log_error']:
        print("_"*80)
        print(f"*** Dataset Name: {rgrs_dtst} ***")
        df_result.loc[ix,"dataset_name"] = rgrs_dtst 
        df_result.loc[ix,"metrics_recorded"] = metrics_recorded 
        X, y = fetch_data(rgrs_dtst , return_X_y=True)
        c=CheckNegativeValue(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=69)
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        y_train = pd.DataFrame(y_train).values.ravel()
        y_test = pd.DataFrame(y_test).values.ravel()
        list_metrics = c.MLMetrics()
        tpot = TPOTRegressor(random_state=69, max_time_mins=30, n_jobs=4,  scoring=metrics_recorded if metrics_recorded in list_metrics else None )
        t.tic()
        tpot.fit(X_train,y_train)
        t.toc()
        top_algo[rgrs_dtst] = [v[0] for i, v in tpot.pareto_front_fitted_pipelines_.items()][0]
        print("*** Top algorithm: ***", [v[0] for i, v in tpot.pareto_front_fitted_pipelines_.items()][0])
        preds[rgrs_dtst] = tpot.predict(X_test)
        p = MetricsCalc(y_test,preds[rgrs_dtst])
        df_result.loc[ix,'MAE'] = p.mae()
        df_result.loc[ix,'MSE'] = p.mse()
        df_result.loc[ix,'MSLE'] = p.msle()
        df_result.loc[ix,'RMSE'] = p.rmse()
        df_result.loc[ix,'RMSLE'] = p.rmsle()
        df_result.loc[ix,'r2'] = p.r2()
        ix+=1

________________________________________________________________________________
*** Dataset Name: 294_satellite_image ***
Elapsed time is 1820.096967 seconds.
*** Top algorithm: *** KNeighborsRegressor(n_neighbors=6)
________________________________________________________________________________
*** Dataset Name: 218_house_8L ***
Elapsed time is 1852.436664 seconds.
*** Top algorithm: *** RandomForestRegressor(bootstrap=False, max_features=0.55, min_samples_leaf=3,
                      min_samples_split=5, random_state=69)
________________________________________________________________________________
*** Dataset Name: 227_cpu_small ***
Elapsed time is 1843.300812 seconds.
*** Top algorithm: *** RandomForestRegressor(bootstrap=False, max_features=0.45, min_samples_leaf=3,
                      min_samples_split=7, random_state=69)
________________________________________________________________________________
*** Dataset Name: 503_wind ***
Elapsed time is 1811.236962 seconds.
*** 

In [0]:
for i, val in enumerate(preds):
  print(preds[val])

In [0]:
for i, val in enumerate(preds):
  pd.DataFrame(preds[val]).to_csv(f"{val}.csv")

In [7]:
df_result

Unnamed: 0,dataset_name,metrics_recorded,MAE,MSE,MSLE,RMSE,RMSLE,r2
0,294_satellite_image,neg_mean_squared_log_error,0.29089,0.512509,0.0182923,0.715897,0.135249,0.894068
1,218_house_8L,neg_mean_squared_log_error,15639.6,871583000.0,0.390546,29522.6,0.624936,0.663248
2,227_cpu_small,neg_mean_squared_log_error,1.93083,7.78171,0.00236246,2.78957,0.0486052,0.97547
3,503_wind,neg_mean_squared_log_error,2.32154,8.95707,0.0507667,2.99284,0.225315,0.797502
4,344_mv,neg_mean_squared_log_error,0.0262607,0.00387869,,0.0622791,,0.999964
5,215_2dplanes,neg_mean_squared_log_error,0.790474,0.994172,,0.997082,,0.947961


In [8]:
top_algo

{'215_2dplanes': DecisionTreeRegressor(max_depth=7, min_samples_leaf=11, min_samples_split=16,
                       random_state=69),
 '218_house_8L': RandomForestRegressor(bootstrap=False, max_features=0.55, min_samples_leaf=3,
                       min_samples_split=5, random_state=69),
 '227_cpu_small': RandomForestRegressor(bootstrap=False, max_features=0.45, min_samples_leaf=3,
                       min_samples_split=7, random_state=69),
 '294_satellite_image': KNeighborsRegressor(n_neighbors=6),
 '344_mv': ExtraTreesRegressor(bootstrap=True, max_features=1.0, min_samples_leaf=5,
                     min_samples_split=11, random_state=69),
 '503_wind': XGBRegressor(max_depth=4, min_child_weight=3, nthread=1,
              objective='reg:squarederror', random_state=69,
              subsample=0.6000000000000001)}

In [0]:
from google.colab import files
df_result.to_csv("data_1.csv")
#files.download('data_1.csv')



with open('data_1.txt', 'w') as f:
    f.write(str(top_algo))

#files.download('data_1.txt')