# In [None]:
import os
import sys
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression as LR
from sklearn import svm
from sklearn.linear_model import ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from multiprocessing import cpu_count
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def load_partitioning_data(train_data):
    """Split a data frame into standardized train/test features and targets.

    The target is read from the second-to-last column (by position), while the
    feature matrix is every column except the one named 'Topt'.
    NOTE(review): this presumably relies on 'Topt' being the second-to-last
    column -- confirm against the input files; if it is not, the target
    column would remain among the features.

    Args:
        train_data: pandas DataFrame that contains a 'Topt' column.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``; 10% of the rows are held
        out as the test set using ``random_state=212``.
    """
    from sklearn.preprocessing import StandardScaler

    y = train_data.iloc[:, -2].values
    X = train_data.drop(['Topt'], axis=1).values

    # Split BEFORE scaling: fitting the scaler on the full data set would let
    # test-set statistics (mean/variance) leak into the training features.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=212
    )

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    # The held-out rows are transformed with the training-set statistics only.
    x_test = scaler.transform(x_test)
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    return x_train, y_train, x_test, y_test

# elastic_net
def elastic_net(X_train, y_train):
    """Fit an ``ElasticNetCV`` (``n_jobs=-1``) on the training data.

    Returns:
        The fitted ``ElasticNetCV`` estimator.
    """
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return ElasticNetCV(n_jobs=-1).fit(X_train, y_train)

# bayesridge
def bayesridge(X_train, y_train):
    """Fit a default-configured ``BayesianRidge`` on the training data.

    Returns:
        The fitted ``BayesianRidge`` estimator.
    """
    regressor = BayesianRidge()
    # ``fit`` hands back the same (now fitted) estimator instance.
    return regressor.fit(X_train, y_train)

# svr
def svr(X_train, y_train):
    """Grid-search an RBF-kernel SVR over ``C`` and ``epsilon``.

    ``C`` ranges over 16 powers of two from 2**-5 to 2**10; the search uses
    3-fold cross-validation with ``n_jobs=-1``.

    Returns:
        The ``best_estimator_`` found by the grid search.
    """
    search_space = {
        'C': np.logspace(-5, 10, num=16, base=2.0),
        'epsilon': [0, 0.01, 0.1, 0.5, 1.0, 2.0, 4.0],
    }
    base_estimator = svm.SVR(kernel='rbf', gamma='auto')
    search = GridSearchCV(base_estimator, search_space, n_jobs=-1, cv=3)
    search.fit(X_train, y_train)

    return search.best_estimator_

# Decision Tree
def decisiontree(X_train, y_train):
    """Grid-search a ``DecisionTreeRegressor`` over ``min_samples_leaf``.

    Ten evenly spaced fractional values between 0.01 and 0.5 are tried with
    3-fold cross-validation and ``n_jobs=-1``.

    Returns:
        The ``best_estimator_`` found by the grid search.
    """
    leaf_fractions = np.linspace(0.01, 0.5, 10)
    tuner = GridSearchCV(
        DecisionTreeRegressor(),
        {'min_samples_leaf': leaf_fractions},
        n_jobs=-1,
        cv=3,
    )
    # ``fit`` returns the search object, so the lookup can be chained.
    return tuner.fit(X_train, y_train).best_estimator_

def random_forest(X_train, y_train):
    """Grid-search a 1000-tree ``RandomForestRegressor`` over ``max_features``.

    The forest is seeded with ``random_state=256`` and runs with ``verbose=1``;
    the candidate ``max_features`` fractions come from
    ``np.arange(0.1, 1.1, 0.1)`` and are scored with 3-fold cross-validation
    (``n_jobs=-1``).

    Returns:
        The ``best_estimator_`` found by the grid search.
    """
    forest = RandomForestRegressor(n_estimators=1000, verbose=1, random_state=256)
    feature_grid = {'max_features': np.arange(0.1, 1.1, 0.1)}

    tuner = GridSearchCV(forest, feature_grid, n_jobs=-1, cv=3)
    tuner.fit(X_train, y_train)

    return tuner.best_estimator_

def loading(model, X_test, y_test, model_name, exp_type, file_type):
    """Score a fitted model on the test split, save its predictions, print metrics.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values for the test rows.
        model_name: Label used in the printed report and the CSV file name.
        exp_type: Experiment label used in the CSV file name.
        file_type: Data-set label used in the CSV file name.

    Returns:
        Tuple ``(r2, mae, mse, rmse)`` computed on the test split.

    Side effects:
        Writes ``<model_name>_<exp_type>_<file_type>.csv`` (columns ``y_test``
        and ``y_pred``) to the current working directory and prints the four
        metrics.
    """
    predictions = model.predict(X_test)

    # Metrics are computed before anything is written to disk.
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    csv_name = "_".join((model_name, exp_type, file_type)) + ".csv"
    pd.DataFrame({'y_test': y_test, 'y_pred': predictions}).to_csv(csv_name, index=False)

    report = (
        ('R2 in test ({}): {}', r2),
        ('MAE in test ({}): {}', mae),
        ('MSE in test ({}): {}', mse),
        ('RMSE in test ({}): {}', rmse),
    )
    for template, value in report:
        print(template.format(model_name, value))

    return r2, mae, mse, rmse

def Exp_ML(train_data, exp_type, file_type):
    """Train and evaluate the models selected by ``exp_type`` on one data set.

    Args:
        train_data: pandas DataFrame handed to ``load_partitioning_data``.
        exp_type: ``"Search"`` trains elastic net, Bayesian ridge, SVR,
            decision tree and random forest (in that order);
            ``"Comparation"`` trains the random forest only. Any other value
            trains nothing.
        file_type: Data-set label forwarded to ``loading`` (used in the
            output CSV file name).

    Returns:
        A list with one ``[r2, mae, mse, rmse]`` entry per trained model, in
        training order; empty when ``exp_type`` is not recognised.
    """
    X_train, y_train, X_test, y_test = load_partitioning_data(train_data)

    # Ordered (name, trainer) pairs. The previous if/elif chain tested
    # 'decisiontree' twice, which made the random-forest branch unreachable
    # in "Search" mode; a dispatch table cannot drift out of sync like that.
    if exp_type == "Search":
        trainers = (
            ('elastic_net', elastic_net),
            ('bayesridge', bayesridge),
            ('svr', svr),
            ('decisiontree', decisiontree),
            ('random_forest', random_forest),
        )
    elif exp_type == "Comparation":
        trainers = (('random_forest', random_forest),)
    else:
        # Unknown experiment types are tolerated and yield no results.
        trainers = ()

    list_all = []
    for model_name, train in trainers:
        fitted = train(X_train, y_train)
        r2, mae, mse, rmse = loading(fitted, X_test, y_test, model_name, exp_type, file_type)
        list_all.append([r2, mae, mse, rmse])

    return list_all


if __name__ == "__main__":
    # Search test (alternative configuration, currently disabled):
    #   cultivation_data = pd.read_csv('ML_data/cultivation_Search_exp.csv')
    #   journals_data = pd.read_csv('ML_data/journals_Search_exp.csv')
    #   exp_type = "Search"

    # Comparation test
    cultivation_data = pd.read_csv('ML_data/cultivation_Comparation_exp.csv')
    journals_data = pd.read_csv('ML_data/journals_Comparation_exp.csv')
    exp_type = "Comparation"

    # Pair each data-set label with its frame; the label is also forwarded to
    # Exp_ML as the file_type argument.
    datasets = (
        ('cultivation_data', cultivation_data),
        ('journals_data', journals_data),
    )

    final_list = []
    for label, frame in datasets:
        print("training {}".format(label))
        # Each result is wrapped in its own single-element list.
        final_list.append([Exp_ML(frame, exp_type, label)])
    print(final_list)
    