In [1]:
# import models from tpot
from tpot import TPOTClassifier, TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import OrdinalEncoder
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np
from sklearn import metrics as met


def preprocess_data(df):
    """Split a raw frame into model features and the regression target.

    The identifier columns 'name' and 'code' are discarded, the
    'pop_growth' column becomes the target, and every remaining column is
    returned as a feature. The frame passed in is not modified.

    Returns:
        (X, Y): the feature DataFrame and the 'pop_growth' Series.

    Raises:
        KeyError: if 'name', 'code' or 'pop_growth' is not a column of `df`.
    """
    target_column = 'pop_growth'

    # identifiers carry no predictive signal, so they go first
    without_ids = df.drop(columns=['name', 'code'])

    target = without_ids[target_column]
    features = without_ids.drop(columns=[target_column])
    return features, target


In [2]:
# Regression metrics keyed by short name; every entry is called as f(y_true, y_pred).
# 'mse' and 'rmse' deliberately avoid the `squared=` keyword of mean_squared_error:
# newer scikit-learn releases deprecated and then removed it, so relying on it
# makes this cell fail with a TypeError after an upgrade.
score_function_reg = {'mse': met.mean_squared_error,
    'mae': met.mean_absolute_error,
    'r2': met.r2_score,
    # RMSE = square root of each output's MSE, averaged over the outputs
    # (the same thing squared=False computed, including for multi-output targets)
    'rmse': lambda y_true, y_pred: float(
        np.mean(np.sqrt(met.mean_squared_error(y_true, y_pred, multioutput='raw_values')))),
    'msle': met.mean_squared_log_error}

def try_TPOT_reg(X, y, metrics = ['mse'], test_size=0.2, save=True, save_path=None, generations=5, population_size=20, cv=5,
                    random_state=42, verbosity=2):
    """Run a TPOT regression search on a train split and score it on the held-out split.

    The dataset passed is assumed to be ready to be processed: all its
    features are numerical and all its missing values are imputed/discarded.

    Parameters:
        X, y: features and target, forwarded to train_test_split.
        metrics: a metric name or an iterable of names; each must be a key of
            score_function_reg. The object passed in is never modified.
        test_size: fraction of the data held out for scoring.
        save: when True the best pipeline is exported after fitting.
        save_path: required (not None) when `save` is True.
        generations, population_size, cv, random_state, verbosity: forwarded
            to TPOTRegressor unchanged.

    Returns:
        (reg, scores): the fitted TPOTRegressor and a dict mapping each
        metric that was actually computed to its score rounded to 3 decimals.
        'msle' is silently left out when the target contains non-positive
        values or the model predicts negative values.

    Raises:
        ValueError: if `save` is True and `save_path` is None.
        KeyError: if a requested metric is not in score_function_reg.
    """
    # in case no save path is specified and save==True
    if save and save_path is None:
        raise ValueError("Please pass a path to save the model or set the 'save' parameter to False")

    # Work on a private list: accepts a single name given as a string and
    # guarantees neither the caller's list nor the shared default is mutated.
    metrics = [metrics] if isinstance(metrics, str) else list(metrics)

    # Fail before the (long) TPOT search rather than after it.
    unknown = [m for m in metrics if m not in score_function_reg]
    if unknown:
        raise KeyError("Unknown regression metric(s): {}".format(unknown))

    if 'msle' in metrics and (np.asarray(y) <= 0).any():
        # msle cannot be used for target variables with non-positive values
        metrics = [m for m in metrics if m != 'msle']

    # train test split; note the split seed is fixed and independent of `random_state`
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=11)

    # Standard TPOT Regressor Initializer
    reg = TPOTRegressor(generations=generations, population_size=population_size, cv=cv,
                        random_state=random_state, verbosity=verbosity)

    # fitting to the data
    reg.fit(X_train, y_train)
    # predicting outcome of test data
    y_pred = reg.predict(X_test)

    if 'msle' in metrics and (np.asarray(y_pred) < 0).any():
        # the log error is undefined for negative predictions; skip it instead
        # of losing the fitted model to an exception
        metrics = [m for m in metrics if m != 'msle']

    # calculating and storing the requested scores
    scores = {m: round(score_function_reg[m](y_test, y_pred), 3) for m in metrics}

    # exporting the model only when the caller asked for it
    if save:
        # NOTE(review): TPOT documents the second positional argument of export() as
        # `data_file_path`, not an output location, so the pipeline is written to
        # 'tpot_model_reg.py' in the working directory. Confirm the intended use of save_path.
        reg.export('tpot_model_reg.py', save_path)

    return reg, scores


In [3]:
# Classification metrics keyed by short name; every entry is called as f(y_true, y_pred).
score_function_clf = dict(
    acc=met.accuracy_score,
    f1=met.f1_score,
    bal_acc=met.balanced_accuracy_score,
    precision=met.precision_score,
    recall=met.recall_score,
)

def try_TPOT_clf(X, y, metrics = ['acc'], test_size=0.2, save=True, save_path=None, generations=5, population_size=20, cv=5,
                    random_state=42, verbosity=2):
    """Run a TPOT classification search on a train split and score it on the held-out split.

    The dataset passed is assumed to be ready to be processed: all its
    features are numerical and all its missing values are dealt with.

    Parameters:
        X, y: features and target, forwarded to train_test_split.
        metrics: a metric name or an iterable of names; each must be a key of
            score_function_clf. The object passed in is never modified.
        test_size: fraction of the data held out for scoring.
        save: when True the best pipeline is exported after fitting.
        save_path: required (not None) when `save` is True.
        generations, population_size, cv, random_state, verbosity: forwarded
            to TPOTClassifier unchanged.

    Returns:
        (clf, scores): the fitted TPOTClassifier and a dict mapping each
        metric that was actually computed to its score rounded to 3 decimals.
        'precision', 'recall' and 'f1' are silently left out when the target
        has more than two distinct values.

    Raises:
        ValueError: if `save` is True and `save_path` is None.
        KeyError: if a requested metric is not in score_function_clf.
    """
    # in case no save path is specified and save==True
    if save and save_path is None:
        raise ValueError("Please pass a path to save the model or set the 'save' parameter to False")

    # Work on a private list: accepts a single name given as a string and
    # guarantees neither the caller's list nor the shared default is mutated.
    metrics = [metrics] if isinstance(metrics, str) else list(metrics)

    # Fail before the (long) TPOT search rather than after it.
    unknown = [m for m in metrics if m not in score_function_clf]
    if unknown:
        raise KeyError("Unknown classification metric(s): {}".format(unknown))

    if len(np.unique(y)) > 2:
        # precision, recall and f1 will not be used for multi-class target variables
        binary_only = ('precision', 'recall', 'f1')
        metrics = [m for m in metrics if m not in binary_only]

    # train test split; note the split seed is fixed and independent of `random_state`
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=11)

    # Standard TPOT Classifier Initializer
    clf = TPOTClassifier(generations=generations, population_size=population_size, cv=cv,
                         random_state=random_state, verbosity=verbosity)

    # fitting to the data
    clf.fit(X_train, y_train)
    # predicting outcome of test data
    y_pred = clf.predict(X_test)
    # calculating and storing the requested scores
    scores = {m: round(score_function_clf[m](y_test, y_pred), 3) for m in metrics}

    # exporting the model only when the caller asked for it
    if save:
        # NOTE(review): TPOT documents the second positional argument of export() as
        # `data_file_path`, not an output location, so the pipeline is written to
        # 'tpot_model_clf.py' in the working directory. Confirm the intended use of save_path.
        clf.export('tpot_model_clf.py', save_path)

    return clf, scores


In [5]:
# Load the prepared dataset; the first spreadsheet column is used as the index.
df = pd.read_excel('final_dataset.xlsx', index_col=0)

# Drop the identifier columns and separate the 'pop_growth' target from the features.
X, Y = preprocess_data(df)

# Run the TPOT regression search with save=False and ask for every metric
# defined in score_function_reg. `lr` is the fitted TPOTRegressor returned by
# try_TPOT_reg; `results` maps each computed metric name to its rounded score.
lr, results = try_TPOT_reg(X, Y, save=False, save_path='', metrics=['mse', 'mae', 'r2', 'msle', 'rmse'])

# Show the held-out scores.
print(results)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.12007383403845165

Generation 2 - Current best internal CV score: -0.12007383403845165

Generation 3 - Current best internal CV score: -0.12007383403845165

Generation 4 - Current best internal CV score: -0.12007383403845165

Generation 5 - Current best internal CV score: -0.12007383403845165

Best pipeline: LassoLarsCV(input_matrix, normalize=True)
{'mse': 0.168, 'mae': 0.162, 'r2': 0.91, 'rmse': 0.41}


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9574999999999999


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(MLPClassifier(input_matrix, alpha=0.1, learning_rate_init=0.001), bootstrap=False, criterion=gini, max_features=0.9500000000000001, min_samples_leaf=17, min_samples_split=5, n_estimators=100)
{'acc': 0.95, 'bal_acc': 0.948}
