This notebook builds a predictive model for each municipality.

In [3]:
#importer libraries
from sklearn.metrics import mean_squared_error
import os
import tqdm as tqdm
import re
import pickle
from pathlib import Path
import pandas as pd
import numpy as np
# from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.model_selection import GridSearchCV, learning_curve, KFold, train_test_split
# from sklearn.model_selection import learning_curve
# from sklearn.model_selection import KFold, train_test_split

from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.exceptions import ConvergenceWarning


import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [7]:
# Enrich aggregated sales data with municipality-level features.
def add_features_to_data(df, feature_dir="../Feature_data/"):
    """Merge municipality-level feature tables onto aggregated sales data.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales data. The code reads the columns ``muni_code``, ``year``,
        ``count``, ``housing_type`` and ``avg_sqm_price`` from it.
    feature_dir : str or pathlib.Path, default "../Feature_data/"
        Directory holding ``indkp101.csv``, ``kontanth.csv``,
        ``gini_index.csv``, ``muni_areas.csv``, ``unemployment_data.csv``
        and ``population_data.csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which

        * every feature table is left-joined on ``muni_code`` (and ``year``
          where the table is yearly),
        * ``avg_sqm_price`` is numeric and rows with any missing value --
          including prices that could not be parsed -- are removed,
        * ``muni_code``, ``year`` and ``housing_type`` are categorical,
        * ``unemployed`` and ``kont_recip_tot`` are divided by ``pop``,
        * ``pop_den`` (``pop`` / ``km2``) is added,
        * ``count`` and all ``Unnamed*`` columns are dropped.
    """
    feature_dir = Path(feature_dir)

    def _without_index_columns(frame):
        # "Unnamed: 0"-style columns are index artefacts of to_csv. They are
        # removed *before* merging: several tables carrying the same stray
        # column would otherwise produce duplicate suffixed names, which
        # recent pandas versions reject with a MergeError.
        keep = [col for col in frame.columns if not str(col).startswith('Unnamed')]
        return frame[keep]

    def _read_features(file_name):
        return _without_index_columns(pd.read_csv(feature_dir / file_name))

    indk = _read_features("indkp101.csv")
    konth = _read_features("kontanth.csv")  # absolute counts; made per capita below
    gini = _read_features("gini_index.csv")
    areas = _read_features("muni_areas.csv")
    unenp = _read_features("unemployment_data.csv")
    # NOTE(review): the original author flagged the population data as
    # possibly problematic ("noget galt"); verify the file before relying on it.
    pop = _read_features("population_data.csv")

    yearly_keys = ["muni_code", "year"]
    suffixes = ('_left', '_right')

    enriched = (
        _without_index_columns(df)
        .merge(indk, on=yearly_keys, how='left', suffixes=suffixes)
        .merge(konth, on=yearly_keys, how='left', suffixes=suffixes)
        .merge(gini, on=yearly_keys, how='left', suffixes=suffixes)
        .merge(areas, on=["muni_code"], how='left', suffixes=suffixes)
        .merge(unenp, on=yearly_keys, how='left', suffixes=suffixes)
        .merge(pop, on=yearly_keys, how='left', suffixes=suffixes)
        # Coerce the target BEFORE dropna so that unparseable prices are
        # dropped instead of surviving as NaN into the model fit.
        .assign(avg_sqm_price=lambda x: pd.to_numeric(x['avg_sqm_price'], errors='coerce'))
        .sort_values(["year", "count"])
        .dropna()
        .assign(muni_code=lambda x: x['muni_code'].astype('category'))
        .assign(year=lambda x: x['year'].astype('category'))
        .assign(housing_type=lambda x: x["housing_type"].astype('category'))
        .assign(unemployed=lambda x: x["unemployed"] / x['pop'])
        .assign(kont_recip_tot=lambda x: x["kont_recip_tot"] / x['pop'])
        .assign(pop_den=lambda x: x['pop'] / x['km2'])
        .drop(columns="count")
    )

    return enriched



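A function for fitting a model is defined below. It takes a dataset of cleaned Boliga data that has already been enriched with the selected features, tunes a polynomial Elastic Net regression with a cross-validated grid search, and returns the fitted search together with its metrics, coefficients and learning-curve data.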

In [8]:
def make_a_model(data):
    """Tune and evaluate a polynomial Elastic Net regression on one dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column ``avg_sqm_price`` and a
        ``housing_type`` column (used to stratify the train/test split).
        Columns of numeric dtype are treated as numeric features and columns
        of ``category`` dtype as categorical features.

    Returns
    -------
    tuple
        ``(grid_search, best_parameters, rmse, r2, coefs, learning_curve_data)``

        * ``grid_search`` -- the fitted ``GridSearchCV``. It was trained on a
          standardised target, so its raw predictions are in standardised
          units, not in price units.
        * ``best_parameters`` -- ``grid_search.best_params_``.
        * ``rmse``, ``r2`` -- hold-out scores computed on the original
          (un-scaled) target.
        * ``coefs`` -- list of ``(coefficient, feature_name)`` pairs for the
          polynomial features of the best pipeline.
        * ``learning_curve_data`` -- DataFrame with columns ``Train``,
          ``Test`` (mean MSE on the standardised target) and ``sample size``.
    """
    # Split data into target values (y) and features (X).
    y = data["avg_sqm_price"].to_numpy()
    X = data.drop(columns=["avg_sqm_price"])

    # Identify numeric and categorical features.
    numeric_features = X.select_dtypes(include=["number"]).columns.tolist()
    categorical_features = X.select_dtypes(include=["category"]).columns.tolist()

    # Split into train and test data, stratified on housing_type.
    X_train, X_test, y_train_raw, y_test_raw = train_test_split(
        X, y, test_size=0.2, random_state=47, stratify=X["housing_type"]
    )

    # The target scaler is fitted on the training targets only, so that no
    # statistics of the hold-out set leak into training.
    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train_raw.reshape(-1, 1)).flatten()

    # Categories observed in the full feature table; handing them to the
    # encoder fixes the one-hot layout regardless of what a CV fold contains.
    known_categories = [X[col].unique().tolist() for col in categorical_features]

    # Preprocessor: numeric features are scaled (not centred), categorical
    # features are one-hot encoded with the known categories.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(with_mean=False), numeric_features),
            ('cat', OneHotEncoder(categories=known_categories), categorical_features),
        ]
    )

    # Training pipeline: preprocessing, polynomial feature expansion and an
    # Elastic Net regressor. The step is named 'classifier' for historical
    # reasons; the name is kept because it appears in the returned parameters.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('polynomial', PolynomialFeatures(degree=3)),  # degree is overridden by the grid below
        ('classifier', ElasticNet()),
    ])

    # Parameter grid for the grid search.
    param_grid = {
        'polynomial__degree': [1, 2, 3],  # try different polynomial degrees
        'classifier__alpha': np.logspace(-4, 4, 12),
        'classifier__l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__max_iter': [2000],
    }

    # Grid search with 5-fold cross-validation.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    best_parameters = grid_search.best_params_
    best_pipeline = grid_search.best_estimator_

    # Evaluate on the hold-out set in original target units.
    y_pred = y_scaler.inverse_transform(best_pipeline.predict(X_test).reshape(-1, 1)).flatten()

    # np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and absent from recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test_raw, y_pred))
    r2 = r2_score(y_test_raw, y_pred)
    coefficients = best_pipeline.named_steps['classifier'].coef_

    # Names of the polynomial features: numeric names first, then the one-hot
    # names, i.e. the order in which the ColumnTransformer lists its transformers.
    categorical_feature_names = (
        best_pipeline.named_steps['preprocessor']
        .named_transformers_['cat']
        .get_feature_names_out(categorical_features)
    )
    all_feature_names = np.concatenate([numeric_features, categorical_feature_names])
    polynomial_feature_names = best_pipeline.named_steps['polynomial'].get_feature_names_out(
        input_features=all_feature_names
    )

    # A list (not a bare zip iterator) so the pairs can be consumed more than once.
    coefs = list(zip(coefficients, polynomial_feature_names))

    # Data for a learning curve of the best pipeline (10-fold CV).
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=best_pipeline,
        X=X_train,
        y=y_train,
        train_sizes=np.arange(0.05, 1.05, .05),
        scoring='neg_mean_squared_error',
        cv=10,
    )

    # Scores are negative MSE; negate them to obtain plain MSE.
    learning_curve_data = pd.DataFrame({
        'Train': -train_scores.mean(axis=1),
        'Test': -test_scores.mean(axis=1),
        'sample size': train_sizes,
    })

    # Return fitted search, parameters, metrics, coefficients and learning-curve data.
    return (grid_search, best_parameters, rmse, r2, coefs, learning_curve_data)

    

In [9]:
# Getting aggregated data stored as csv's
fp = Path("../Boliga data/agg_data/")
files = list(fp.glob('*.csv'))

# Column layout of every result table.
METRIC_COLUMNS = ['muni_code', 'rmse', 'r2']
MODEL_COLUMNS = ['muni_code', 'pickled_model']
LEARNING_CURVE_COLUMNS = ['muni_code', "Train", "Test", "sample size"]
COEFFICIENT_COLUMNS = ['muni_code', 'value', 'parameter']
PARAMETER_COLUMNS = ['muni_code', 'Parameter', 'Value']

# One list per result table. Previously every row went into a single list that
# was concatenated into `fitted_models`, which left `metrics`, `parameters`,
# `coefficients` and `learning_curves` empty.
model_rows = []
metric_rows = []
parameter_rows = []
coefficient_rows = []
learning_curve_rows = []


def _stack(frames, columns):
    """Concatenate per-municipality frames in the given column order.

    Returns an empty frame with the right columns when nothing was collected
    (``pd.concat`` raises on an empty list).
    """
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)[columns]


# running the loop for modelling
for filename in tqdm.tqdm(sorted(files)):
    print(filename)
    muni_code = re.search(r'(\d+)\.csv$', str(filename)).group(1)  # extracting muni_code
    data = pd.read_csv(filename)  # reading data
    data = add_features_to_data(data)  # feature adding
    data = data.drop(columns=["year", "muni_code"])  # constant / not used as features
    grid_search, best_parameters, rmse, r2, coefs, learning_curve_data = make_a_model(data)

    # pickled model (bytes)
    model_rows.append(pd.DataFrame({'muni_code': [muni_code],
                                    'pickled_model': [pickle.dumps(grid_search)]}))

    # metrics
    metric_rows.append(pd.DataFrame([(muni_code, rmse, r2)], columns=METRIC_COLUMNS))

    # best hyper-parameters, one row per parameter
    parameter_rows.append(
        pd.DataFrame(list(best_parameters.items()), columns=['Parameter', 'Value'])
        .assign(muni_code=muni_code)
    )

    # coefficients, one row per polynomial feature
    coefficient_rows.append(
        pd.DataFrame(list(coefs), columns=['value', 'parameter'])
        .assign(muni_code=muni_code)
    )

    # learning curve data
    learning_curve_rows.append(learning_curve_data.assign(muni_code=muni_code))

# Build each result table from its own rows.
fitted_models = _stack(model_rows, MODEL_COLUMNS)
metrics = _stack(metric_rows, METRIC_COLUMNS)
parameters = _stack(parameter_rows, PARAMETER_COLUMNS)
coefficients = _stack(coefficient_rows, COEFFICIENT_COLUMNS)
learning_curves = _stack(learning_curve_rows, LEARNING_CURVE_COLUMNS)


  0%|                                                                                           | 0/98 [00:00<?, ?it/s]

..\Boliga data\agg_data\agg_sales_1992_2022_101.csv


  1%|▊                                                                                | 1/98 [01:35<2:34:43, 95.71s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_147.csv


  2%|█▋                                                                               | 2/98 [03:12<2:34:22, 96.48s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_151.csv


  3%|██▍                                                                              | 3/98 [04:45<2:30:18, 94.93s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_153.csv


  4%|███▎                                                                            | 4/98 [12:20<6:11:23, 237.06s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_155.csv


  5%|████                                                                            | 5/98 [23:04<9:54:57, 383.85s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_157.csv


  6%|████▊                                                                          | 6/98 [32:42<11:29:50, 449.89s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_159.csv


  7%|█████▋                                                                         | 7/98 [42:41<12:36:01, 498.48s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_161.csv


  8%|██████▍                                                                        | 8/98 [48:59<11:30:10, 460.12s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_163.csv


  9%|███████▎                                                                        | 9/98 [50:05<8:19:44, 336.90s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_165.csv


 10%|████████                                                                       | 10/98 [51:03<6:08:02, 250.93s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_167.csv


 11%|████████▊                                                                      | 11/98 [52:22<4:47:35, 198.34s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_169.csv


 12%|█████████▋                                                                     | 12/98 [53:44<3:53:20, 162.80s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_173.csv


 13%|██████████▍                                                                    | 13/98 [55:02<3:14:25, 137.24s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_175.csv


 14%|███████████▎                                                                   | 14/98 [56:16<2:45:12, 118.01s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_183.csv


 15%|████████████                                                                   | 15/98 [58:01<2:37:39, 113.96s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_185.csv


 16%|████████████▉                                                                  | 16/98 [59:38<2:28:56, 108.99s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_187.csv


 17%|█████████████▎                                                               | 17/98 [1:01:03<2:17:33, 101.89s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_190.csv


 18%|██████████████▎                                                               | 18/98 [1:02:17<2:04:37, 93.47s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_201.csv


 19%|███████████████                                                               | 19/98 [1:04:06<2:09:05, 98.04s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_210.csv


 20%|███████████████▉                                                              | 20/98 [1:05:23<1:59:24, 91.86s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_217.csv


 21%|████████████████▋                                                             | 21/98 [1:07:17<2:06:21, 98.46s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_219.csv


 22%|█████████████████▌                                                            | 22/98 [1:08:51<2:03:03, 97.15s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_223.csv


 23%|██████████████████▎                                                           | 23/98 [1:10:15<1:56:29, 93.20s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_230.csv


 24%|██████████████████▊                                                          | 24/98 [1:12:19<2:06:14, 102.35s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_240.csv


 26%|███████████████████▋                                                         | 25/98 [1:14:05<2:05:42, 103.32s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_250.csv


 27%|████████████████████▍                                                        | 26/98 [1:16:00<2:08:23, 106.99s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_253.csv


 28%|█████████████████████▏                                                       | 27/98 [1:17:45<2:05:40, 106.20s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_259.csv


 29%|██████████████████████                                                       | 28/98 [1:19:45<2:09:04, 110.64s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_260.csv


 30%|██████████████████████▊                                                      | 29/98 [1:21:49<2:11:33, 114.40s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_265.csv


 31%|███████████████████████▌                                                     | 30/98 [1:23:29<2:04:53, 110.20s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_269.csv


 32%|████████████████████████▎                                                    | 31/98 [1:25:24<2:04:37, 111.61s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_270.csv


 33%|█████████████████████████▏                                                   | 32/98 [1:27:51<2:14:34, 122.33s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_306.csv


 34%|█████████████████████████▉                                                   | 33/98 [1:30:10<2:17:49, 127.22s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_316.csv


 35%|██████████████████████████▋                                                  | 34/98 [1:32:41<2:23:26, 134.47s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_320.csv


 36%|███████████████████████████▌                                                 | 35/98 [1:35:08<2:25:04, 138.17s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_326.csv


 37%|████████████████████████████▎                                                | 36/98 [1:38:04<2:34:19, 149.34s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_329.csv


 38%|█████████████████████████████                                                | 37/98 [1:40:45<2:35:28, 152.93s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_330.csv


 39%|█████████████████████████████▊                                               | 38/98 [1:42:56<2:26:26, 146.44s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_336.csv


 40%|██████████████████████████████▋                                              | 39/98 [1:44:52<2:14:59, 137.28s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_340.csv


 41%|███████████████████████████████▍                                             | 40/98 [1:47:12<2:13:21, 137.96s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_350.csv


 42%|████████████████████████████████▏                                            | 41/98 [1:49:24<2:09:34, 136.39s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_360.csv


 43%|█████████████████████████████████                                            | 42/98 [1:51:23<2:02:14, 130.97s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_370.csv


 44%|█████████████████████████████████▊                                           | 43/98 [1:55:19<2:28:55, 162.46s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_376.csv


 45%|██████████████████████████████████▌                                          | 44/98 [1:57:08<2:11:49, 146.48s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_390.csv


 46%|███████████████████████████████████▎                                         | 45/98 [1:59:22<2:06:13, 142.89s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_400.csv


 47%|████████████████████████████████████▏                                        | 46/98 [2:01:11<1:54:53, 132.56s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_410.csv


 48%|████████████████████████████████████▉                                        | 47/98 [2:03:06<1:48:21, 127.48s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_420.csv


 49%|█████████████████████████████████████▋                                       | 48/98 [2:05:09<1:44:54, 125.89s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_430.csv


 50%|██████████████████████████████████████▌                                      | 49/98 [2:07:04<1:40:19, 122.84s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_440.csv


 51%|███████████████████████████████████████▎                                     | 50/98 [2:08:45<1:32:52, 116.10s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_450.csv


 52%|████████████████████████████████████████                                     | 51/98 [2:10:27<1:27:47, 112.08s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_461.csv


 53%|████████████████████████████████████████▊                                    | 52/98 [2:12:31<1:28:41, 115.68s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_479.csv


 54%|█████████████████████████████████████████▋                                   | 53/98 [2:14:19<1:24:58, 113.29s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_480.csv


 55%|██████████████████████████████████████████▍                                  | 54/98 [2:16:18<1:24:12, 114.84s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_482.csv


 56%|███████████████████████████████████████████▏                                 | 55/98 [2:18:02<1:20:01, 111.67s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_492.csv


 57%|████████████████████████████████████████████                                 | 56/98 [2:19:47<1:16:44, 109.63s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_510.csv


 58%|████████████████████████████████████████████▊                                | 57/98 [2:21:36<1:14:56, 109.67s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_530.csv


 59%|█████████████████████████████████████████████▌                               | 58/98 [2:23:27<1:13:17, 109.95s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_540.csv


 60%|██████████████████████████████████████████████▎                              | 59/98 [2:25:41<1:16:07, 117.12s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_550.csv


 61%|███████████████████████████████████████████████▏                             | 60/98 [2:27:36<1:13:47, 116.51s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_561.csv


 62%|███████████████████████████████████████████████▉                             | 61/98 [2:29:30<1:11:20, 115.69s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_563.csv


 63%|████████████████████████████████████████████████▋                            | 62/98 [2:31:22<1:08:51, 114.77s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_573.csv


 64%|█████████████████████████████████████████████████▌                           | 63/98 [2:33:20<1:07:26, 115.60s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_575.csv


 65%|██████████████████████████████████████████████████▎                          | 64/98 [2:35:09<1:04:21, 113.56s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_580.csv


 66%|███████████████████████████████████████████████████                          | 65/98 [2:37:11<1:03:52, 116.14s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_607.csv


 67%|███████████████████████████████████████████████████▊                         | 66/98 [2:39:20<1:03:57, 119.91s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_615.csv


 68%|████████████████████████████████████████████████████▋                        | 67/98 [2:41:07<1:00:04, 116.29s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_621.csv


 69%|██████████████████████████████████████████████████████▊                        | 68/98 [2:42:54<56:44, 113.48s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_630.csv


 70%|███████████████████████████████████████████████████████▌                       | 69/98 [2:44:54<55:40, 115.19s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_657.csv


 71%|████████████████████████████████████████████████████████▍                      | 70/98 [2:46:36<52:00, 111.46s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_661.csv


 72%|█████████████████████████████████████████████████████████▏                     | 71/98 [2:48:15<48:27, 107.70s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_665.csv


 73%|██████████████████████████████████████████████████████████                     | 72/98 [2:50:08<47:19, 109.21s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_671.csv


 74%|██████████████████████████████████████████████████████████▊                    | 73/98 [2:52:02<46:03, 110.55s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_706.csv


 76%|███████████████████████████████████████████████████████████▋                   | 74/98 [2:53:59<45:04, 112.68s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_707.csv


 77%|████████████████████████████████████████████████████████████▍                  | 75/98 [2:55:46<42:31, 110.93s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_710.csv


 78%|█████████████████████████████████████████████████████████████▎                 | 76/98 [2:57:26<39:24, 107.47s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_727.csv


 79%|██████████████████████████████████████████████████████████████                 | 77/98 [2:59:04<36:42, 104.88s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_730.csv


 80%|██████████████████████████████████████████████████████████████▉                | 78/98 [3:01:00<36:01, 108.09s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_740.csv


 81%|███████████████████████████████████████████████████████████████▋               | 79/98 [3:02:53<34:41, 109.55s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_741.csv


 82%|████████████████████████████████████████████████████████████████▍              | 80/98 [3:04:41<32:41, 108.97s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_746.csv


 83%|█████████████████████████████████████████████████████████████████▎             | 81/98 [3:06:30<30:54, 109.12s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_751.csv


 84%|██████████████████████████████████████████████████████████████████             | 82/98 [3:08:17<28:53, 108.37s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_756.csv


 85%|██████████████████████████████████████████████████████████████████▉            | 83/98 [3:10:23<28:25, 113.71s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_760.csv


 86%|███████████████████████████████████████████████████████████████████▋           | 84/98 [3:12:26<27:09, 116.40s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_766.csv


 87%|████████████████████████████████████████████████████████████████████▌          | 85/98 [3:14:22<25:13, 116.39s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_773.csv


 88%|█████████████████████████████████████████████████████████████████████▎         | 86/98 [3:16:35<24:16, 121.40s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_779.csv


 89%|██████████████████████████████████████████████████████████████████████▏        | 87/98 [3:18:25<21:36, 117.91s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_787.csv


 90%|██████████████████████████████████████████████████████████████████████▉        | 88/98 [3:20:13<19:11, 115.12s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_791.csv


 91%|███████████████████████████████████████████████████████████████████████▋       | 89/98 [3:22:01<16:55, 112.83s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_810.csv


 92%|████████████████████████████████████████████████████████████████████████▌      | 90/98 [3:23:43<14:35, 109.49s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_813.csv


 93%|█████████████████████████████████████████████████████████████████████████▎     | 91/98 [3:25:25<12:30, 107.27s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_820.csv


 94%|██████████████████████████████████████████████████████████████████████████▏    | 92/98 [3:27:24<11:05, 110.90s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_825.csv


 95%|██████████████████████████████████████████████████████████████████████████▉    | 93/98 [3:29:00<08:52, 106.49s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_840.csv


 96%|███████████████████████████████████████████████████████████████████████████▊   | 94/98 [3:30:50<07:10, 107.57s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_846.csv


 97%|████████████████████████████████████████████████████████████████████████████▌  | 95/98 [3:32:36<05:20, 106.91s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_849.csv


 98%|█████████████████████████████████████████████████████████████████████████████▍ | 96/98 [3:34:01<03:20, 100.47s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_851.csv


 99%|███████████████████████████████████████████████████████████████████████████████▏| 97/98 [3:35:13<01:31, 91.88s/it]

..\Boliga data\agg_data\agg_sales_1992_2022_860.csv


100%|███████████████████████████████████████████████████████████████████████████████| 98/98 [3:36:25<00:00, 132.51s/it]


In [None]:

# Persist every result table of the municipality models as CSV.
output_dir = Path("fitted_models")
output_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing directories

coefficients.to_csv(output_dir / "coefficients_muni.csv")
metrics.to_csv(output_dir / "metrics_muni.csv")
learning_curves.to_csv(output_dir / "learning_curves_muni.csv")
fitted_models.to_csv(output_dir / "fitted_models_muni.csv")

# The parameter table is written under two file names, as before.
# NOTE(review): confirm which of the two names is actually read elsewhere and drop the other.
parameters.to_csv(output_dir / "parameter_muni.csv")
parameters.to_csv(output_dir / "parameters_muni.csv")

In [None]:
def learning_curve_plot(tester, ax=None):
    """Plot train and test RMSE against the training-set size.

    Parameters
    ----------
    tester : pandas.DataFrame
        Learning-curve data with the columns ``sample size``, ``Train`` and
        ``Test``. ``Train`` and ``Test`` must hold non-negative mean squared
        errors; only their square root is taken here.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. A new 7x3 inch figure is created when omitted.

    Returns
    -------
    matplotlib.axes.Axes
        The axes containing the plot.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(7, 3))

    sample_sizes = tester["sample size"]
    # Square root of the MSE gives the RMSE shown on the y-axis.
    ax.plot(sample_sizes, np.sqrt(tester["Test"]), alpha=0.25, linewidth=2, label='Test', color='blue')
    ax.plot(sample_sizes, np.sqrt(tester["Train"]), alpha=0.25, linewidth=2, label='Train', color='orange')

    ax.set_title('Mean performance')
    ax.set_xlabel('Sample size')
    ax.set_ylabel('Root-Mean squared error')
    ax.legend()
    return ax


# `tester` was never defined in this notebook, so the original call failed on a
# fresh kernel. NOTE(review): plotting the first municipality found in
# `learning_curves` is an assumption about the intent -- adjust the selection
# if a different municipality (or an aggregate) was meant.
if not learning_curves.empty:
    first_muni = learning_curves["muni_code"].iloc[0]
    tester = learning_curves[learning_curves["muni_code"] == first_muni]
    learning_curve_plot(tester)