# Mixed Effects Models

In this notebook, we'll use the `statsmodels` library to fit mixed effects models to our data.

In [1]:
# import libraries
import altair as alt
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# import other functions
from imputer import *
from feature_eng import *
from drop import *
from preprocessing_old import preprocessing_na

## Preprocessing

In [2]:
# Load the training data from the zip archive under ../data (read from disk, nothing is downloaded)
df = pd.read_csv('../data/train_data.zip')
# Preview the first rows
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [3]:
# Drop the rows that are missing the target variable
df = drop_missing_unacast(df)

# Split the frame into the target (y) and the predictors (X)
y = df.loc[:, 'unacast_session_count']
X = df.drop('unacast_session_count', axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

# Impute NaN values; the train frame comes back first, the validation frame second
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

# Feature engineering, then column dropping, applied to both splits
X_train, X_valid = comb_cols(X_train), comb_cols(X_valid)
X_train, X_valid = drop_columns(X_train), drop_columns(X_valid)
rain_cols = ['monthly_rain', 'monthly_avg_rain_length']
X_train, X_valid = X_train.drop(columns=rain_cols), X_valid.drop(columns=rain_cols)

# Scale the numerical variables. Object-dtype columns are set aside and re-attached
# unscaled; the scaler is fitted on the training split and re-used on the validation split.
categorical_features = X_train.loc[:, X_train.dtypes == "object"]
categorical_cols = list(categorical_features.columns)
numeric_cols = X_train.drop(columns=categorical_cols).columns

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=categorical_cols))
X_valid_scaled = scaler.transform(X_valid.drop(columns=categorical_cols))

X_train = pd.concat(
    [pd.DataFrame(X_train_scaled, index=X_train.index, columns=numeric_cols),
     X_train[categorical_cols]],
    axis=1,
)
X_valid = pd.concat(
    [pd.DataFrame(X_valid_scaled, index=X_valid.index, columns=numeric_cols),
     X_valid[categorical_cols]],
    axis=1,
)

# Encode the categorical columns listed below (presumably one-hot; confirm in clean_categorical).
# Only income_class and density_class are passed; climate is not in the list.
X_train_valid = clean_categorical(X_train, X_valid, ['income_class', 'density_class'])
X_train, X_valid = X_train_valid[0], X_train_valid[1]

print(X_train.shape)
print(X_valid.shape)

In [63]:
# Put the target back next to the features so each split lives in a single frame
train = pd.concat([X_train, y_train], axis=1)
valid = pd.concat([X_valid, y_valid], axis=1)

0


Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,historic_ws_moderate_br,avg_fertility_rate,climate,HI,LI,MI,HD,LD,MD,unacast_session_count
12721,-0.689389,-0.908168,0.963528,1.341227,0.898323,1.714020,1.097847,1.336182,-0.110446,-0.008991,...,-0.109196,-0.412050,A,0,0,1,1,0,0,387.0
17871,0.286883,-0.908168,15.243832,2.912954,16.621935,2.993407,3.038597,1.150586,0.089642,-0.008991,...,0.195835,-0.999021,C,0,0,1,1,0,0,56.0
46441,-0.038541,-0.908168,-0.211042,-0.230500,-0.205089,-0.280968,-0.221113,-0.215642,-0.110446,-0.008991,...,-0.414227,-0.162044,D,1,0,0,1,0,0,31.0
48833,1.588578,-0.908168,-0.211042,-0.230500,-0.205089,-0.280968,-0.221113,-0.215642,-0.110446,-0.008991,...,0.297512,2.756700,A,0,0,1,1,0,0,131.0
50069,0.286883,1.101118,-0.211042,-0.230500,-0.205089,-0.280968,-0.221113,-0.215642,-0.110446,-0.008991,...,-0.414227,0.300194,C,0,0,1,0,1,0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36902,-0.689389,-0.908168,1.396265,1.341227,1.381065,1.170907,-0.078055,-0.044831,0.132211,-0.008991,...,0.907574,2.686932,C,0,1,0,1,0,0,62.0
5882,0.612307,-0.908168,-0.211042,-0.230500,-0.205089,-0.280968,-0.221113,-0.215642,-0.110446,-0.008991,...,-0.414227,-0.999021,C,1,0,0,0,0,1,71.0
22158,1.914002,-0.908168,-0.211042,-0.230500,-0.205089,-0.280968,-0.221113,-0.215642,-0.110446,-0.008991,...,-0.210873,0.834666,D,0,0,1,0,1,0,29.0
35672,-0.363965,-0.908168,-0.211042,-0.230500,-0.205089,-0.280968,-0.221113,-0.215642,-0.110446,-0.008991,...,-0.414227,-0.724963,C,0,1,0,1,0,0,156.0


In [64]:
# delete the columns with a variance less than 0.001

def variance_threshold_selector(data, threshold=0.5):
    """Return the columns of ``data`` that pass a variance filter.

    A ``VarianceThreshold`` selector is fitted on ``data`` and the columns it
    reports as supported are returned, in their original order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are filtered.
    threshold : float, default 0.5
        Threshold handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the retained columns.
    """
    # Adapted from https://stackoverflow.com/a/39813304/1956309
    vt = VarianceThreshold(threshold)
    vt.fit(data)
    # Positions of the retained columns, mapped back to their labels
    kept_positions = vt.get_support(indices=True)
    kept_labels = data.columns[kept_positions]
    return data[kept_labels]

# min_variance = .9 * (1 - .9)  # You can play here with different values.
min_variance = 0.001
# `climate` is left out of the variance filter.
# NOTE: despite its name, `low_variance` holds the columns the selector KEPT.
numeric_train = train.drop(columns=['climate'])
low_variance = variance_threshold_selector(numeric_train, min_variance)

In [65]:
# Re-attach the `climate` column (left out of the variance filter) to the retained columns
train_var = pd.concat([low_variance, train[['climate']]], axis=1)

In [66]:
# (rows, columns) of the variance-filtered training frame
train_var.shape

(39592, 629)

In [67]:
# Give the validation frame the same columns as the training frame, minus the target
# (the target values for this split are held in y_valid)
valid_var = valid[list(train_var.columns)].drop(columns=['unacast_session_count'])
valid_var.shape

(9898, 628)

In [68]:
# Find the columns that make the fit function fail.
# The connection to the kernel kept dropping, which stopped the search each time. After every
# interruption, `i` and `to_drop` below were updated by hand with the last values the loop had printed.
i= 142
to_drop=['unacast_session_count', 'monthly_repeated_sessions', 'B23008e21', 'B23008e23', 'B12001e11', 'B23008e26', 
         'B11005e14', 'B11005e17', 'B20004e9', 'B19101e7', 'B19101e4', 'B12001e15', 'B09002e19', 'B23008e27', 'B20004e7', 
         'B11016e10', 'B17012e31', 'B19101e8', 'C18108e5', 'B25012e8', 'B25012e15', 'B17020e4', 'B25012e16', 'B25012e10', 
         'B25012e6', 'B09002e17', 'B25012e12', 'B25012e11', 'B08301e6', 'B17012e6', 'B19101e6', 'B25012e4', 'B19101e12', 
         'B17020e5', 'B23008e18', 'B19101e5', 'B12001e16', 'B08303e12', 'B17020e11', 'B10010e1', 'B19101e11', 'B08303e13', 
         'B16007e5', 'B19101e16', 'B11016e12', 'B17020e3', 'B19101e3', 'B25012e14', 'B09002e18', 'B17012e2', 'B19101e2', 
         'B25012e5', 'B17012e5', 'B10010e2', 'B11005e4', 'B19101e15', 'B11005e5', 'B19101e17', 'B17012e4', 'B17012e7', 
         'B20004e8', 'B17020e10', 'B09018e8', 'B20004e6', 'B17020e6', 'B10002e5', 'B11016e11', 'B10002e3', 'B17012e26', 
         'B25012e3', 'B11005e10', 'B08301e5', 'B17012e25', 'B17012e20', 'B08301e4', 'B19101e13', 'B10002e4', 'B23008e13', 
         'C18108e3', 'B09018e7', 'B11001e9', 'B10002e1', 'C18108e4', 'B23008e19', 'B10010e3', 'B16007e7', 'B23008e12', 
         'B15003e23', 'B15003e25', 'B11005e6', 'B11003e7', 'B27003e1', 'B15003e20', 'B09018e1', 'B11016e3', 'B01001e4', 
         'B11016e2', 'B15003e21', 'B15003e22', 'B19083e1', 'B15003e24', 'B23020e1', 'B11016e4', 'B09018e5', 'B19125e2', 
         'B16007e3', 'B01001e28', 'B23020e3', 'B23020e2', 'B12001e6', 'B27001e30', 'B01001e29', 'B25103e2', 'B16007e4', 
         'B12001e9', 'B23008e10', 'B09018e4', 'B01001e30', 'B11003e5', 'B23008e2', 'B23008e5', 'B09002e16', 'B25012e9', 
         'B01002e3', 'B20004e5', 'B25012e2', 'B08303e1', 'B23025e4', 'B23008e4', 'B23025e7', 'B12001e1', 'B15003e17', 
         'B11005e7', 'B01001e3', 'B23025e2', 'B09002e8', 'B22003e5', 'B08301e1']

# Candidate feature names: every column of train_var except those already rejected
# during earlier, interrupted runs of this cell
columns = list(train_var.columns)
for rejected_earlier in to_drop:
    columns.remove(rejected_earlier)

# ML_MAE stores the training MAE each time the mixed effects model fits successfully
ML_MAE = {'number_col': [], 'MAE': []}

# len(columns) always equals train_var.shape[1] - len(to_drop): every name appended to
# to_drop is removed from columns in the same step
while i < len(columns):
    # Right-hand side of the formula: the first i + 1 remaining candidates
    col = '+'.join(columns[:i + 1])
    print(i)
    try:
        # Mixed effects model grouped by climate
        md = smf.mixedlm("unacast_session_count ~ " + col, train_var, groups=train_var["climate"])
        mdf = md.fit(method=['bfgs', 'lbfgs', 'cg', 'bfgs'])
    except np.linalg.LinAlgError:
        # The most recently added feature made the fit fail: reject it and retry at the
        # same position. The full list is printed (first entry skipped) so it can be
        # copied back into to_drop after an interruption.
        newest = columns[i]
        to_drop.append(newest)
        print('removed', to_drop[1:])
        columns.remove(newest)
        continue
    # The fit succeeded: record the training MAE, then widen the feature set by one
    MAE = metrics.mean_absolute_error(y_train, mdf.predict(train_var))
    ML_MAE['number_col'].append(i)
    ML_MAE['MAE'].append(MAE)
    i += 1

142


KeyboardInterrupt: 

In [72]:
#Delete the columns found in the previous cell from the list of features
to_delete = ['monthly_repeated_sessions', 'B23008e21', 'B23008e23', 'B12001e11', 'B23008e26', 'B11005e14', 'B11005e17', 
             'B20004e9', 'B19101e7', 'B19101e4', 'B12001e15', 'B09002e19', 'B23008e27', 'B20004e7', 'B11016e10', 'B17012e31', 
             'B19101e8', 'C18108e5', 'B25012e8', 'B25012e15', 'B17020e4', 'B25012e16', 'B25012e10', 'B25012e6', 'B09002e17',
             'B25012e12', 'B25012e11', 'B08301e6', 'B17012e6', 'B19101e6', 'B25012e4', 'B19101e12', 'B17020e5', 'B23008e18', 
             'B19101e5', 'B12001e16', 'B08303e12', 'B17020e11', 'B10010e1', 'B19101e11', 'B08303e13', 'B16007e5', 'B19101e16', 
             'B11016e12', 'B17020e3', 'B19101e3', 'B25012e14', 'B09002e18', 'B17012e2', 'B19101e2', 'B25012e5', 'B17012e5', 
             'B10010e2', 'B11005e4', 'B19101e15', 'B11005e5', 'B19101e17', 'B17012e4', 'B17012e7', 'B20004e8', 'B17020e10', 
             'B09018e8', 'B20004e6', 'B17020e6', 'B10002e5', 'B11016e11', 'B10002e3', 'B17012e26', 'B25012e3', 'B11005e10', 
             'B08301e5', 'B17012e25', 'B17012e20', 'B08301e4', 'B19101e13', 'B10002e4', 'B23008e13', 'C18108e3', 'B09018e7', 
             'B11001e9', 'B10002e1', 'C18108e4', 'B23008e19', 'B10010e3', 'B16007e7', 'B23008e12', 'B15003e23', 'B15003e25', 
             'B11005e6', 'B11003e7', 'B27003e1', 'B15003e20', 'B09018e1', 'B11016e3', 'B01001e4', 'B11016e2', 'B15003e21', 
             'B15003e22', 'B19083e1', 'B15003e24', 'B23020e1', 'B11016e4', 'B09018e5', 'B19125e2', 'B16007e3', 'B01001e28', 
             'B23020e3', 'B23020e2', 'B12001e6', 'B27001e30', 'B01001e29', 'B25103e2', 'B16007e4', 'B12001e9', 'B23008e10', 
             'B09018e4', 'B01001e30', 'B11003e5', 'B23008e2', 'B23008e5', 'B09002e16', 'B25012e9', 'B01002e3', 'B20004e5', 
             'B25012e2', 'B08303e1', 'B23025e4', 'B23008e4', 'B23025e7', 'B12001e1', 'B15003e17', 'B11005e7', 'B01001e3', 
             'B23025e2', 'B09002e8', 'B22003e5', 'B08301e1', 'B14002e1', 'B09002e11', 'B27001e2', 'B25064e1', 'B25001e1', 
             'B19058e2', 'B09001e5', 'B09002e10', 'B09002e13', 'B09002e12', 'B19301e1', 'B25119e2', 'B19125e3', 'B13016e4', 
             'B11001e5', 'B25119e3', 'B15003e18', 'B01001e27', 'B13016e5', 'B11005e9', 'B20004e2', 'B15003e1', 'B10001e4', 
             'B11001e6', 'B13016e7', 'B12001e7', 'B23008e9', 'B13016e6', 'B23008e7', 'B23008e6', 'B09002e14', 'B09001e10', 
             'B11005e8', 'B13016e3', 'male_60_69', 'male_80_over', 'female_22_29', 'female_30_39', 'female_40_49', 
             'female_50_59', 'female_60_69', 'female_70_79', 'female_80_over', 'total_enrolled', 'not_enrolled', 
             'enrolled_nursery_pre_private', 'enrolled_kinder_public', 'enrolled_kinder_private', 'enrolled_grades_1_4_public', 
             'enrolled_grades_1_4_private', 'enrolled_grades_5_8_public', 'enrolled_grades_9_12_private', 
             'enrolled_undergrad_public', 'enrolled_undergrad_private', 'enrolled_graduate_public', 'less_than_12_no_diploma', 
             'travel_15_minutes_less', 'travel_15_29_minutes', 'travel_30_44_minutes', 'single_under_6', 'single_0_to_17', 
             'single_6_to_17', 'single_no_kids', 'four_or_more_in_family_household', 'four_or_more_in_nonfamily_household', 
             'men_without_health_insurance', 'women_with_health_insurnace', 'women_without_health_insurance', 
             'avg_birth_weight', 'avg_age_of_mother', 'longitude', 'latitude', 'alcohol', 'amenity', 'bank', 'bar', 'cafe', 
             'camp_site', 'car_repair', 'childcare', 'clothes_store', 'convenience_store', 'fast_food', 'fire_station', 
             'fitness_or_sports_centre', 'fuel', 'healthcare', 'hotel', 'museum_or_gallery', 'restaurant', 'shop', 
             'supermarket', 'tourism', 'distance_to_U', 'distance_to_nearest_school', 'walk_score', 'bike_score', 'k_avg', 
             'streets_per_node_avg', 'edge_length_total', 'edge_length_avg', 'street_length_avg', 
             'streets_per_node_counts_2', 'streets_per_node_counts_3', 'streets_per_node_counts_5', 'n_osdw', 
             'k_avg_osdw', 'streets_per_node_avg_osdw', 'edge_length_total_osdw', 'streets_per_node_counts_1_osdw', 
             'streets_per_node_counts_3_osdw', 'streets_per_node_counts_5_osdw', 'intersection_count_osid', 
             'edge_length_total_osid', 'edge_length_avg_osid', 'street_length_total_osid', 'street_segments_count_osid', 
             'streets_per_node_counts_1_osid', 'streets_per_node_counts_2_osid', 'streets_per_node_counts_4_osid', 
             'streets_per_node_counts_5_osid', 'violent_crime', 'criminal_homicide', 'rape', 'robbery', 
             'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft', 'motor_vehicle_theft', 
             'houses_per_sq_km', 'historic_number_of_sessions', 'historic_sessions_per_day', 'historic_unique_sessions', 
             'historic_unique_sessions_per_day', 'historic_repeat_sessions', 'historic_repeat_sessions_per_day', 
             'historic_total_session_length', 'historic_avg_session_length', 'historic_avg_light_activity', 
             'historic_avg_moderate_activity', 'historic_avg_vigorous_activity', 'historic_avg_mod_plus_vig', 
             'historic_hour_7', 'historic_hour_8', 'historic_hour_9', 'historic_hour_10', 'historic_hour_11', 
             'historic_hour_13', 'historic_hour_14', 'historic_hour_15', 'historic_hour_16', 'historic_hour_17', 
             'historic_hour_19', 'historic_hour_20', 'historic_rain', 'historic_foggy', 'historic_snow', 'Green_2016', 
             'Libertarians_2016', 'Poor_physical_health_days', 'Adult_smoking', 'Adult_obesity', 'weather_clear', 
             'weather_rain', 'weather_fog', 'temp_avg_35_below', 'temp_max_35_below', 'temp_max_45_55', 
             'state_amount_per_capita', 'historic_slide_count_comb', 'monthly_climb_count_comb', 'monthly_tube_count_comb', 
             'historic_tube_count_comb', 'monthly_overhang_count_comb', 'historic_overhang_count_comb', 
             'monthly_bridge_count_comb', 'historic_bridge_count_comb', 'monthly_swing_count_comb', 
             'historic_swing_count_comb', 'historic_obsta_count_comb', 'historic_crawls_count_comb', 
             'monthly_hour_night', 'historic_hour_night', 'avg_wind_calm', 'avg_wind_light_air', 'avg_wind_light_br', 
             'avg_wind_gentle_br', 'avg_wind_moderate_br', 'monthly_ws_calm', 'monthly_ws_light_air', 'monthly_ws_light_br', 
             'monthly_ws_gentle_br', 'monthly_ws_moderate_br', 'historic_ws_calm', 'historic_ws_light_air', 
             'historic_ws_light_br', 'historic_ws_gentle_br', 'historic_ws_moderate_br', 'avg_fertility_rate', 'HI', 'LI', 
             'MI', 'HD', 'LD', 'MD']
# Remove the listed columns from both frames.
# NOTE: train_var / valid_var are reassigned in place, so re-running this cell without
# rebuilding them is expected to fail (the columns are already gone).
train_var = train_var.drop(columns = to_delete)
valid_var = valid_var.drop(columns=to_delete)

In [73]:
# Number of columns remaining in the training frame
len(train_var.columns)

286

In [74]:
# Number of column names listed for removal
len(to_delete)

343

In [75]:
# The new set of columns we are going to work with: every column except the last two,
# joined with '+' so the string can be used as the right-hand side of a formula
feature_names = list(train_var.columns)[:-2]
cols = '+'.join(feature_names)
cols

'month+year+monthly_number_of_sessions+monthly_unique_sessions+monthly_avg_length_of_session+monthly_avg_light_activity+monthly_avg_moderate_activity+monthly_avg_vigorous_activity+monthly_count_ramp+monthly_count_zipline+monthly_count_spinner+monthly_count_pull_under+monthly_under_40+monthly_temp_40_to_50+monthly_temp_50_to_60+monthly_temp_60_to_70+monthly_temp_70_to_80+monthly_temp_80_to_90+monthly_over_90+monthly_avg_length_under_40+monthly_avg_length_temp_40_to_50+monthly_avg_length_temp_50_to_60+monthly_avg_length_temp_60_to_70+monthly_avg_length_temp_70_to_80+monthly_avg_length_temp_80_to_90+monthly_avg_length_over_90+monthly_cloudy+monthly_clear+monthly_foggy+monthly_snow+monthly_avg_cloudy_length+monthly_avg_clear_length+monthly_avg_foggy_length+monthly_avg_snow_length+monthly_hour_7+monthly_hour_8+monthly_hour_9+monthly_hour_10+monthly_hour_11+monthly_hour_12+monthly_hour_13+monthly_hour_14+monthly_hour_15+monthly_hour_16+monthly_hour_17+monthly_hour_18+monthly_hour_19+monthly_

In [76]:
# The two trailing columns that were left out of `cols`
list(train_var.columns)[-2:]

['unacast_session_count', 'climate']

In [78]:
# Object-dtype columns of the reduced training frame
# NOTE(review): not referenced again in the cells visible here; confirm it is still needed
categorical_features = train_var.loc[:, train_var.dtypes == "object"]

In [79]:
# (rows, columns) of the reduced validation frame
valid_var.shape

(9898, 285)

In [80]:
# Summary of the reduced training frame: index, column count, dtypes and memory usage
train_var.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39592 entries, 28601 to 42347
Columns: 286 entries, month to climate
dtypes: float64(285), object(1)
memory usage: 87.9+ MB


In [81]:
# Mixed effects model with `climate` as the cluster (grouping) variable and
# month and year as the only fixed effects
fixed_effects = 'unacast_session_count ~ month+year'
md = smf.mixedlm(fixed_effects, data=train_var, groups=train_var["climate"])
# The optimisers in the list are passed to fit() in this order
mdf = md.fit(method=['bfgs', 'lbfgs', 'cg', 'bfgs'])

### RMSE

In [82]:
# Train RMSE: square root of the mean squared error of the predictions on the training frame
ML_RMSE_train = np.sqrt(
    metrics.mean_squared_error(y_true=y_train, y_pred=mdf.predict(train_var))
)
print(ML_RMSE_train)

268.8478185165291


In [83]:
# Validation RMSE: square root of the mean squared error of the predictions on the validation frame
ML_RMSE_valid = np.sqrt(
    metrics.mean_squared_error(y_true=y_valid, y_pred=mdf.predict(valid_var))
)
print(ML_RMSE_valid)

264.7209778392388


### MAE

In [95]:
# Train MAE: mean absolute error of the predictions on the training frame
ML_MAE_train = metrics.mean_absolute_error(
    y_true=y_train, y_pred=mdf.predict(train_var)
)
print(ML_MAE_train)

115.81333843150244


In [96]:
# Validation MAE: mean absolute error of the predictions on the validation frame
ML_MAE_valid = metrics.mean_absolute_error(
    y_true=y_valid, y_pred=mdf.predict(valid_var)
)
print(ML_MAE_valid)

114.75976013264663


In [100]:
def _resid_charts(model, X, y, label):
    """Build the (distance, proportion) residual charts for one data split.

    ``label`` ('Train' or 'Valid') is used to name the frame columns, so the
    fields encoded in the charts always match columns that exist in the data.
    """
    pred_col = 'Predicted ' + label
    true_col = 'True ' + label
    dist_col = label + ' Error Distance'
    prop_col = label + ' Error Proportion'

    frame = pd.DataFrame({pred_col: model.predict(X), true_col: y})
    # Signed error: positive when the model over-predicts
    frame[dist_col] = frame[pred_col] - frame[true_col]
    # Ratio predicted / true; rows whose true value is 0 give inf or NaN
    frame[prop_col] = frame[pred_col] / frame[true_col]

    dist_chart = alt.Chart(frame).mark_circle().encode(alt.X(true_col + ":Q"), y=alt.Y(dist_col + ':Q'))
    prop_chart = alt.Chart(frame).mark_circle().encode(alt.X(true_col + ":Q"), y=alt.Y(prop_col + ':Q'))
    return dist_chart, prop_chart


def plot_resid(model, X_train=None, y_train=None, X_valid=None, y_valid=None, plot = 'both'):
    """Build residual scatter charts for a fitted model.

    For each requested split, two Altair charts are created with the true
    target on the x axis: the error distance (predicted - true) and the error
    proportion (predicted / true).

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(X)``.
    X_train, y_train : optional
        Training features and target; required unless ``plot == 'valid'``.
    X_valid, y_valid : optional
        Validation features and target; required unless ``plot == 'train'``.
    plot : str, default 'both'
        'train' builds only the training charts, 'valid' only the validation
        charts, and any other value builds both.

    Returns
    -------
    dict
        Keys ``Train_Distance``, ``Train_Proportion``, ``Valid_Distance`` and
        ``Valid_Proportion``. Each value is an Altair chart, or a placeholder
        message string when that split was not requested.
    """
    d = dict()
    if plot != 'valid':
        d["Train_Distance"], d["Train_Proportion"] = _resid_charts(model, X_train, y_train, 'Train')
    else:
        # Fix: these placeholders used to be stored under the Valid_* keys,
        # leaving the Train_* keys missing from the result
        d["Train_Distance"] = "No training set inputted"
        d["Train_Proportion"] = "No training set inputted"
    if plot != 'train':
        # Fix: the proportion chart used to encode 'True Validation' and
        # 'Validation Error Proportion', which are not columns of the frame
        d["Valid_Distance"], d["Valid_Proportion"] = _resid_charts(model, X_valid, y_valid, 'Valid')
    else:
        d["Valid_Distance"] = "No validation set inputted"
        d["Valid_Proportion"] = "No validation set inputted"
    return d

In [102]:
# Build the residual charts for both splits (the `plot` argument is left at its default 'both');
# the result is a dict of charts keyed by name
plot = plot_resid(mdf, X_train=train_var, y_train=y_train, X_valid=valid_var, y_valid=y_valid)

In [104]:
# Plot the training error distribution: error distance (predicted minus true) against the true value
# Lift Altair's row limit first so the full training frame can be rendered
alt.data_transformers.disable_max_rows()
plot['Train_Distance']

**Conclusion**: So far, I have only run the mixed effects model with `climate` as the grouping variable (and `month` and `year` as the fixed effects), and the results were not very good (validation RMSE of about 264.7 and validation MAE of about 114.8).