In [59]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import ast
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
import seaborn as sns
from scipy import stats
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import root_mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, expon
from sklearn.model_selection import train_test_split
from sklearn import set_config

set_config(transform_output='pandas')

sns.set_theme(style="whitegrid")
sns.set(font_scale = 1)

In [34]:
IMAGES_PATH = Path().resolve().parent / "images"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [35]:
data_path = Path().resolve().parent / "data"
data = pd.read_csv(data_path/"arbres_grenoble_epsg4326.csv")
arbres_df = data[data['anneedeplantation'].notnull()].reset_index(drop=True)
anne_null = data[data['anneedeplantation'].isnull()].reset_index(drop=True)
train_set, test_set = train_test_split(arbres_df, test_size=0.2, random_state=42)
train_set.reset_index(inplace=True, drop=True)
test_set.reset_index(inplace=True, drop=True)
train_feat = train_set.drop("anneedeplantation", axis=1).reset_index(drop=True)
train_target = train_set["anneedeplantation"].copy().reset_index(drop=True)

In [74]:
def geo_point_name(function_transformer, feature_names_in):
    return ["lat", "lon"]  # feature names out

def geo_point_transformer(X):
    X_copy = X.copy()
    X_copy[["lat", "lon"]] = [ast.literal_eval(x)[:2] for x in X_copy["geo_point_2d"]]
    return X_copy[["lat", "lon"]]

def geo_point_processor():
    return make_pipeline(
        FunctionTransformer(geo_point_transformer, feature_names_out=geo_point_name),
        StandardScaler()
    )

def stad_dev_name(function_transformer, feature_names_in):
    return ["stadededeveloppement"]  # feature names out

def stad_dev_transformer(X):
    X_copy = X.copy()
    X_copy["stadededeveloppement"] = X_copy["stadededeveloppement"].fillna(value="Arbre adulte")
    return X_copy[["stadededeveloppement"]]

def stad_dev_processor():
    return make_pipeline(
        FunctionTransformer(stad_dev_transformer, feature_names_out=stad_dev_name),
        )

class_order_stad = [["Arbre jeune", "Arbre adulte", "Arbre vieillissant"]]

ord_stad_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=class_order_stad, handle_unknown='use_encoded_value', unknown_value=-1)
    )

class_order_haut = [["Moins de 10 m", "de 10 m à 20 m", "Plus de 20 m"]]

ord_haut_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=class_order_haut, handle_unknown='use_encoded_value', unknown_value=-1)
    )

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    )

def portarbre_name(function_transformer, feature_names_in):
    return ["portarbre", "adr_secteur"]  # feature names out

def portarbre_transformer(X):
    X_copy = X.copy()
    mode_df = X_copy.groupby("adr_secteur")["portarbre"].agg(lambda x: x.mode().max()).reset_index()
    mode_df.columns = ["adr_secteur", "mode_portarbre"]
    for index, row in mode_df.iterrows():
        feat_value = row["adr_secteur"]
        mode_value = row["mode_portarbre"]
        X_copy.loc[X_copy["adr_secteur"] == feat_value, "portarbre"] = \
            X_copy.loc[X_copy["adr_secteur"] == feat_value, "portarbre"].fillna(value=mode_value)
        X_copy[["portarbre", "adr_secteur"]] = X_copy[["portarbre", "adr_secteur"]]
    return X_copy[["portarbre", "adr_secteur"]]

def portarbre_processor():
    return make_pipeline(
        FunctionTransformer(portarbre_transformer, feature_names_out=portarbre_name),
        )

def raison_name(function_transformer, feature_names_in):
    return ["raisondeplantation", "sous_categorie"]  # feature names out

def raison_transformer(X):
    X_copy = X.copy()
    mode_df = X_copy.groupby("sous_categorie")["raisondeplantation"].agg(lambda x: x.mode().max()).reset_index()
    mode_df.columns = ["sous_categorie", "mode_raison"]
    for index, row in mode_df.iterrows():
        feat_value = row["sous_categorie"]
        mode_value = row["mode_raison"]
        X_copy.loc[X_copy["sous_categorie"] == feat_value, "raisondeplantation"] = \
            X_copy.loc[X_copy["sous_categorie"] == feat_value, "raisondeplantation"].fillna(value=mode_value)
        X_copy[["raisondeplantation", "sous_categorie"]] = X_copy[["raisondeplantation", "sous_categorie"]]
    return X_copy[["raisondeplantation", "sous_categorie"]]

def raison_processor():
    return make_pipeline(
        FunctionTransformer(raison_transformer, feature_names_out=raison_name),
        )

def collect_name(function_transformer, feature_names_in):
    return ["collectivite", "sous_categorie"]  # feature names out

def collect_transformer(X):
    X_copy = X.copy()
    mode_df = X_copy.groupby("sous_categorie")["collectivite"].agg(lambda x: x.mode().max()).reset_index()
    mode_df.columns = ["sous_categorie", "mode_collec"]
    for index, row in mode_df.iterrows():
        feat_value = row["sous_categorie"]
        mode_value = row["mode_collec"]
        X_copy.loc[X_copy["sous_categorie"] == feat_value, "collectivite"] = \
            X_copy.loc[X_copy["sous_categorie"] == feat_value, "collectivite"].fillna(value=mode_value)
        X_copy[["collectivite", "sous_categorie"]] = X_copy[["collectivite", "sous_categorie"]]
    return X_copy[["collectivite", "sous_categorie"]]

def collect_processor():
    return make_pipeline(
        FunctionTransformer(collect_transformer, feature_names_out=collect_name),
        )

def haut_name(function_transformer, feature_names_in):
    return ["stadededeveloppement", "hauteurarbre"]  # feature names out

def haut_transformer(X):
    X_copy = X.copy()
    mode_df = X_copy.groupby("stadededeveloppement")["hauteurarbre"].agg(lambda x: x.mode().max()).reset_index()
    mode_df.columns = ["stadededeveloppement", "mode_haut"]
    for index, row in mode_df.iterrows():
        feat_value = row["stadededeveloppement"]
        mode_value = row["mode_haut"]
        X_copy.loc[X_copy["stadededeveloppement"] == feat_value, "hauteurarbre"] = \
            X_copy.loc[X_copy["stadededeveloppement"] == feat_value, "hauteurarbre"].fillna(value=mode_value)
        X_copy[["stadededeveloppement", "hauteurarbre"]] = X_copy[["stadededeveloppement", "hauteurarbre"]]
    return X_copy[["stadededeveloppement", "hauteurarbre"]]

def haut_processor():
    return make_pipeline(
        FunctionTransformer(haut_transformer, feature_names_out=haut_name),
        )

def struc_name(function_transformer, feature_names_in):
    return ["structure", "sous_categorie"]  # feature names out

def struc_transformer(X):
    X_copy = X.copy()
    mode_df = train_feat.groupby("sous_categorie")["structure"].agg(lambda x: x.mode().max()).reset_index()
    mode_df.columns = ["sous_categorie", "mode_struc"]
    for index, row in mode_df.iterrows():
        feat_value = row["sous_categorie"]
        mode_value = row["mode_struc"]
        train_feat.loc[train_feat["sous_categorie"] == feat_value, "structure"] = \
            train_feat.loc[train_feat["sous_categorie"] == feat_value, "structure"].fillna(value=mode_value)
        X_copy[["structure", "sous_categorie"]] = X_copy[["structure", "sous_categorie"]]
    return X_copy[["structure", "sous_categorie"]]

def struc_processor():
    return make_pipeline(
        FunctionTransformer(struc_transformer, feature_names_out=struc_name),
        )

def adr_secteur_name(function_transformer, feature_names_in):
    return ["adr_secteur"]  # feature names out

def transf_adr_secteur(X):
    X_copy = X.copy()
    conditions = [
        (X_copy["adr_secteur"] == 1),
        (X_copy["adr_secteur"] == 2),
        (X_copy["adr_secteur"] == 3),
        (X_copy["adr_secteur"] == 4),
        (X_copy["adr_secteur"] == 5),
        (X_copy["adr_secteur"] == 6),
        (X_copy["adr_secteur"] > 6),
    ]
    categories = [1, 2, 3, 4, 5, 6, 100]
    X_copy["adr_secteur"] = np.select(conditions, categories, default='Unknown')
    X_copy['adr_secteur'] = X_copy['adr_secteur'].apply(str)
    return X_copy[["adr_secteur"]]

adr_secteur_pipeline = make_pipeline(
    FunctionTransformer(transf_adr_secteur, feature_names_out=adr_secteur_name)
)

# Defining the preprocessing steps
preprocessing_initial = ColumnTransformer([
        ("create_lat_long_points", geo_point_processor(), ["geo_point_2d"]),
        ("fill_na_state_dev", stad_dev_processor(), ["stadededeveloppement"]),
        ("fill_na_portarbre", portarbre_processor(), ["portarbre", "adr_secteur"]),
        ("fill_na_raison", raison_processor(), ["raisondeplantation", "sous_categorie"])],
        remainder='passthrough',
        verbose_feature_names_out=False
)

preprocessing_sec = ColumnTransformer([
        ("fill_na_collectivite", collect_processor(), ["collectivite", "sous_categorie"]),
        ("fill_na_haut", haut_processor(), ["hauteurarbre", "stadededeveloppement"])],
        remainder='passthrough',
        verbose_feature_names_out=False
)

preprocessing_thr = ColumnTransformer([
        ("state_dev", ord_stad_pipeline, ["stadededeveloppement"]),
        ("haut_ord", ord_haut_pipeline, ["hauteurarbre"]),
        ("fill_na_struc", struc_processor(), ["structure", "sous_categorie"])],
        remainder='passthrough',
        verbose_feature_names_out=False
)

preprocessing_adr_secteur = ColumnTransformer(
	[("adr_secteur_obj", adr_secteur_pipeline, ['adr_secteur'])],
	remainder='passthrough',
	verbose_feature_names_out=False
)

preprocessing_cat = ColumnTransformer(
    [("cat", cat_pipeline, ["sous_categorie", "raisondeplantation", "collectivite",
                            "portarbre", "structure", "adr_secteur"])],
    remainder='passthrough',
    verbose_feature_names_out=False
)

class DropColumnsTransformer:
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns)
    
    def get_feature_names_out(self, input_features=None):
        return [col for col in input_features if col not in self.columns]

# Define the columns you want to drop
columns_to_drop = ["elem_point_id", "code", "nom", "genre", "genre_desc", "categorie",
                   "categorie_desc", "sous_categorie_desc", "code_parent", "code_parent_desc",
                   "bien_reference", "genre_bota", "espece", "variete", "equipe", "remarques",
                   "courrier", "identifiantplu", "typeimplantationplu", "intituleprotectionplu",
                   "anneeabattage", "essouchement", "diametrearbre", "causeabattage",
                   "stationmetro", "forme", "typenature", "traitementchenilles"]

# Transformer to drop those columns
preprocessing_drop_columns = DropColumnsTransformer(columns=columns_to_drop)

# Combine all transformers in a pipeline
preprocessor = make_pipeline(
    preprocessing_initial,
    preprocessing_sec,
    preprocessing_thr,
    preprocessing_adr_secteur,
    preprocessing_cat,
    preprocessing_drop_columns
)

In [75]:
X_train = preprocessor.fit_transform(train_feat)
X_train.shape

(23804, 29)

In [38]:
X_train.head()

Unnamed: 0,sous_categorie_ESP065,sous_categorie_ESP151,sous_categorie_ESP174,sous_categorie_ESP187,raisondeplantation_Existant,raisondeplantation_Nouveau,raisondeplantation_Remplacement,collectivite_Grenoble Alpes Métropole,collectivite_Ville de Grenoble,portarbre_Architecturé,...,adr_secteur_100,adr_secteur_2,adr_secteur_3,adr_secteur_4,adr_secteur_5,adr_secteur_6,stadededeveloppement,hauteurarbre,lat,lon
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.641925,-0.221898
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.968862,1.223896
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,-1.252971,0.593094
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,-1.30898,-1.573518
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,-0.534798,-1.87438


In [39]:
preprocessor

In [40]:
preprocessor.get_feature_names_out()

['sous_categorie_ESP065',
 'sous_categorie_ESP151',
 'sous_categorie_ESP174',
 'sous_categorie_ESP187',
 'raisondeplantation_Existant',
 'raisondeplantation_Nouveau',
 'raisondeplantation_Remplacement',
 'collectivite_Grenoble Alpes Métropole',
 'collectivite_Ville de Grenoble',
 'portarbre_Architecturé',
 'portarbre_Libre',
 'portarbre_Pleureur',
 'portarbre_Pyramidale',
 'portarbre_Semi-libre',
 'structure_CCAS',
 'structure_Ecole',
 'structure_Métropole de Grenoble',
 'structure_Ville de Grenoble',
 'adr_secteur_1',
 'adr_secteur_100',
 'adr_secteur_2',
 'adr_secteur_3',
 'adr_secteur_4',
 'adr_secteur_5',
 'adr_secteur_6',
 'stadededeveloppement',
 'hauteurarbre',
 'lat',
 'lon']

In [41]:
y_train = train_target
y_train

0        2017.0
1        2015.0
2        1975.0
3        1999.0
4        1994.0
          ...  
23799    2014.0
23800    1978.0
23801    2017.0
23802    2000.0
23803    2015.0
Name: anneedeplantation, Length: 23804, dtype: float64

## DummyRegressor

In [42]:
dummy_reg = DummyRegressor(strategy="median")
dummy_reg.fit(X_train, y_train)

In [43]:
X_test = preprocessor.transform(test_set)
y_pred = dummy_reg.predict(X_test)
y_pred

array([1987., 1987., 1987., ..., 1987., 1987., 1987.])

In [44]:
y_test = test_set["anneedeplantation"].copy().reset_index(drop=True)

dummy_mse_test = mean_squared_error(y_test, y_pred)
print('test dummy MSE: {0}'.format(dummy_mse_test))

test dummy MSE: 335.04200268817203


### Model 1

In [45]:
model_lin = LinearRegression()
model_lin.fit(X_train, y_train)

In [46]:
X_test = preprocessor.transform(test_set)
y_pred = model_lin.predict(X_test)
y_pred

array([1980.21875 , 1996.640625, 1996.09375 , ..., 1986.015625,
       2006.265625, 1993.984375])

In [47]:
mse_test = mean_squared_error(y_test, y_pred)
print('test linear regression MSE: {0}'.format(mse_test))

test linear regression MSE: 3.2627847788017293e+22


### Model 2

In [48]:
model_dtr = DecisionTreeRegressor(random_state=42)
model_dtr.fit(X_train, y_train)

In [49]:
y_pred_dtr = model_dtr.predict(X_test)
mse_dtr_test = mean_squared_error(y_test, y_pred_dtr)
print('test MSE for DTR: {0}'.format(mse_dtr_test))

y_pred_train = model_dtr.predict(X_train)
mse_dtr_train = mean_squared_error(y_train, y_pred_train)
print('training MSE for DTR: {0}'.format(mse_dtr_train))

rmse_dtr_test = np.sqrt(mse_dtr_test)
print('RMSE for DTR: {0}'.format(rmse_dtr_test))

mean  = np.mean(y_train)
std = np.std(y_train)
print('Mean (standard deviation) values of target variables in train set for DTR: {0} ({1})'.format(mean, std))

test MSE for DTR: 104.93245967741936
training MSE for DTR: 0.0
RMSE for DTR: 10.243654605531141
Mean (standard deviation) values of target variables in train set for DTR: 1990.4111493866576 (18.193942341692487)


## Overfitting :)))))

### Model 3

In [50]:
model_svr = SVR()
model_svr.fit(X_train, y_train)

In [51]:
y_pred_svr = model_svr.predict(X_test)
mse_svr_test = mean_squared_error(y_test, y_pred_svr)
print('test MSE for SVR: {0}'.format(mse_svr_test))

y_pred_train = model_svr.predict(X_train)
mse_svr_train = mean_squared_error(y_train, y_pred_train)
print('training MSE for SVR: {0}'.format(mse_svr_train))

rmse_svr_test = np.sqrt(mse_svr_test)
print('RMSE for SVR: {0}'.format(rmse_svr_test))

mean  = np.mean(y_train)
std = np.std(y_train)
print('Mean (standard deviation) values of target variables in train set for SVR: {0} ({1})'.format(mean, std))

test MSE for SVR: 194.3439303478614
training MSE for SVR: 196.00928517722758
RMSE for SVR: 13.940729189962102
Mean (standard deviation) values of target variables in train set for SVR: 1990.4111493866576 (18.193942341692487)


# CV scores

In [52]:
lin_reg_scores = cross_val_score(model_lin, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
lin_reg_scores.mean()

-4.9300441592845386e+19

In [53]:
dtr_scores = cross_val_score(model_dtr, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
-dtr_scores.mean()

116.3223346944826

In [54]:
svr_scores = cross_val_score(model_svr, X_train, y_train, cv=10, n_jobs=-1, scoring="neg_mean_squared_error")
-svr_scores.mean()

198.63733143501162

### Parameter Tuning

In [55]:
param_distribs = {
        'svr__kernel': ['linear', 'rbf'],
        'svr__C': loguniform(0.1, 200),
        'svr__gamma': expon(scale=1.0),
    }

svr_pipeline = Pipeline([("preprocessor", preprocessor), ("svr", SVR())])
svr_rnd_search = RandomizedSearchCV(svr_pipeline,
                                    param_distributions=param_distribs,
                                    n_iter=5, cv=3,
                                    scoring="neg_mean_squared_error",
                                    verbose=2,
                                    n_jobs=-1,
                                    random_state=42)

svr_rnd_search.fit(train_feat, train_target)

Fitting 3 folds for each of 5 candidates, totalling 15 fits




[CV] END svr__C=1.7233288831716693, svr__gamma=3.010121430917521, svr__kernel=linear; total time=  41.5s
[CV] END svr__C=1.7233288831716693, svr__gamma=3.010121430917521, svr__kernel=linear; total time=  42.1s




[CV] END svr__C=1.7233288831716693, svr__gamma=3.010121430917521, svr__kernel=linear; total time=  43.2s




[CV] END svr__C=0.32729742670534195, svr__gamma=0.059838768608680676, svr__kernel=rbf; total time=  54.3s




[CV] END svr__C=0.32729742670534195, svr__gamma=0.059838768608680676, svr__kernel=rbf; total time=  55.6s




[CV] END svr__C=37.47860169570321, svr__gamma=0.9084469696321253, svr__kernel=rbf; total time= 1.0min




[CV] END svr__C=37.47860169570321, svr__gamma=0.9084469696321253, svr__kernel=rbf; total time= 1.0min




[CV] END svr__C=37.47860169570321, svr__gamma=0.9084469696321253, svr__kernel=rbf; total time= 1.1min




[CV] END svr__C=1.2635200422215125, svr__gamma=0.15416196746656105, svr__kernel=linear; total time=  38.8s
[CV] END svr__C=1.2635200422215125, svr__gamma=0.15416196746656105, svr__kernel=linear; total time=  38.3s




[CV] END svr__C=1.2635200422215125, svr__gamma=0.15416196746656105, svr__kernel=linear; total time=  37.1s
[CV] END svr__C=0.32729742670534195, svr__gamma=0.059838768608680676, svr__kernel=rbf; total time=  51.9s




[CV] END svr__C=0.11693648443440109, svr__gamma=3.503557475158312, svr__kernel=rbf; total time=  50.2s
[CV] END svr__C=0.11693648443440109, svr__gamma=3.503557475158312, svr__kernel=rbf; total time=  49.0s
[CV] END svr__C=0.11693648443440109, svr__gamma=3.503557475158312, svr__kernel=rbf; total time=  48.2s


In [56]:
pd.DataFrame(svr_rnd_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svr__C,param_svr__gamma,param_svr__kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,33.19614,0.419909,9.067646,0.276273,1.723329,3.010121,linear,"{'svr__C': 1.7233288831716693, 'svr__gamma': 3...",-234.109822,-229.802,-238.079681,-233.997168,3.380288,3
1,46.112547,0.500569,15.837194,0.538513,37.478602,0.908447,rbf,"{'svr__C': 37.47860169570321, 'svr__gamma': 0....",-163.493529,-165.91993,-162.584215,-163.999225,1.407964,1
2,36.598493,0.42607,17.345434,1.210474,0.327297,0.059839,rbf,"{'svr__C': 0.32729742670534195, 'svr__gamma': ...",-228.248935,-220.775779,-227.561186,-225.528633,3.372484,2
3,30.068841,0.537277,8.019693,0.192634,1.26352,0.154162,linear,"{'svr__C': 1.2635200422215125, 'svr__gamma': 0...",-234.149337,-229.811919,-238.057089,-234.006115,3.3676,4
4,33.107205,0.663113,16.03646,0.217908,0.116936,3.503557,rbf,"{'svr__C': 0.11693648443440109, 'svr__gamma': ...",-261.965845,-253.547051,-258.005129,-257.839342,3.438957,5


In [57]:
svr_rnd_search.best_params_

{'svr__C': 37.47860169570321,
 'svr__gamma': 0.9084469696321253,
 'svr__kernel': 'rbf'}

In [58]:
selector_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectFromModel(DecisionTreeRegressor(random_state=42),
                                 threshold=0.005)),
    ('svr', SVR(C=svr_rnd_search.best_params_["svr__C"],
                gamma=svr_rnd_search.best_params_["svr__gamma"],
                kernel=svr_rnd_search.best_params_["svr__kernel"])),
])

In [60]:
selector_rmses = cross_val_score(selector_pipeline,
                                  train_feat,
                                  train_target,
                                  scoring="neg_mean_squared_error",
                                  cv=5,
                                  n_jobs=-1)
pd.Series(selector_rmses).describe()



count      5.000000
mean    -167.406293
std       11.319892
min     -182.599797
25%     -175.924434
50%     -162.178958
75%     -160.509200
max     -155.819076
dtype: float64

In [78]:
final_model = svr_rnd_search.best_estimator_
final_model

In [79]:
y_test

0       1999.0
1       2012.0
2       2006.0
3       2006.0
4       1995.0
         ...  
5947    1997.0
5948    2004.0
5949    1985.0
5950    2018.0
5951    1999.0
Name: anneedeplantation, Length: 5952, dtype: float64

In [80]:
final_predictions = final_model.predict(test_set)
final_predictions

array([1979.68407091, 2012.12610187, 2006.03478384, ..., 1984.88594328,
       2017.80026014, 1985.9640818 ])

In [65]:
final_y_preds = final_model.predict(test_set)
mse_fin_test = mean_squared_error(y_test, final_y_preds)
print('test MSE for final model: {0}'.format(mse_fin_test))

final_y_train_preds = final_model.predict(train_set)
mse_fin_train = mean_squared_error(y_train, final_y_train_preds)
print('training MSE for final model: {0}'.format(mse_fin_train))

rmse_fin_test = np.sqrt(mse_fin_test)
print('RMSE for final model: {0}'.format(rmse_fin_test))

mean  = np.mean(y_train)
std = np.std(y_train)
print('Mean (standard deviation) values of target variables in train set for final model: {0} ({1})'.format(mean, std))

test MSE for final model: 155.40052929179888
training MSE for final model: 141.6452640825747
RMSE for final model: 12.465974863274788
Mean (standard deviation) values of target variables in train set for final model: 1990.4111493866576 (18.193942341692487)


In [120]:
predictions_for_new_data = final_model.predict(anne_null)
predictions_for_new_data

SyntaxError: invalid syntax (<unknown>, line 1)