# La Compagnie du Vent Challenge 
1.first try

In [1]:
# pacakge import
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

import datetime
import numpy as np
from datetime import timedelta
import gc
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

In [2]:
def create_df_parc_data(folder_adr,list_num_parc,list_date_parc):
    """
    a function to load (and concatene) informations from Parc_XX.csv
    """
    df_parc_data = pd.DataFrame()
    for num_parc in list_num_parc:
        for date_parc in list_date_parc:
            df_parc_data = df_parc_data.append(pd.read_csv(folder_adr+"//Parc%s_%s.csv"%(num_parc,date_parc),sep=";",decimal=','),ignore_index=True)
    df_parc_data["Date"] = pd.to_datetime(df_parc_data["Date"],format = "%d/%m/%Y %H:%M")
    return df_parc_data

def load_parc_data(folder_adr,list_num_parc,list_date_parc,list_col_parc_data_to_keep):
    """
    keep only fonctionnement == 1 and selection columns
    """
    df_parc_data = create_df_parc_data(folder_adr,list_num_parc,list_date_parc)
    # keep only fonctionnment = 1
    df_parc_data =  df_parc_data[df_parc_data["Fonctionnement"]==1]
    print(df_parc_data.shape)
    return df_parc_data[list_col_parc_data_to_keep]

def add_feature_timestamp(df_parc_data):
    """
    create time features
    """
    df_parc_data["Month"] = df_parc_data["Date"].dt.month
    df_parc_data["Day"] = df_parc_data["Date"].dt.day
    df_parc_data["Hour"] = df_parc_data["Date"].dt.hour
    df_parc_data["Weekday"] = df_parc_data["Date"].dt.weekday
    df_parc_data["Date_hour_int"] = df_parc_data["Date"].dt.year*10**6  + df_parc_data["Date"].dt.month*10**4 +\
    df_parc_data["Date"].dt.day*100 + df_parc_data["Date"].dt.hour
    df_parc_data["nb_hour"] = df_parc_data["Date"].apply(lambda x: np.floor((x- datetime.datetime(2015,1,1)).total_seconds()/3600))
    return 
 
def add_feature_state(df_parc_data):
    """
    create state features
    """
    df_parc_data["State_pause"] = 0  
    df_parc_data.loc[df_parc_data["State"]==2,"State_pause"]=1
    df_parc_data["State_ambiant"] = 0
    df_parc_data.loc[df_parc_data["State"]==999,"State_ambiant"]=1
    df_parc_data["state_pause_ambiant"] = df_parc_data["State_pause"] + df_parc_data["State_ambiant"]
    return

def min_to_hour(df_parc_data):
    """
    convert minute dataframe to hourly dataframe
    """
    df_parc_data_hour = df_parc_data.groupby(["Date_hour_int","Eolienne"]).mean()
    return df_parc_data_hour.reset_index()

def get_eolienne_list(df_parc_data):
    """
    get all eolienne name in the df
    """
    return df_parc_data["Eolienne"].drop_duplicates().tolist()

def create_df_meteo_from_list_grille(folder_adr,list_grille):
    """
    a function to load (and concatenate) informations from PrevMeteo_GrilleXX.xlsx
    """
    df_meteo  = pd.DataFrame()
    for grille_C in list_grille:
        df_meteo_tmp =  pd.read_excel(folder_adr +'/PrevMeteo_Grille%s.xlsx'%(grille_C),sep=';')
        df_meteo_tmp["grille"] = grille_C
        df_meteo = df_meteo.append(df_meteo_tmp,ignore_index=True)
    df_meteo["date"] = pd.to_datetime(df_meteo["date"],format = "%Y-%m-%d %H:%M:%S")
    df_meteo.rename(columns= {"date":"Date"},inplace=True)
    return df_meteo

def meteo_grill_merge(df_meteo,feature_list,join_key):
    """
    merge meteo data by grille
    """
    grille_list = df_meteo["grille"].drop_duplicates().tolist() 
    all_feature_list = feature_list + join_key
    grille_name = grille_list[0]
    index = df_meteo["grille"] == grille_name
    df_meteo_merged = df_meteo.loc[index,all_feature_list]
    df_meteo_merged.columns =  [x +"_"+ str(grille_name) for x in feature_list] + join_key
    if len(grille_list)==1:
        return df_meteo_merged
    for grille_name in grille_list[1:]:
        index = df_meteo["grille"] == grille_name
        df_meteo_merged_p = df_meteo.loc[index,all_feature_list]
        df_meteo_merged_p.columns =  [x +"_"+ str(grille_name) for x in feature_list] + join_key
        df_meteo_merged =  pd.merge(df_meteo_merged,df_meteo_merged_p,on="Date_hour_int",how="left")
    return df_meteo_merged

# define new objective fonction for xgboost
def fair_obj(preds, dtrain):
    """
    fair_obj function to optimize approximatively MAE
    """
    fair_constant = 30
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def mean_absolute_err(y_true, y_pred):
    """
    MAE Metric
    """
    y_true = y_true
    y_pred= y_pred.get_label()
    return "MAE",np.mean(np.abs((y_true - y_pred))) 

def time_zone(df_meteo):
    """
    Aline the 2 time zone in the meteo file and production file
    """
    df_meteo["Date"] = df_meteo["Date"] + timedelta(hours=1)
    df_meteo_new = df_meteo[(df_meteo["fc_hor"]>=24)&(df_meteo["fc_hor"]<=47)]
    del(df_meteo)
    gc.collect()
    return df_meteo_new

def meteo_delta_feature(df_meteo):
    """
    generate meteo delta feature
    """
    df_meteo["vit_100_delta"] =df_meteo["vit_100"] - df_meteo["vit_100"].shift(1)
    df_meteo["vit_10_delta"] =df_meteo["vit_10"] - df_meteo["vit_10"].shift(1)
    df_meteo["dir_100_delta"] =df_meteo["dir_100"] - df_meteo["dir_100"].shift(1)
    df_meteo["dir_10_delta"] =df_meteo["dir_10"] - df_meteo["dir_10"].shift(1)
    return 

def submission_generation(res,save_adr):
    """
    generate submission file
    """
    submission = pd.read_csv("..//data//submit_benchmark.csv",sep=";")
    submission["Date"] = pd.to_datetime(submission["Date"],format = "%Y-%m-%d %H:%M:%S")
    add_feature_timestamp(submission)
    submission =  submission[["Date_hour_int","Eolienne","Date"]]
    submission=pd.merge(submission,res,on=["Date_hour_int","Eolienne"],how="left")[["Date","Eolienne","pred"]]
    submission.to_csv(save_adr,sep=';',header=True,index = False)
    

    

DATA Loading

In [3]:
# parc data 
folder_adr = "..//data"
list_num_parc = [1,2,3]
list_date_parc = ["2015","2016","2017"]
list_col_parc_data_to_keep = ["Date","Eolienne","Production","Fonctionnement","Catégorie","State","Etat","Vent"]
parc_data_min = load_parc_data(folder_adr,list_num_parc,list_date_parc,list_col_parc_data_to_keep)

#forecast data
list_grille= [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
##for this example notebook, we will only take one "grille" (=9) 
df_meteo = create_df_meteo_from_list_grille(folder_adr,list_grille)

(12910771, 17)


Feature engineering

In [4]:
# add feature and convert to hourly data
add_feature_timestamp(parc_data_min)
add_feature_state(parc_data_min)
parc_data_hour =  min_to_hour(parc_data_min)
df_meteo = time_zone(df_meteo)
meteo_delta_feature(df_meteo)
add_feature_timestamp(df_meteo)

# we don't need the minute data any more
del(parc_data_min)
gc.collect()

eolienne_list = get_eolienne_list(parc_data_hour)
feature_list = ["vit_100","vit_10",'dir_100','dir_10',"vit_100_delta","vit_10_delta",'dir_100_delta','dir_10_delta',"fc_hor"]
join_key = ["Date_hour_int"]
meteo_grill_merged = meteo_grill_merge(df_meteo,feature_list,join_key)

Data merge

In [5]:
parc_data_hour = pd.merge(parc_data_hour,meteo_grill_merged,on=["Date_hour_int"],how="left")

Train/test validation split 

In [6]:
date_split_test = 2017010100
parc_data_hour_train = parc_data_hour[parc_data_hour["Date_hour_int"]<date_split_test]
parc_data_hour_test = parc_data_hour[parc_data_hour["Date_hour_int"]>=date_split_test]

parc_data_hour_train.dropna(inplace=True)
print(parc_data_hour_train.shape)
print(parc_data_hour_test.shape)

parc_data_hour_train.sort_values("Date_hour_int",inplace=True)
parc_data_hour_train_train = parc_data_hour_train.iloc[:94618,:]
parc_data_hour_train_test  = parc_data_hour_train.iloc[94618:189236,:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


(189214, 158)
(26884, 158)


In [9]:
# feature use in model 
list_col_model_vitesse = ["vit_100_11","vit_10_11",'dir_100_11','dir_10_11',
                  "vit_100_10","vit_10_10",'dir_100_10','dir_10_10',
                  "vit_100_12","vit_10_12",'dir_100_12','dir_10_12',
                  "vit_100_8","vit_10_8",'dir_100_8','dir_10_8',
                  "vit_100_7","vit_10_7",'dir_100_7','dir_10_7',
                  "vit_100_6","vit_10_6",'dir_100_6','dir_10_6',
                  "vit_100_1","vit_10_1",'dir_100_1','dir_10_1',
                  "vit_100_9","vit_10_9",'dir_100_9','dir_10_9',
                  "vit_100_2","vit_10_2",'dir_100_2','dir_10_2',
                  "vit_100_3","vit_10_3",'dir_100_3','dir_10_3',
                  "vit_100_4","vit_10_4",'dir_100_4','dir_10_4',
                  "vit_100_5","vit_10_5",'dir_100_5','dir_10_5',
                  "vit_100_13","vit_10_13",'dir_100_13','dir_10_13',
                  "vit_100_14","vit_10_14",'dir_100_14','dir_10_14',
                  "vit_100_15","vit_10_15",'dir_100_15','dir_10_15',
                  "vit_100_16","vit_10_16",'dir_100_16','dir_10_16'
                 ]
list_col_model_delta_vitesse= ["vit_100_delta_11","vit_10_delta_11",'dir_100_delta_11','dir_10_delta_11',
                               "vit_100_delta_10","vit_10_delta_10",'dir_100_delta_10','dir_10_delta_10',
                               "vit_100_delta_12","vit_10_delta_12",'dir_100_delta_12','dir_10_delta_12',
                               "vit_100_delta_8","vit_10_delta_8",'dir_100_delta_8','dir_10_delta_8',
                               "vit_100_delta_7","vit_10_delta_7",'dir_100_delta_7','dir_10_delta_7',
                               "vit_100_delta_6","vit_10_delta_6",'dir_100_delta_6','dir_10_delta_6',
                               "vit_100_delta_1","vit_10_delta_1",'dir_100_delta_1','dir_10_delta_1',
                               "vit_100_delta_9","vit_10_delta_9",'dir_100_delta_9','dir_10_delta_9',
                               "vit_100_delta_2","vit_10_delta_2",'dir_100_delta_2','dir_10_delta_2',
                               "vit_100_delta_3","vit_10_delta_3",'dir_100_delta_3','dir_10_delta_3',
                               "vit_100_delta_4","vit_10_delta_4",'dir_100_delta_4','dir_10_delta_4',
                               "vit_100_delta_5","vit_10_delta_5",'dir_100_delta_5','dir_10_delta_5',
                               "vit_100_delta_13","vit_10_delta_13",'dir_100_delta_13','dir_10_delta_13',
                               "vit_100_delta_14","vit_10_delta_14",'dir_100_delta_14','dir_10_delta_14',
                               "vit_100_delta_15","vit_10_delta_15",'dir_100_delta_15','dir_10_delta_15',
                               "vit_100_delta_16","vit_10_delta_16",'dir_100_delta_16','dir_10_delta_16'    
]
col_target ="Production"

In [10]:
our_params = {'eta': 0.02,  'subsample': 0.99, 'colsample_bytree': 0.95, 
              'objective': 'reg:linear', 'max_depth':20, 'min_child_weight':20}

In [11]:
list_col_model = list_col_model_vitesse+list_col_model_delta_vitesse

In [None]:
parc_data_hour_train_test["xgb_site"]=np.nan
for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train_train["Eolienne"]==eol_name
    Eolienne_index_test = parc_data_hour_train_test["Eolienne"]==eol_name
    xgdtrain = xgb.DMatrix(parc_data_hour_train_train[Eolienne_index][list_col_model], parc_data_hour_train_train[Eolienne_index][col_target]) 
    xgdtest = xgb.DMatrix(parc_data_hour_train_test[Eolienne_index_test][list_col_model], parc_data_hour_train_test[Eolienne_index_test][col_target])
    evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
    print("mae optimiser ")
    xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 250,verbose_eval=50,obj=fair_obj,feval=mean_absolute_err)
    print("rmse optimiser")
    xgb_model_2 = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 250,verbose_eval=50,feval=mean_absolute_err)
    parc_data_hour_train_test["xgb_site"][Eolienne_index_test]=xgb_model.predict(xgdtest)

In [None]:
parc_data_hour_train_test["xgb_all"]=np.nan    
xgdtrain = xgb.DMatrix(parc_data_hour_train_train[list_col_model], parc_data_hour_train_train[col_target]) 
xgdtest = xgb.DMatrix(parc_data_hour_train_test[list_col_model], parc_data_hour_train_test[col_target])
evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
print("mae optimiser ")
xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 150,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
parc_data_hour_train_test["xgb_all"] = xgb_model.predict(xgdtest)

In [None]:
mean_absolute_error(parc_data_hour_train_test[col_target],(0.3*parc_data_hour_train_test["xgb_all"]+1.7*parc_data_hour_train_test["xgb_site"])/2)

In [None]:
parc_data_hour_test.fc_hor_10.drop_duplicates()

In [14]:
# second validation
res = parc_data_hour_test[["Date_hour_int","Eolienne"]]
res["pred"] =np.nan

for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train["Eolienne"]==eol_name
    xgdmat = xgb.DMatrix(parc_data_hour_train[Eolienne_index][list_col_model], parc_data_hour_train[Eolienne_index][col_target])
    xgb_model = xgb.train(params = our_params ,dtrain = xgdmat, num_boost_round = 250,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
    
    Eolienne_index = parc_data_hour_test["Eolienne"]==eol_name
    res["pred"][Eolienne_index]=xgb_model.predict(xgb.DMatrix(parc_data_hour_test[Eolienne_index][list_col_model]))
save_adr = "..//submission//sub_10_24_1_delta_feature.csv"
submission_generation(res,save_adr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_ob

In [None]:
res = parc_data_hour_test[["Date_hour_int","Eolienne"]]
res["pred"] =np.nan

xgdmat = xgb.DMatrix(parc_data_hour_train[list_col_model], parc_data_hour_train[col_target])
xgb_model = xgb.train(params = our_params ,dtrain = xgdmat, num_boost_round = 150,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
res["pred"]=xgb_model.predict(xgb.DMatrix(parc_data_hour_test[list_col_model]))

save_adr = "..//submission//sub_09_15_all.csv"
submission_generation(res,save_adr)

In [None]:
submission_1 = pd.read_csv("..//submission//sub_09_15_3.csv",sep=";")
submission_2 = pd.read_csv("..//submission//sub_09_15_all.csv",sep=";")

In [None]:
submission_1["pred"] =0.85*submission_1["pred"] + 0.15*submission_2["pred"]

In [None]:
submission_1.to_csv("..//submission//sub_09_15_combine.csv",sep=';',header=True,index = False)

In [6]:
## part II fair_constant tunning

In [13]:
# the defaut value is set to 30
parc_data_hour_train_test["xgb_site"]=np.nan
for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train_train["Eolienne"]==eol_name
    Eolienne_index_test = parc_data_hour_train_test["Eolienne"]==eol_name
    xgdtrain = xgb.DMatrix(parc_data_hour_train_train[Eolienne_index][list_col_model], parc_data_hour_train_train[Eolienne_index][col_target]) 
    xgdtest = xgb.DMatrix(parc_data_hour_train_test[Eolienne_index_test][list_col_model], parc_data_hour_train_test[Eolienne_index_test][col_target])
    evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
    print("mae optimiser ")
    xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 250,verbose_eval=25,obj=fair_obj,feval=mean_absolute_err)
    parc_data_hour_train_test["xgb_site"][Eolienne_index_test]=xgb_model.predict(xgdtest)

mae optimiser 
[0]	eval-rmse:662.786	train-rmse:741.716	eval-MAE:442.22	train-MAE:513.92


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[25]	eval-rmse:380.519	train-rmse:420.479	eval-MAE:250.095	train-MAE:279.487
[50]	eval-rmse:318.301	train-rmse:344.436	eval-MAE:212.465	train-MAE:227.507
[75]	eval-rmse:298.85	train-rmse:318.76	eval-MAE:200.258	train-MAE:208.542
[100]	eval-rmse:291.825	train-rmse:306.667	eval-MAE:196.002	train-MAE:199.371
[125]	eval-rmse:289.012	train-rmse:300.297	eval-MAE:194.335	train-MAE:194.141
[150]	eval-rmse:287.662	train-rmse:296.242	eval-MAE:193.818	train-MAE:190.565
[175]	eval-rmse:286.898	train-rmse:293.53	eval-MAE:193.599	train-MAE:188.15
[200]	eval-rmse:286.552	train-rmse:291.397	eval-MAE:193.547	train-MAE:186.192
[225]	eval-rmse:286.457	train-rmse:289.561	eval-MAE:193.631	train-MAE:184.438
[249]	eval-rmse:286.469	train-rmse:287.899	eval-MAE:193.755	train-MAE:182.842
mae optimiser 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


[0]	eval-rmse:652.837	train-rmse:712.67	eval-MAE:422.411	train-MAE:487.882
[25]	eval-rmse:427.901	train-rmse:456.539	eval-MAE:268.631	train-MAE:296.527
[50]	eval-rmse:367.898	train-rmse:382.878	eval-MAE:234.7	train-MAE:248.289
[75]	eval-rmse:348.475	train-rmse:355.755	eval-MAE:223.306	train-MAE:228.905
[100]	eval-rmse:342.34	train-rmse:344.864	eval-MAE:218.831	train-MAE:219.45
[125]	eval-rmse:340.072	train-rmse:339.466	eval-MAE:217.2	train-MAE:214.079
[150]	eval-rmse:339.091	train-rmse:336.29	eval-MAE:216.517	train-MAE:210.58
[175]	eval-rmse:339.182	train-rmse:334.298	eval-MAE:216.539	train-MAE:208.344
[200]	eval-rmse:339.327	train-rmse:332.64	eval-MAE:216.571	train-MAE:206.417
[225]	eval-rmse:339.325	train-rmse:330.887	eval-MAE:216.551	train-MAE:204.335
[249]	eval-rmse:339.379	train-rmse:329.333	eval-MAE:216.591	train-MAE:202.358
mae optimiser 
[0]	eval-rmse:672.163	train-rmse:729.003	eval-MAE:445.134	train-MAE:507.586
[25]	eval-rmse:439.002	train-rmse:460.331	eval-MAE:279.098	train-M

[150]	eval-rmse:337.578	train-rmse:325.236	eval-MAE:217.796	train-MAE:207.705
[175]	eval-rmse:337.766	train-rmse:322.995	eval-MAE:217.957	train-MAE:205.113
[200]	eval-rmse:338.087	train-rmse:321.299	eval-MAE:218.138	train-MAE:203.15
[225]	eval-rmse:338.306	train-rmse:319.947	eval-MAE:218.234	train-MAE:201.564
[249]	eval-rmse:338.443	train-rmse:318.674	eval-MAE:218.443	train-MAE:200.144


In [12]:
parc_data_hour_train_test["xgb_site"]=np.nan
for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train_train["Eolienne"]==eol_name
    Eolienne_index_test = parc_data_hour_train_test["Eolienne"]==eol_name
    xgdtrain = xgb.DMatrix(parc_data_hour_train_train[Eolienne_index][list_col_model], parc_data_hour_train_train[Eolienne_index][col_target]) 
    xgdtest = xgb.DMatrix(parc_data_hour_train_test[Eolienne_index_test][list_col_model], parc_data_hour_train_test[Eolienne_index_test][col_target])
    evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
    print("mae optimiser ")
    xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 250,verbose_eval=25,obj=fair_obj,feval=mean_absolute_err)
    parc_data_hour_train_test["xgb_site"][Eolienne_index_test]=xgb_model.predict(xgdtest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


mae optimiser 
[0]	eval-rmse:660.275	train-rmse:738.831	eval-MAE:441.005	train-MAE:512.232
[25]	eval-rmse:377.218	train-rmse:418.875	eval-MAE:247.487	train-MAE:278.123
[50]	eval-rmse:311.68	train-rmse:339.142	eval-MAE:208.943	train-MAE:224.837
[75]	eval-rmse:289.236	train-rmse:309.243	eval-MAE:194.67	train-MAE:202.479
[100]	eval-rmse:281.431	train-rmse:295.967	eval-MAE:189.811	train-MAE:192.057
[125]	eval-rmse:278.297	train-rmse:288.889	eval-MAE:187.722	train-MAE:185.939
[150]	eval-rmse:276.839	train-rmse:284.362	eval-MAE:187.005	train-MAE:181.761
[175]	eval-rmse:276.025	train-rmse:281.077	eval-MAE:186.528	train-MAE:178.652
[200]	eval-rmse:275.585	train-rmse:278.449	eval-MAE:186.422	train-MAE:176.293
[225]	eval-rmse:275.089	train-rmse:275.835	eval-MAE:186.142	train-MAE:173.783
[249]	eval-rmse:274.815	train-rmse:273.784	eval-MAE:185.935	train-MAE:171.802


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


mae optimiser 
[0]	eval-rmse:653.021	train-rmse:712.697	eval-MAE:422.721	train-MAE:487.955
[25]	eval-rmse:425.193	train-rmse:455.24	eval-MAE:265.842	train-MAE:294.253
[50]	eval-rmse:364.321	train-rmse:380.306	eval-MAE:231.551	train-MAE:244.649
[75]	eval-rmse:344.218	train-rmse:352.228	eval-MAE:219.659	train-MAE:224.591
[100]	eval-rmse:337.002	train-rmse:340.115	eval-MAE:214.762	train-MAE:214.49
[125]	eval-rmse:333.999	train-rmse:333.768	eval-MAE:212.695	train-MAE:208.454
[150]	eval-rmse:333.095	train-rmse:330.254	eval-MAE:211.928	train-MAE:204.731
[175]	eval-rmse:332.884	train-rmse:328.182	eval-MAE:211.648	train-MAE:202.381
[200]	eval-rmse:332.745	train-rmse:326.233	eval-MAE:211.567	train-MAE:200.016
[225]	eval-rmse:332.595	train-rmse:324.496	eval-MAE:211.642	train-MAE:197.893
[249]	eval-rmse:332.561	train-rmse:322.507	eval-MAE:211.6	train-MAE:195.673
mae optimiser 
[0]	eval-rmse:673.162	train-rmse:730.22	eval-MAE:445.609	train-MAE:508.114
[25]	eval-rmse:438.814	train-rmse:461.797	eval

[150]	eval-rmse:332.324	train-rmse:318.477	eval-MAE:213.194	train-MAE:200.477
[175]	eval-rmse:332.533	train-rmse:316.259	eval-MAE:213.295	train-MAE:197.972
[200]	eval-rmse:332.784	train-rmse:314.442	eval-MAE:213.436	train-MAE:195.995
[225]	eval-rmse:332.893	train-rmse:312.768	eval-MAE:213.546	train-MAE:194.178
[249]	eval-rmse:332.884	train-rmse:310.897	eval-MAE:213.679	train-MAE:192.227


In [14]:
mean_absolute_error(parc_data_hour_train_test[col_target],parc_data_hour_train_test["xgb_site"])

189.54900928834138

In [17]:
def fair_obj(preds, dtrain):
    """
    fair_obj function to optimize approximatively MAE
    """
    fair_constant = 35
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

In [19]:
# the defaut value is set to 30
parc_data_hour_train_test["xgb_site_35"]=np.nan
for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train_train["Eolienne"]==eol_name
    Eolienne_index_test = parc_data_hour_train_test["Eolienne"]==eol_name
    xgdtrain = xgb.DMatrix(parc_data_hour_train_train[Eolienne_index][list_col_model], parc_data_hour_train_train[Eolienne_index][col_target]) 
    xgdtest = xgb.DMatrix(parc_data_hour_train_test[Eolienne_index_test][list_col_model], parc_data_hour_train_test[Eolienne_index_test][col_target])
    evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
    print("mae optimiser ")
    xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 200,verbose_eval=25,obj=fair_obj,feval=mean_absolute_err)
    parc_data_hour_train_test["xgb_site_28"][Eolienne_index_test]=xgb_model.predict(xgdtest)

mae optimiser 
[0]	eval-rmse:658.68	train-rmse:736.918	eval-MAE:440.364	train-MAE:511.558


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[25]	eval-rmse:374.292	train-rmse:414.169	eval-MAE:247.004	train-MAE:275.794
[50]	eval-rmse:315.156	train-rmse:341.409	eval-MAE:210.281	train-MAE:224.589
[75]	eval-rmse:296.515	train-rmse:315.649	eval-MAE:198.611	train-MAE:205.555
[100]	eval-rmse:289.955	train-rmse:303.787	eval-MAE:194.945	train-MAE:196.47
[125]	eval-rmse:287.446	train-rmse:297.125	eval-MAE:193.583	train-MAE:191.008
[150]	eval-rmse:286.444	train-rmse:292.865	eval-MAE:193.152	train-MAE:187.358
[175]	eval-rmse:285.886	train-rmse:289.685	eval-MAE:192.963	train-MAE:184.583
[199]	eval-rmse:285.55	train-rmse:287.196	eval-MAE:192.87	train-MAE:182.414
mae optimiser 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


[0]	eval-rmse:650.364	train-rmse:709.788	eval-MAE:421.163	train-MAE:486.239
[25]	eval-rmse:422.65	train-rmse:450.756	eval-MAE:266.268	train-MAE:293.34
[50]	eval-rmse:364.605	train-rmse:378.75	eval-MAE:232.682	train-MAE:245.162
[75]	eval-rmse:346.365	train-rmse:352.898	eval-MAE:221.216	train-MAE:226.049
[100]	eval-rmse:340.76	train-rmse:342.51	eval-MAE:217.42	train-MAE:216.776
[125]	eval-rmse:339.044	train-rmse:337.137	eval-MAE:216.3	train-MAE:211.328
[150]	eval-rmse:338.52	train-rmse:333.641	eval-MAE:216.059	train-MAE:207.64
[175]	eval-rmse:338.422	train-rmse:331.305	eval-MAE:215.991	train-MAE:205.065
[199]	eval-rmse:338.557	train-rmse:329.66	eval-MAE:216.161	train-MAE:203.205
mae optimiser 
[0]	eval-rmse:670.496	train-rmse:727.064	eval-MAE:444.132	train-MAE:506.363
[25]	eval-rmse:434.277	train-rmse:453.749	eval-MAE:276.537	train-MAE:299.039
[50]	eval-rmse:375.165	train-rmse:379.242	eval-MAE:239.472	train-MAE:246.165
[75]	eval-rmse:357.452	train-rmse:354.351	eval-MAE:228.164	train-MAE:

In [22]:
mean_absolute_error(parc_data_hour_train_test[col_target],parc_data_hour_train_test["xgb_site_28"])

189.03162432428846

In [23]:
# second validation
res = parc_data_hour_test[["Date_hour_int","Eolienne"]]
res["pred"] =np.nan

for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train["Eolienne"]==eol_name
    xgdmat = xgb.DMatrix(parc_data_hour_train[Eolienne_index][list_col_model], parc_data_hour_train[Eolienne_index][col_target])
    xgb_model = xgb.train(params = our_params ,dtrain = xgdmat, num_boost_round = 200,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
    
    Eolienne_index = parc_data_hour_test["Eolienne"]==eol_name
    res["pred"][Eolienne_index]=xgb_model.predict(xgb.DMatrix(parc_data_hour_test[Eolienne_index][list_col_model]))
save_adr = "..//submission//sub_09_24_1_delta_feature_35_200.csv"
submission_generation(res,save_adr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_ob