# La Compagnie du Vent Challenge 
1.first try

In [1]:
# pacakge import
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import numpy as np
from datetime import timedelta
import gc
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

In [2]:
def create_df_parc_data(folder_adr,list_num_parc,list_date_parc):
    """
    a function to load (and concatene) informations from Parc_XX.csv
    """
    df_parc_data = pd.DataFrame()
    for num_parc in list_num_parc:
        for date_parc in list_date_parc:
            df_parc_data = df_parc_data.append(pd.read_csv(folder_adr+"//Parc%s_%s.csv"%(num_parc,date_parc),sep=";",decimal=','),ignore_index=True)
    df_parc_data["Date"] = pd.to_datetime(df_parc_data["Date"],format = "%d/%m/%Y %H:%M")
    return df_parc_data

def load_parc_data(folder_adr,list_num_parc,list_date_parc,list_col_parc_data_to_keep):
    """
    keep only fonctionnement == 1 and selection columns
    """
    df_parc_data = create_df_parc_data(folder_adr,list_num_parc,list_date_parc)
    # keep only fonctionnment = 1
    df_parc_data =  df_parc_data[df_parc_data["Fonctionnement"]==1]
    print(df_parc_data.shape)
    return df_parc_data[list_col_parc_data_to_keep]

def add_feature_timestamp(df_parc_data):
    """
    create time features
    """
    df_parc_data["Month"] = df_parc_data["Date"].dt.month
    df_parc_data["Day"] = df_parc_data["Date"].dt.day
    df_parc_data["Hour"] = df_parc_data["Date"].dt.hour
    df_parc_data["Weekday"] = df_parc_data["Date"].dt.weekday
    df_parc_data["Date_hour_int"] = df_parc_data["Date"].dt.year*10**6  + df_parc_data["Date"].dt.month*10**4 +\
    df_parc_data["Date"].dt.day*100 + df_parc_data["Date"].dt.hour
    df_parc_data["nb_hour"] = df_parc_data["Date"].apply(lambda x: np.floor((x- datetime.datetime(2015,1,1)).total_seconds()/3600))
    return 
 
def add_feature_state(df_parc_data):
    """
    create state features
    """
    df_parc_data["State_pause"] = 0  
    df_parc_data.loc[df_parc_data["State"]==2,"State_pause"]=1
    df_parc_data["State_ambiant"] = 0
    df_parc_data.loc[df_parc_data["State"]==999,"State_ambiant"]=1
    df_parc_data["state_pause_ambiant"] = df_parc_data["State_pause"] + df_parc_data["State_ambiant"]
    return

def min_to_hour(df_parc_data):
    """
    convert minute dataframe to hourly dataframe
    """
    df_parc_data_hour = df_parc_data.groupby(["Date_hour_int","Eolienne"]).mean()
    return df_parc_data_hour.reset_index()

def get_eolienne_list(df_parc_data):
    """
    get all eolienne name in the df
    """
    return df_parc_data["Eolienne"].drop_duplicates().tolist()

def create_df_meteo_from_list_grille(folder_adr,list_grille):
    """
    a function to load (and concatenate) informations from PrevMeteo_GrilleXX.xlsx
    """
    df_meteo  = pd.DataFrame()
    for grille_C in list_grille:
        df_meteo_tmp =  pd.read_excel(folder_adr +'/PrevMeteo_Grille%s.xlsx'%(grille_C),sep=';')
        df_meteo_tmp["grille"] = grille_C
        df_meteo = df_meteo.append(df_meteo_tmp,ignore_index=True)
    df_meteo["date"] = pd.to_datetime(df_meteo["date"],format = "%Y-%m-%d %H:%M:%S")
    df_meteo.rename(columns= {"date":"Date"},inplace=True)
    return df_meteo

def meteo_grill_merge(df_meteo,feature_list,join_key):
    """
    merge meteo data by grille
    """
    grille_list = df_meteo["grille"].drop_duplicates().tolist() 
    all_feature_list = feature_list + join_key
    grille_name = grille_list[0]
    index = df_meteo["grille"] == grille_name
    df_meteo_merged = df_meteo.loc[index,all_feature_list]
    df_meteo_merged.columns =  [x +"_"+ str(grille_name) for x in feature_list] + join_key
    if len(grille_list)==1:
        return df_meteo_merged
    for grille_name in grille_list[1:]:
        index = df_meteo["grille"] == grille_name
        df_meteo_merged_p = df_meteo.loc[index,all_feature_list]
        df_meteo_merged_p.columns =  [x +"_"+ str(grille_name) for x in feature_list] + join_key
        df_meteo_merged =  pd.merge(df_meteo_merged,df_meteo_merged_p,on="Date_hour_int",how="left")
    return df_meteo_merged

# define new objective fonction for xgboost
def fair_obj(preds, dtrain):
    """
    fair_obj function to optimize approximatively MAE
    """
    fair_constant = 30
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def mean_absolute_err(y_true, y_pred):
    """
    MAE Metric
    """
    y_true = y_true
    y_pred= y_pred.get_label()
    return "MAE",np.mean(np.abs((y_true - y_pred))) 

def time_zone(df_meteo):
    """
    Aline the 2 time zone in the meteo file and production file
    """
    df_meteo["Date"] = df_meteo["Date"] + timedelta(hours=1)
    df_meteo_new = df_meteo[(df_meteo["fc_hor"]>=23)&(df_meteo["fc_hor"]<=46)]
    del(df_meteo)
    gc.collect()
    return df_meteo_new

def meteo_delta_feature(df_meteo):
    """
    generate meteo delta feature
    """
    df_meteo["vit_100_delta"] =df_meteo["vit_100"] - df_meteo["vit_100"].shift(1)
    df_meteo["vit_10_delta"] =df_meteo["vit_10"] - df_meteo["vit_10"].shift(1)
    df_meteo["dir_100_delta"] =df_meteo["dir_100"] - df_meteo["dir_100"].shift(1)
    df_meteo["dir_10_delta"] =df_meteo["dir_10"] - df_meteo["dir_10"].shift(1)
    return 

def submission_generation(res,save_adr):
    """
    generate submission file
    """
    submission = pd.read_csv("..//data//submit_benchmark.csv",sep=";")
    submission["Date"] = pd.to_datetime(submission["Date"],format = "%Y-%m-%d %H:%M:%S")
    add_feature_timestamp(submission)
    submission =  submission[["Date_hour_int","Eolienne","Date"]]
    submission=pd.merge(submission,res,on=["Date_hour_int","Eolienne"],how="left")[["Date","Eolienne","pred"]]
    submission.to_csv(save_adr,sep=';',header=True,index = False)
    

    

DATA Loading

In [3]:
# parc data 
folder_adr = "..//data"
list_num_parc = [1,2,3]
list_date_parc = ["2015","2016","2017"]
list_col_parc_data_to_keep = ["Date","Eolienne","Production","Fonctionnement","Catégorie","State","Etat","Vent"]
parc_data_min = load_parc_data(folder_adr,list_num_parc,list_date_parc,list_col_parc_data_to_keep)

#forecast data
list_grille= [6,7,8,10,11,12]
##for this example notebook, we will only take one "grille" (=9) 
df_meteo = create_df_meteo_from_list_grille(folder_adr,list_grille)

(12910771, 17)


Feature engineering

In [4]:
# add feature and convert to hourly data
add_feature_timestamp(parc_data_min)
add_feature_state(parc_data_min)
parc_data_hour =  min_to_hour(parc_data_min)
df_meteo = time_zone(df_meteo)
meteo_delta_feature(df_meteo)
add_feature_timestamp(df_meteo)

# we don't need the minute data any more
del(parc_data_min)
gc.collect()

eolienne_list = get_eolienne_list(parc_data_hour)
feature_list = ["vit_100","vit_10",'dir_100','dir_10',"vit_100_delta","vit_10_delta",'dir_100_delta','dir_10_delta',"fc_hor"]
join_key = ["Date_hour_int"]
meteo_grill_merged = meteo_grill_merge(df_meteo,feature_list,join_key)

Data merge

In [5]:
parc_data_hour = pd.merge(parc_data_hour,meteo_grill_merged,on=["Date_hour_int"],how="left")

Train/test validation split 

In [7]:
date_split_test = 2017010100
parc_data_hour_train = parc_data_hour[parc_data_hour["Date_hour_int"]<date_split_test]
parc_data_hour_test = parc_data_hour[parc_data_hour["Date_hour_int"]>=date_split_test]

parc_data_hour_train.dropna(inplace=True)
print(parc_data_hour_train.shape)
print(parc_data_hour_test.shape)

parc_data_hour_train.sort_values("Date_hour_int",inplace=True)
parc_data_hour_train_train = parc_data_hour_train.iloc[:94618,:]
parc_data_hour_train_test  = parc_data_hour_train.iloc[94618:189236,:]

(189225, 68)
(26884, 68)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
# feature use in model 
list_col_model = ["vit_100_11","vit_10_11",'dir_100_11','dir_10_11',"vit_100_delta_11","vit_10_delta_11",'dir_100_delta_11','dir_10_delta_11',
                  "vit_100_10","vit_10_10",'dir_100_10','dir_10_10',"vit_100_delta_10","vit_10_delta_10",'dir_100_delta_10','dir_10_delta_10',
                  "vit_100_12","vit_10_12",'dir_100_12','dir_10_12',"vit_100_delta_12","vit_10_delta_12",'dir_100_delta_12','dir_10_delta_12',
                  "vit_100_8","vit_10_8",'dir_100_8','dir_10_8',"vit_100_delta_8","vit_10_delta_8",'dir_100_delta_8','dir_10_delta_8',
                  "vit_100_7","vit_10_7",'dir_100_7','dir_10_7',"vit_100_delta_7","vit_10_delta_7",'dir_100_delta_7','dir_10_delta_7',
                  "vit_100_6","vit_10_6",'dir_100_6','dir_10_6',"vit_100_delta_6","vit_10_delta_6",'dir_100_delta_6','dir_10_delta_6'
                 ]
col_target ="Production"

In [17]:
our_params = {'eta': 0.02,  'subsample': 0.99, 'colsample_bytree': 0.95, 
              'objective': 'reg:linear', 'max_depth':20, 'min_child_weight':20}

In [44]:
parc_data_hour_train_test["xgb_site"]=np.nan
for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train_train["Eolienne"]==eol_name
    Eolienne_index_test = parc_data_hour_train_test["Eolienne"]==eol_name
    xgdtrain = xgb.DMatrix(parc_data_hour_train_train[Eolienne_index][list_col_model], parc_data_hour_train_train[Eolienne_index][col_target]) 
    xgdtest = xgb.DMatrix(parc_data_hour_train_test[Eolienne_index_test][list_col_model], parc_data_hour_train_test[Eolienne_index_test][col_target])
    evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
    print("mae optimiser ")
    xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 250,verbose_eval=50,obj=fair_obj,feval=mean_absolute_err)
    #print("rmse optimiser")
    #xgb_model_2 = xgb.train(params = our_params ,dtrain = xgdmat, evals=evallist, num_boost_round = 250,verbose_eval=250,feval=mean_absolute_err)
    parc_data_hour_train_test["xgb_site"][Eolienne_index_test]=xgb_model.predict(xgdtest)

mae optimiser 
[0]	eval-rmse:660.722	train-rmse:739.176	eval-MAE:441.427	train-MAE:512.791


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


[50]	eval-rmse:311.277	train-rmse:338.502	eval-MAE:208.608	train-MAE:224.607
[100]	eval-rmse:281.471	train-rmse:295.967	eval-MAE:189.51	train-MAE:192.869
[150]	eval-rmse:277.404	train-rmse:285.155	eval-MAE:187.194	train-MAE:183.477
[200]	eval-rmse:276.055	train-rmse:279.527	eval-MAE:186.513	train-MAE:178.443
[250]	eval-rmse:275.556	train-rmse:275.53	eval-MAE:186.023	train-MAE:174.625
[299]	eval-rmse:275.314	train-rmse:271.477	eval-MAE:185.759	train-MAE:170.378


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


mae optimiser 
[0]	eval-rmse:654.239	train-rmse:713.269	eval-MAE:424.028	train-MAE:488.541
[50]	eval-rmse:366.017	train-rmse:381.788	eval-MAE:232.549	train-MAE:246.823
[100]	eval-rmse:338.308	train-rmse:342.637	eval-MAE:215.587	train-MAE:217.946
[150]	eval-rmse:335.092	train-rmse:333.78	eval-MAE:212.731	train-MAE:208.99
[200]	eval-rmse:334.828	train-rmse:330.316	eval-MAE:212.361	train-MAE:204.499
[250]	eval-rmse:334.6	train-rmse:327.539	eval-MAE:212.256	train-MAE:200.809
[299]	eval-rmse:334.293	train-rmse:324.854	eval-MAE:212.32	train-MAE:197.92
mae optimiser 
[0]	eval-rmse:673.057	train-rmse:730.027	eval-MAE:445.671	train-MAE:508.096
[50]	eval-rmse:377.412	train-rmse:384.196	eval-MAE:239.995	train-MAE:249.876
[100]	eval-rmse:349.534	train-rmse:344.207	eval-MAE:223.705	train-MAE:219.559
[150]	eval-rmse:345.73	train-rmse:334.874	eval-MAE:221.391	train-MAE:210.213
[200]	eval-rmse:345.116	train-rmse:330.57	eval-MAE:221.018	train-MAE:205.144
[250]	eval-rmse:344.557	train-rmse:327.662	eval-

In [37]:
parc_data_hour_train_test["xgb_all"]=np.nan    
xgdtrain = xgb.DMatrix(parc_data_hour_train_train[list_col_model], parc_data_hour_train_train[col_target]) 
xgdtest = xgb.DMatrix(parc_data_hour_train_test[list_col_model], parc_data_hour_train_test[col_target])
evallist  = [(xgdtest,'eval'), (xgdtrain,'train')]
print("mae optimiser ")
xgb_model = xgb.train(params = our_params ,dtrain = xgdtrain, evals=evallist, num_boost_round = 150,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
parc_data_hour_train_test["xgb_all"] = xgb_model.predict(xgdtest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


mae optimiser 
[0]	eval-rmse:557.861	train-rmse:622.014	eval-MAE:362.725	train-MAE:423.529
[10]	eval-rmse:392.018	train-rmse:422.162	eval-MAE:254.072	train-MAE:278.433
[20]	eval-rmse:345.21	train-rmse:358.369	eval-MAE:222.067	train-MAE:227.508
[30]	eval-rmse:322.623	train-rmse:323.34	eval-MAE:206.786	train-MAE:198.949
[40]	eval-rmse:309.835	train-rmse:300.468	eval-MAE:198.621	train-MAE:180.689
[50]	eval-rmse:302.614	train-rmse:284.37	eval-MAE:194.372	train-MAE:167.972
[60]	eval-rmse:298.287	train-rmse:272.518	eval-MAE:192.065	train-MAE:158.72
[70]	eval-rmse:295.573	train-rmse:263.614	eval-MAE:190.857	train-MAE:151.774
[80]	eval-rmse:293.701	train-rmse:256.259	eval-MAE:190.16	train-MAE:146.174
[90]	eval-rmse:292.582	train-rmse:250.436	eval-MAE:189.857	train-MAE:141.823
[100]	eval-rmse:291.853	train-rmse:245.673	eval-MAE:189.798	train-MAE:138.34
[110]	eval-rmse:291.241	train-rmse:242.064	eval-MAE:189.73	train-MAE:135.679
[120]	eval-rmse:290.906	train-rmse:238.918	eval-MAE:189.801	train-M

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [45]:
mean_absolute_error(parc_data_hour_train_test[col_target],(0.3*parc_data_hour_train_test["xgb_all"]+1.7*parc_data_hour_train_test["xgb_site"])/2)

182.94499119380373

In [49]:
parc_data_hour_test.fc_hor_10.drop_duplicates()

189500    23.0
189508    24.0
189519    25.0
189530    26.0
189541    27.0
189552    28.0
189563    29.0
189574    30.0
189585    31.0
189596    32.0
189607    33.0
189618    34.0
189629    35.0
189640    36.0
189651    37.0
189662    38.0
189673    39.0
189684    40.0
189695    41.0
189706    42.0
189717    43.0
189728    44.0
189739    45.0
189750    46.0
Name: fc_hor_10, dtype: float64

In [50]:
# second validation
res = parc_data_hour_test[["Date_hour_int","Eolienne"]]
res["pred"] =np.nan

for eol_name in eolienne_list:
    Eolienne_index = parc_data_hour_train["Eolienne"]==eol_name
    xgdmat = xgb.DMatrix(parc_data_hour_train[Eolienne_index][list_col_model], parc_data_hour_train[Eolienne_index][col_target])
    xgb_model = xgb.train(params = our_params ,dtrain = xgdmat, num_boost_round = 250,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
    
    Eolienne_index = parc_data_hour_test["Eolienne"]==eol_name
    res["pred"][Eolienne_index]=xgb_model.predict(xgb.DMatrix(parc_data_hour_test[Eolienne_index][list_col_model]))
save_adr = "..//submission//sub_09_15_3.csv"
submission_generation(res,save_adr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_ob

In [51]:
res = parc_data_hour_test[["Date_hour_int","Eolienne"]]
res["pred"] =np.nan

xgdmat = xgb.DMatrix(parc_data_hour_train[list_col_model], parc_data_hour_train[col_target])
xgb_model = xgb.train(params = our_params ,dtrain = xgdmat, num_boost_round = 150,verbose_eval=10,obj=fair_obj,feval=mean_absolute_err)
res["pred"]=xgb_model.predict(xgb.DMatrix(parc_data_hour_test[list_col_model]))

save_adr = "..//submission//sub_09_15_all.csv"
submission_generation(res,save_adr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [52]:
submission_1 = pd.read_csv("..//submission//sub_09_15_3.csv",sep=";")
submission_2 = pd.read_csv("..//submission//sub_09_15_all.csv",sep=";")

In [54]:
submission_1["pred"] =0.85*submission_1["pred"] + 0.15*submission_2["pred"]

In [55]:
submission_1.to_csv("..//submission//sub_09_15_combine.csv",sep=';',header=True,index = False)