### IMPORT

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
pd.set_option("display.max_columns",100)
from datetime import datetime

### LOAD DATASET

In [2]:
### a function to load (and concatene) informations from Parc_XX.csv
def create_df_parc_data(list_num_parc,list_date_parc):
    df_parc_data = pd.DataFrame()
    for num_parc in list_num_parc:
        for date_parc in list_date_parc:
            df_parc_data = df_parc_data.append(pd.read_csv("../data/Parc%s_%s.csv"%(num_parc,date_parc),sep=";",decimal=','),ignore_index=True)
    df_parc_data["Date"] = pd.to_datetime(df_parc_data["Date"], format = "%d/%m/%Y %H:%M")
    return df_parc_data

In [3]:
list_num_parc = [1,2,3]
list_date_parc = ["2015","2016","2017"]

In [4]:
df_parc_data = create_df_parc_data(list_num_parc,list_date_parc)

##### we create the Production_mean_hour variable : mean production per hour

In [5]:
### we create an ident for each hour "Date_hour_int"
df_parc_data["Date_hour_int"] = df_parc_data["Date"].dt.year*10**6  + df_parc_data["Date"].dt.month*10**4 +\
df_parc_data["Date"].dt.day*10**2 + df_parc_data["Date"].dt.hour
#### we create a dataframe with "production_mean_hour" value for each Eolienne*date_hour_int
df_product_mean = df_parc_data[df_parc_data["Fonctionnement"]==1].groupby(["Eolienne","Date_hour_int"])["Production"]\
                            .mean().reset_index().rename(columns={"Production":"Production_mean_hour"})

##### we add this value in the initial dataset "df_parc_data"
df_parc_data = pd.merge(df_parc_data,df_product_mean,on=["Eolienne","Date_hour_int"],how="left")

####  WEATHER's DATASET from prevMeteo

In [6]:
### a function to load (and concatene) informations from PrevMeteo_GrilleXX.xlsx
def create_df_meteo_from_list_grille(list_grille):
    df_meteo  = pd.DataFrame()
    for grille_C in list_grille:
        df_meteo_tmp =  pd.read_excel('../data/PrevMeteo_Grille%s.xlsx'%(grille_C),sep=';')
        df_meteo_tmp["grille"] = grille_C
        df_meteo = df_meteo.append(df_meteo_tmp,ignore_index=True)
    df_meteo["date"] = pd.to_datetime(df_meteo["date"],format = "%Y-%m-%d %H:%M:%S")
    df_meteo.rename(columns= {"date":"Date"},inplace=True)
    return df_meteo

In [7]:
# for weather's data,we can take the value of 16 different "grille" 
# some more data are available but with a very shalow history and will not be used for that model
list_grille = range(1,17)
df_meteo = create_df_meteo_from_list_grille(list_grille)

In [8]:
df_meteo.columns

Index(['Date', 'fc_hor', 'RS', 'CAPE', 'SP', 'CP', 'BLD', 'SSHF', 'SLHF',
       'MSL', 'BLH', 'TCC', 'U10', 'V10', 'T2', 'D2', 'SSRD', 'STRD', 'SSR',
       'STR', 'TSR', 'LCC', 'MCC', 'HCC', 'TSRC', 'SSRC', 'STRC', 'TP', 'FA',
       'U100', 'V100', 'vit_100', 'vit_10', 'dir_100', 'dir_10', 'grille'],
      dtype='object')

In [9]:
### the aim of this challenge is to predict the Production for tomorrow. 
# so, we keep only forecast (fc_hor) between 24h and 47h (Tomorrow's Forecast)
df_meteo = df_meteo[(df_meteo["fc_hor"]>=24) & (df_meteo["fc_hor"]<=47)].copy()

### we create df_all (concatenation of train and test)
it's the dataset for the model.

In [17]:
### we keep only one row per hour (minute =0)
### we keep only one row where the wind turbine is working (Fonctionnement =1)
### we keep only usefull columns
list_col_parc_data_to_keep = ["Date","Eolienne","Fonctionnement","Production_mean_hour"]
df_all = df_parc_data[(df_parc_data["Date"].dt.minute == 0) & (df_parc_data["Fonctionnement"] == 1)]\
[list_col_parc_data_to_keep].copy()

In [18]:
print (df_all.shape)
df_all.head(3)

(215158, 4)


Unnamed: 0,Date,Eolienne,Fonctionnement,Production_mean_hour
0,2015-01-01 00:00:00,Turb3,1,121.923333
60,2015-01-01 01:00:00,Turb3,1,210.081667
120,2015-01-01 02:00:00,Turb3,1,255.016667


#### we add weather's variable

In [12]:
#df_all.set_index('Date', inplace=True)

In [19]:
df_all = pd.merge(df_all, df_meteo, on='Date')

In [20]:
df_all.head(2)

Unnamed: 0,Date,Eolienne,Fonctionnement,Production_mean_hour,fc_hor,RS,CAPE,SP,CP,BLD,SSHF,SLHF,MSL,BLH,TCC,U10,V10,T2,D2,SSRD,STRD,SSR,STR,TSR,LCC,MCC,HCC,TSRC,SSRC,STRC,TP,FA,U100,V100,vit_100,vit_10,dir_100,dir_10,grille
0,2015-01-02,Turb3,1,789.3785,24,2739200.0,0.0,101224.6875,0.0,11.850252,46.509167,-2.275556,102914.3125,433.048706,1.0,4.680779,5.947447,3.927148,1.40542,0.0,285.716675,0.066597,-40.097778,0.0,0.176422,0.951355,1.0,0.0,0.0,-51.831112,0.0,0.156924,8.737968,9.557012,12.949462,7.568475,222.436646,218.203568,1
1,2015-01-02,Turb3,1,789.3785,24,2761600.0,0.0,101517.1875,0.0,10.392474,46.0825,-2.417778,102930.4375,414.642456,1.0,4.049919,5.698912,3.630762,0.892236,0.0,281.876678,0.066597,-42.444443,0.0,0.147736,0.869507,1.0,0.0,0.0,-52.115555,0.0,0.140978,7.971367,9.275762,12.230391,6.991383,220.674973,215.399414,2


#### we keep only value after 2015_1_3 to avoid missing values

In [22]:
df_all = df_all[df_all['Date'] > datetime(2015,1,3)].copy()
df_all.sort_values('Date', ascending=1, inplace=True)

### MACHINE LEARNING


#### we create the train/val split
The objective is now to create a dataset to train a model and a dataset to evaluate our model.

In [23]:
date_split_val = datetime(2016,1,1)
date_split_test = datetime(2017,1,1)

In [25]:
df_all_train = df_all[df_all['Date'] < date_split_val]
df_all_val = df_all[(df_all['Date'] >= date_split_val) & (df_all['Date'] < date_split_test)]

#### for the moment, we keep all variables

In [34]:
list_col_model = list(df_all_train.columns[5:])
col_target = "Production_mean_hour"

#### for this notebook, we will use a linear model

In [None]:
#from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import Lasso
#from sklearn.linear_model import ElasticNet
# check : from sklearn.linear_model import ElasticNetCV

In [None]:
#model = LinearRegression()
#model = Lasso(alpha=0.1, copy_X=False, normalize=True, max_iter=10000, precompute=True)
#model = ElasticNet(alpha = 0.1, l1_ratio=1, normalize=True, precompute=True, max_iter=10000, copy_X=False, tol=0.0001)

#### Fit one model by turbine on the train dataset (2015)
A lasso is performed first in order to perform an effective feature selection

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_regression

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

turbines = ['Turb1', 'Turb2', 'Turb3', 'Turb4', 'Turb5', 'Turb6', 'Turb7', 'Turb8', 'Turb9', 'Turb10', 'Turb11']
# turbines = ['Turb1']

# create pipeline dictionnary with key is turbine and value is a a dictionnary of models
models = {turbine :  
              {grid : Pipeline([('Lasso', Lasso(alpha=0.1, copy_X=False, normalize=True, max_iter=1000))]) for grid in list_grille} 
          for turbine in turbines}


In [35]:
# train models
for turbine, full_grid in models.items():
    for grid, model in full_grid.items():
        train = df_all_train[(df_all_train["Eolienne"] == turbine) & (df_all_train["grille"] == grid)]
        model.fit(train[list_col_model], train[col_target])

In [53]:
print(models['Turb1'][0].steps[0][1].sparse_coef_)

  (0, 0)	3.29512745268e-06
  (0, 1)	0.0829210052541
  (0, 4)	15.548374094
  (0, 5)	0.0782235463827
  (0, 7)	0.00619424507003
  (0, 8)	0.173629355961
  (0, 9)	-80.6499168342
  (0, 10)	0.233977501351
  (0, 13)	-6.13744236483
  (0, 18)	-0.0666283973745
  (0, 19)	-5.27913207904
  (0, 23)	-0.14005908756
  (0, 28)	-5.54358815751
  (0, 29)	65.6724343431
  (0, 30)	75.1066491187
  (0, 32)	-0.0997552826882


In [72]:
# predict
list_col_pred_val = list(set(["Date", "Fonctionnement", "Eolienne", col_target] + list_col_model))
col_pred = ["pred" + str(i) for i in range(1,17)]

In [75]:
#prediction has to be between 0 and 2000
df_pred_val = df_all_val.loc[df_all_val["grille"] == 1, ["Date", "Eolienne", col_target] + col_pred].copy() 

for turbine, full_grid in models.items():
    print("Processing turbine: " + turbine)
    for grid, model in full_grid.items():
        df_all_val_chunk = df_all_val.loc[(df_all_val["Eolienne"] == turbine) & (df_all_val["grille"] == grid), list_col_pred_val]
        pred = list(map(lambda x : max(min(x, 2000), 0), models[turbine][grid].predict(df_all_val_chunk[list_col_model])))
        df_pred_val.loc[df_pred_val["Eolienne"] == turbine, ["pred" + str(grid)]] = pred

Processing turbine: Turb1
Processing turbine: Turb2
Processing turbine: Turb10
Processing turbine: Turb9
Processing turbine: Turb8
Processing turbine: Turb3
Processing turbine: Turb4
Processing turbine: Turb6
Processing turbine: Turb11
Processing turbine: Turb5
Processing turbine: Turb7


In [77]:
df_pred_val.head()

Unnamed: 0,Date,Eolienne,Production_mean_hour,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9,pred10,pred11,pred12,pred13,pred14,pred15,pred16
2592512,2016-01-01,Turb7,389.1995,309.686575,336.09239,369.096522,414.791516,309.315444,334.475622,364.050056,401.245802,292.597004,335.157724,376.961963,413.208024,288.792239,325.423222,365.847304,401.840672
2592528,2016-01-01,Turb8,432.57,336.564677,361.530871,394.430507,439.50474,336.905985,359.712924,389.049338,430.007314,319.982892,364.945156,407.105418,443.881771,313.518677,353.814072,396.259104,431.950516
2592480,2016-01-01,Turb5,467.763833,345.456277,371.327223,404.390416,449.853625,343.604738,367.499378,397.050934,436.498841,318.238827,365.090388,410.615462,450.189407,312.792606,352.180834,394.396052,434.715373
2592544,2016-01-01,Turb1,587.974,466.084997,488.889537,521.620466,566.975011,463.293526,482.973778,511.563319,554.176552,431.956213,484.232679,530.80182,571.719633,425.685811,469.397448,515.163031,559.132011
2592496,2016-01-01,Turb6,394.706167,326.413487,352.175229,385.186196,430.209832,325.303035,349.419467,378.981023,416.567246,304.397233,349.185369,392.994612,428.427899,301.282599,341.40814,382.35643,417.017145


In [78]:
from sklearn.metrics import mean_absolute_error
import numpy as np

In [88]:
#Benchmark is 259.4
#get best pred for each model
best_grid = [(turbine, np.argmin([mean_absolute_error(df_pred_val.loc[df_pred_val["Eolienne"] == turbine, [col_target]], df_pred_val.loc[df_pred_val["Eolienne"] == turbine, [pred]]) for pred in col_pred])) for turbine in turbines]

In [105]:
#refactor: work on several small files rather than one big
for (turbine, grid) in best_grid:
    df_pred_val.loc[df_pred_val["Eolienne"] == turbine, ["pred"]] = df_pred_val.loc[df_pred_val["Eolienne"] == turbine, ["pred"+str(grid)]].values

##### preprocessing

In [None]:
#could be helpful to normalize data before if variable multiplication are used in the model
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(3)

In [None]:
#Model should be rebuilt to treat each grid file separatly and then blending step will agregate all models together
#log transform (or other) should be tested (hyper-parameter)

#reprocess import phase and slice data by turbine / grid
#then construct dictionnary of pipeline
#try a PCA in the pipeline
#ensemble : see how to select/blend models
#try XGBoost


X_train = poly.fit_transform(df_all_train.loc[df_all_train["Eolienne"] == "Turb1", list_col_model])

In [None]:
X_train[0:2,:]

In [None]:
X_test = poly.fit_transform(df_all_val.loc[df_all_val["Eolienne"] == "Turb1", list_col_model])

#### we evaluate our prediction

In [None]:
from sklearn.metrics import mean_absolute_error
import numpy as np

In [107]:
#Benchmark is 259.4
mean_absolute_error(df_pred_val[col_target],df_pred_val["pred"])

209.22220184065762

In [None]:
df_pred_val[["Eolienne", col_target, "pred"]]

In [None]:
eolienne_C = "Turb2"
df_pred_val[df_pred_val["Eolienne"] == eolienne_C][["Production_mean_hour", "pred"]].iloc[0:300].plot(figsize=(16,4))

#### we predict on the test

In [108]:
df_all_test = df_all[(df_all["Date"]>=date_split_test)].copy()

In [109]:
list_col_pred_test = list(set(["Date","Eolienne"]+list_col_model))
df_pred_test = df_all_test[list_col_pred_test].copy()

In [110]:
# TODO : here predict for test file based to learned model
df_pred_test["pred"] = model.predict(df_all_test[list_col_model])

##### we create the submit_file

In [111]:
df_pred_test[["Date","Eolienne","pred"]].to_csv("../data/submit_benchmark.csv",sep=";",index=False)

In [112]:
pd.read_csv("../data/submit_benchmark.csv",sep=";",nrows=5)

Unnamed: 0,Date,Eolienne,pred
0,2017-01-01 00:00:00,Turb8,178.861753
1,2017-01-01 00:00:00,Turb8,93.445708
2,2017-01-01 00:00:00,Turb8,82.749913
3,2017-01-01 00:00:00,Turb8,65.066844
4,2017-01-01 00:00:00,Turb8,76.702268
