Predict target values 30 days ahead (n_future) of a given date, using the preceding 60 days (n_past) of data:

In [None]:
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import numpy as np

# Notebook-wide display settings: plots are drawn without a grid and
# DataFrames are printed without truncating rows or columns.
#mpl.rcParams['figure.figsize'] = (10, 8)
mpl.rcParams.update({'axes.grid': False})
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Directory holding the input CSV files; one path variable per water body.
data_dir = '../input/acea-water-prediction'

# Aquifers
fname_Auser = os.path.join(data_dir, 'Aquifer_Auser.csv')
fname_Doganella = os.path.join(data_dir, 'Aquifer_Doganella.csv')
fname_Luco = os.path.join(data_dir, 'Aquifer_Luco.csv')
fname_Petrignano = os.path.join(data_dir, 'Aquifer_Petrignano.csv')
# Lake and river
fname_Bilancino = os.path.join(data_dir, 'Lake_Bilancino.csv')
fname_Arno = os.path.join(data_dir, 'River_Arno.csv')
# Water springs
fname_Amiata = os.path.join(data_dir, 'Water_Spring_Amiata.csv')
fname_Lupa = os.path.join(data_dir, 'Water_Spring_Lupa.csv')
fname_Madonna_di_Canneto = os.path.join(
    data_dir, 'Water_Spring_Madonna_di_Canneto.csv'
)

# reading files
def file_read(fname):
    """Read one water-body CSV and parse its ``Date`` column.

    Parameters
    ----------
    fname : str or path-like
        CSV file containing a ``Date`` column formatted as ``dd/mm/YYYY``.

    Returns
    -------
    pandas.DataFrame
        The file contents with the parsed dates stored in a ``date`` column
        placed first; the raw ``Date`` column is removed.
    """
    df = pd.read_csv(fname)
    # Parse explicitly rather than through read_csv(parse_dates={...},
    # date_parser=...): both spellings are deprecated in pandas 2.x.
    parsed_dates = pd.to_datetime(df.pop('Date'), format="%d/%m/%Y")
    df.insert(0, 'date', parsed_dates)
    return df


In [None]:
# Load every water body into its own DataFrame.
# Aquifers
df_Auser = file_read(fname_Auser)
df_Doganella = file_read(fname_Doganella)
df_Luco = file_read(fname_Luco)
df_Petrignano = file_read(fname_Petrignano)
# Lake and river
df_Bilancino = file_read(fname_Bilancino)
df_Arno = file_read(fname_Arno)
# Water springs
df_Lupa = file_read(fname_Lupa)
df_Madonna_di_Canneto = file_read(fname_Madonna_di_Canneto)
df_Amiata = file_read(fname_Amiata)


# Target (label) columns to predict for each water body.
label_cols_Auser = [
    'Depth_to_Groundwater_SAL',
    'Depth_to_Groundwater_CoS',
    'Depth_to_Groundwater_LT2',
]
label_cols_Doganella = [
    'Depth_to_Groundwater_Pozzo_1',
    'Depth_to_Groundwater_Pozzo_2',
    'Depth_to_Groundwater_Pozzo_3',
    'Depth_to_Groundwater_Pozzo_4',
    'Depth_to_Groundwater_Pozzo_5',
    'Depth_to_Groundwater_Pozzo_6',
    'Depth_to_Groundwater_Pozzo_7',
    'Depth_to_Groundwater_Pozzo_8',
    'Depth_to_Groundwater_Pozzo_9',
]
label_cols_Petrignano = [
    'Depth_to_Groundwater_P24',
    'Depth_to_Groundwater_P25',
]
label_cols_Luco = ['Depth_to_Groundwater_Podere_Casetta']

label_cols_Madonna_di_Canneto = ['Flow_Rate']
label_cols_Lupa = ['Flow_Rate']
# NOTE(review): ' Flow_Rate_Ermicciolo' starts with a space; confirm that
# this matches the CSV header before using this list.
label_cols_Amiata = [
    'Flow_Rate_Bugnano',
    'Flow_Rate_Arbure',
    ' Flow_Rate_Ermicciolo',
    'Flow_Rate_Galleria_Alta',
]

label_cols_Bilancino = ['Lake_Level', 'Flow_Rate']

label_cols_Arno = ['Hydrometry_Nave_di_Rosano']


In [None]:
def df_prep_xgb(df_waterbody,label_cols,max_lag=90,cols_to_drop=None):
    """Build the lag-feature table used to train the XGBoost models.

    The input is copied, short gaps are linearly interpolated (up to 14
    consecutive values from each side of a gap), and the longest run of
    consecutive rows in which every target column is present is kept.
    Lagged copies of every column and calendar features are then added.

    Parameters
    ----------
    df_waterbody : pandas.DataFrame
        Raw data with a ``date`` column holding datetimes.
    label_cols : list of str
        Target columns; they become the first columns of the result.
    max_lag : int, default 90
        Number of lagged copies (``<col>_lag1`` .. ``<col>_lag<max_lag>``)
        created for every column. The first ``max_lag`` rows, whose lags are
        incomplete, are dropped from the result.
    cols_to_drop : list of str, optional
        Columns removed before any other processing.

    Returns
    -------
    (pandas.DataFrame, dict)
        The prepared table indexed by ``date``, and a dict mapping each
        original column to the number of zero values it has in the kept run.

    Raises
    ------
    ValueError
        If no run of at least 365 consecutive complete rows exists.
    """
    df = df_waterbody.copy()
    if cols_to_drop is not None:
        df = df.drop(cols_to_drop, axis=1)
    label_cols = list(label_cols)

    # Fill short gaps only; longer gaps stay NaN and split the series.
    df = df.interpolate(method='linear', limit_direction='both', limit=14)

    # Give every run of consecutive complete rows its own id: the id grows by
    # one for each row that still has a missing target value.
    has_gap = df[label_cols].isnull().any(axis=1)
    run_id = has_gap.cumsum() + 1
    run_sizes = run_id[~has_gap].value_counts()
    run_sizes = run_sizes[run_sizes >= 365]
    if run_sizes.empty:
        raise ValueError(
            "no run of at least 365 consecutive rows with all label columns "
            "present was found"
        )
    # value_counts sorts by size, so the first entry is the longest run.
    longest_run = run_sizes.index[0]
    df = df[(run_id == longest_run) & ~has_gap]
    df = df.set_index("date")

    # Count zeros per column (NaN is truthy, so it is not counted as zero).
    num_zeroes = {var: len(df) - df[var].astype(bool).sum() for var in df.columns}

    # Lag features, built in one concat to avoid fragmenting the frame with
    # thousands of single-column insertions.
    lagged = [
        df[var].shift(t).rename(var + '_lag' + str(t))
        for var in list(df.columns)
        for t in range(1, max_lag + 1)
    ]
    if lagged:
        df = pd.concat([df] + lagged, axis=1)

    # Calendar features derived from the date index.
    dates = df.index
    df['month'] = dates.month
    df['day'] = dates.day
    week_number = dates.isocalendar().week.astype(float).to_numpy()
    df['Week_Number'] = pd.to_numeric(week_number, downcast='integer')
    # dayofweek: Monday=0 .. Sunday=6, so 5 and 6 are the weekend.
    df['weekend'] = (dates.dayofweek >= 5).astype('int64')

    # Put the target columns first, keeping the remaining column order.
    targets = set(label_cols)
    other_cols = [col for col in df.columns if col not in targets]
    df = df[label_cols + other_cols]

    # The first max_lag rows have incomplete lag values.
    df = df.iloc[max_lag:].copy()

    return df,num_zeroes

In [None]:
# Build the lag-feature table for Lake Bilancino with 60 lags per column.
df_prep_Bilancino,num_zeroes_Bilancino=df_prep_xgb(df_Bilancino,label_cols_Bilancino,max_lag=60)

In [None]:
# Notebook display: (rows, columns) of the prepared Bilancino table.
df_prep_Bilancino.shape
#num_zeroes_Bilancino

In [None]:
def X_y_train_test(ts_waterbody,feature_columns,label_columns,n_future):
    """Pair each feature row with the labels ``n_future`` rows later and split.

    The features are taken from all rows except the last ``n_future``; the
    labels are taken from all rows except the first ``n_future``. Row k of
    the features is therefore matched by position with the labels found
    ``n_future`` rows later. The first 80% of the pairs form the training
    set and the rest the test set (no shuffling).

    Parameters
    ----------
    ts_waterbody : pandas.DataFrame
        Prepared table whose index holds the dates.
    feature_columns : list of str
        Columns used as model inputs.
    label_columns : list of str
        Columns used as model targets.
    n_future : int
        Number of rows between a feature row and its label row.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test)``, each indexed by ``date``
        (the date of the row the values were taken from).
    """
    frame = ts_waterbody.copy()
    # Keep the dates as a regular column so they survive the positional split.
    frame['date'] = frame.index
    frame = frame.reset_index(drop=True)

    n_pairs = len(frame) - n_future
    feature_part = frame.iloc[:n_pairs]
    label_part = frame.tail(n_pairs)

    X = feature_part.reindex(columns=[*feature_columns, 'date']).set_index("date")
    y = label_part.reindex(columns=[*label_columns, 'date']).set_index("date")

    # Chronological 80/20 split.
    n_train = int(0.8 * len(X))
    n_test = len(X) - n_train

    return X.head(n_train), y.head(n_train), X.tail(n_test), y.tail(n_test)
        


In [None]:
# Split Bilancino into train/test sets, using every prepared column as a
# feature and predicting the label columns 30 rows ahead (n_future=30).
X_train_Bilancino,y_train_Bilancino,X_test_Bilancino,y_test_Bilancino=X_y_train_test(df_prep_Bilancino,df_prep_Bilancino.columns.to_list(),label_cols_Bilancino,30)

In [None]:
# Report the (rows, columns) shape of each part of the split.
print("shape of X_train_Bilancino: {}".format(X_train_Bilancino.shape))
print("shape of y_train_Bilancino: {}".format(y_train_Bilancino.shape))
print("shape of X_test_Bilancino: {}".format(X_test_Bilancino.shape))
print("shape of y_test_Bilancino: {}".format(y_test_Bilancino.shape))

In [None]:
def mean_absolute_percentage_error_2(y_true, y_pred): 
    """Return the mean absolute percentage error (in percent), tolerating zeros.

    Per-element rules:

    * ``y_true == 0`` and ``y_pred == 0``: the error is 0.
    * ``y_true == 0`` and ``y_pred != 0``: ``y_true`` is treated as
      ``0.00001`` so the ratio stays finite.
    * otherwise: ``|y_true - y_pred| / |y_true|``.

    Each element contributes exactly once and the inputs are never modified.
    (The previous loop wrote ``0.00001`` into ``y_true`` and then matched a
    second branch, counting that element twice.)

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values with the same number of elements;
        both are flattened before comparison.

    Returns
    -------
    float
        Mean absolute percentage error, multiplied by 100.
    """
    # asarray(dtype=float) also protects integer input from truncating the
    # 0.00001 substitute to 0; ravel lets (n, 1) and (n,) inputs mix.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    actual_is_zero = actual == 0
    safe_actual = np.where(actual_is_zero, 0.00001, actual)
    relative_error = (safe_actual - predicted) / safe_actual
    # A zero prediction for a zero actual value is a perfect hit.
    relative_error = np.where(actual_is_zero & (predicted == 0), 0.0, relative_error)

    return np.mean(np.abs(relative_error)) * 100
            

In [None]:
from sklearn.datasets import make_regression
from xgboost import XGBRegressor
#from sklearn.metrics import mean_absolute_percentage_error
from sklearn.feature_selection import SelectFromModel
from numpy import sort

def feature_selection(X_train_waterboby,y_train_waterboby,X_test_waterboby,y_test_waterboby):
    """Pick, per target, the feature subset with the lowest test-set MAPE.

    For each target column an ``XGBRegressor`` is fitted on all features.
    Every feature importance is then used in turn as a ``SelectFromModel``
    threshold: a fresh model is trained on the selected features and scored
    on the test set with ``mean_absolute_percentage_error_2``. The features
    from the best-scoring threshold upwards (in ascending importance order)
    are kept.

    NOTE(review): the subset is chosen by its score on the test set, so a
    MAPE later reported on that same test set is optimistic.

    Parameters
    ----------
    X_train_waterboby, X_test_waterboby : pandas.DataFrame
        Feature tables.
    y_train_waterboby, y_test_waterboby : pandas.DataFrame
        Target tables, one column per target.

    Returns
    -------
    dict
        Maps each target column to a DataFrame with the columns ``columns``
        (feature name), ``importances``, ``mape`` and ``threshold``, sorted
        by ascending importance and cut at the best threshold.
    """
    X_train = X_train_waterboby.copy()
    y_train = y_train_waterboby.copy()
    X_test = X_test_waterboby.copy()
    y_test = y_test_waterboby.copy()

    imp_features = {}
    for target in y_train.columns:
        y_train_target = y_train.loc[:, target]

        # Reference model: supplies the importances and the selector.
        model = XGBRegressor()
        model.fit(X_train, y_train_target)

        results = pd.DataFrame()
        results['columns'] = X_train.columns
        results['importances'] = model.feature_importances_
        results.sort_values(by='importances', ascending=True, inplace=True)
        results = results.reset_index(drop=True)

        # Row j of `results` corresponds to the j-th smallest importance.
        thresholds = np.sort(model.feature_importances_)
        mape_by_threshold = {}
        mapes = []
        for thresh in thresholds:
            key = float(thresh)
            # Equal thresholds select the same features, so the model is
            # trained once per distinct value (many importances are equal).
            if key not in mape_by_threshold:
                selection = SelectFromModel(model, threshold=thresh, prefit=True)
                select_X_train = selection.transform(X_train)
                selection_model = XGBRegressor()
                selection_model.fit(select_X_train, y_train_target)
                select_X_test = selection.transform(X_test)
                y_pred = selection_model.predict(select_X_test)
                # np.array(...) copies, so the scorer cannot alter y_test.
                mape_by_threshold[key] = mean_absolute_percentage_error_2(
                    np.array(y_test.loc[:, target]), np.array(y_pred)
                )
            mapes.append(mape_by_threshold[key])

        # Assign whole float columns at once instead of writing floats into
        # integer-initialised columns cell by cell.
        results['mape'] = np.asarray(mapes, dtype=float)
        results['threshold'] = thresholds.astype(float)

        # Keep the rows from the best threshold to the most important feature.
        best_row = results['mape'].idxmin()
        imp_features[target] = results.iloc[best_row:]

    return imp_features


In [None]:
# Run the per-target feature selection on the Bilancino train/test split.
imp_features_Bilancino=feature_selection(X_train_Bilancino,y_train_Bilancino,X_test_Bilancino,y_test_Bilancino)

In [None]:
def feature_lag_table(my_dict,target):
    """Summarise which lags of each base feature were selected for a target.

    Selected column names of the form ``<feature>_lag<N>`` are split into the
    base feature and its lag; any other name is a base feature with lag 0.

    A name counts as lagged only when it ends in ``_lag`` followed by digits,
    so names that merely contain the letters "lag" are handled correctly.
    The input DataFrame is not modified.

    Parameters
    ----------
    my_dict : dict
        Maps target names to DataFrames that have a ``columns`` column
        holding the selected feature names.
    target : str
        Key of the entry in ``my_dict`` to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per base feature (sorted by name) with the columns
        ``features`` and ``lag``; ``lag`` holds an array of the distinct
        lags in order of appearance (lag numbers as strings, ``0`` for the
        unlagged column).
    """
    features = []
    lags = []
    for name in my_dict[target]['columns']:
        base, separator, suffix = str(name).rpartition('_lag')
        if separator and suffix.isdigit():
            features.append(base)
            lags.append(suffix)
        else:
            features.append(name)
            lags.append(0)

    # dtype=object keeps the string lag numbers and the integer 0 as they are.
    df_1 = pd.DataFrame({'features': features,
                         'lag_num': pd.Series(lags, dtype=object)})
    df_2 = df_1.groupby('features')['lag_num'].unique()

    df_3 = pd.DataFrame()
    df_3['features'] = df_2.index
    df_3['lag'] = df_2.values
    return df_3

In [None]:
# Build one feature/lag summary per Bilancino target and save each as
# "<target>_features.csv" in the working directory.
df_i = {}
for i, target_name in enumerate(label_cols_Bilancino):
    lag_table = feature_lag_table(imp_features_Bilancino, target_name)
    df_i[target_name] = lag_table
    file_name_csv = target_name + "_features.csv"
    lag_table.to_csv(file_name_csv, index=False)

        


In [None]:
# Notebook display: name of the second Bilancino target.
label_cols_Bilancino[1]    

In [None]:
# Notebook display: feature/lag summary for the second Bilancino target.
df_i[label_cols_Bilancino[1]]


In [None]:
# Notebook display: name of the first Bilancino target.
label_cols_Bilancino[0]

In [None]:
# Notebook display: feature/lag summary for the first Bilancino target.
df_i[label_cols_Bilancino[0]]

In [None]:
def predict_test(df_prep_waterbody,imp_feature_dict,target_columns,n_future,figsize=(14,10)):
    """Train one model per target on its selected features and plot the test fit.

    For every target the selected feature names are read from
    ``imp_feature_dict``, the data is split with ``X_y_train_test``, an
    ``XGBRegressor`` is fitted, and actual (blue) versus predicted (red)
    test values are drawn in that target's subplot, with the MAPE shown in
    the title. The shapes of the four split parts are printed.

    Parameters
    ----------
    df_prep_waterbody : pandas.DataFrame
        Prepared lag-feature table indexed by date.
    imp_feature_dict : dict
        Maps each target to a DataFrame whose ``columns`` column lists the
        selected feature names.
    target_columns : list of str
        Targets to model; one subplot is drawn per target.
    n_future : int
        Forecast horizon passed to ``X_y_train_test``.
    figsize : tuple, default (14, 10)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    numpy.ndarray
        1-D array of the matplotlib Axes, one per target.
    """
    number_of_subplots=len(target_columns)
    fig, axes = plt.subplots(nrows=number_of_subplots, ncols=1, figsize=figsize)
    # plt.subplots returns a bare Axes (not an array) for a single subplot;
    # normalise so that one-target water bodies work as well.
    axes = np.atleast_1d(axes)
    plt.subplots_adjust(hspace = 0.8) 
    for ax, target in zip(axes, target_columns):
        df_features = imp_feature_dict[target]
        selected_columns = df_features['columns'].to_list()
        out_col = [target]
        X_train,y_train,X_test,y_test=X_y_train_test(df_prep_waterbody,selected_columns,out_col,n_future)
        datelist_y_test = y_test.index
        print("shape of X_train: {}".format(X_train.shape))
        print("shape of y_train: {}".format(y_train.shape))
        print("shape of X_test: {}".format(X_test.shape))
        print("shape of y_test: {}".format(y_test.shape))
        model = XGBRegressor()
        model.fit(X_train, y_train)
        # eval model
        y_pred = model.predict(X_test)
        # np.array(...) copies, so scoring cannot alter the values plotted
        # below; ravel gives both inputs the same 1-D shape.
        mape = mean_absolute_percentage_error_2(np.array(y_test).ravel(), np.array(y_pred).ravel())
        ax.xaxis.set_tick_params(rotation=45)
        ax.plot(datelist_y_test, y_test, 'b')
        ax.plot(datelist_y_test, y_pred, 'r')
        ax.legend(['actual', 'prediction'], loc='upper left')
        ax.set_title(target+"  MAPE="+str(mape))

    return axes

In [None]:
# Plot actual vs. predicted test values for both Bilancino targets
# (30-row horizon) and save the figure.
predict_test(df_prep_Bilancino,imp_features_Bilancino,label_cols_Bilancino,30,figsize=(14,8))
plt.savefig('predict_test_Bilancino.jpeg')

In [None]:
def feature_importance_plot(imp_feature_dict,target_columns,num_feature,figsize=(14,10)):
    """Draw a bar chart of the most important selected features per target.

    Parameters
    ----------
    imp_feature_dict : dict
        Maps each target to a DataFrame with ``columns`` (feature names) and
        ``importances`` columns; the last rows are the ones plotted.
    target_columns : list of str
        Targets to plot; one subplot is drawn per target.
    num_feature : int
        Number of trailing rows (features) shown in each subplot.
    figsize : tuple, default (14, 10)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    numpy.ndarray
        1-D array of the matplotlib Axes, one per target.
    """
    number_of_subplots=len(target_columns)
    fig, axes = plt.subplots(nrows=number_of_subplots, ncols=1, figsize=figsize)
    # plt.subplots returns a bare Axes (not an array) for a single subplot;
    # normalise so that one-target water bodies work as well.
    axes = np.atleast_1d(axes)
    plt.subplots_adjust(hspace = 1.2) 

    for ax, target in zip(axes, target_columns):
        df_features = imp_feature_dict[target]
        selected_columns = df_features['columns'].to_list()
        feat_imp = df_features['importances'].to_list()

        ax.xaxis.set_tick_params(rotation=20)
        ax.bar(selected_columns[-num_feature:], feat_imp[-num_feature:], color ='maroon',width=0.2)
        ax.set_title(target)

    return axes

In [None]:
# Plot the 10 most important selected features for each Bilancino target
# and save the figure.
feature_importance_plot(imp_features_Bilancino,label_cols_Bilancino,10,figsize=(14,8))
plt.savefig('feature_importance_Bilancino.jpeg')