<font size="5">Avocado price prediction without and with time windowing</font>

In this example I compare a few regression methods on the avocado prices dataset and try to show the advantage of taking time-series windowing into account in such predictions.

In [None]:
# https://datacarpentry.org/python-ecology-lesson/03-index-slice-subset/

# %config IPCompleter.greedy=True
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for entry in file_names:
        print(os.path.join(root_dir, entry))

# Any results you write to the current directory are saved as output.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Return recall computed with Keras backend ops.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed; ``K.epsilon()`` is
    added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Return precision computed with Keras backend ops.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed; ``K.epsilon()`` is
    added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """Return the F1 score built from ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
def regression_results(y_true_a, y_pred_a):
    """Print a set of scikit-learn regression metrics, rounded to 4 decimals.

    Prints explained variance, r2, MAE, MSE, RMSE and median absolute error.
    The mean squared log error is printed only when every true and predicted
    value is >= 0.

    Parameters:
        y_true_a: true target values (must support ``>=`` and ``.all()``).
        y_pred_a: predicted target values (same requirements).
    """
    from sklearn.metrics import (
        explained_variance_score,
        mean_absolute_error,
        mean_squared_error,
        mean_squared_log_error,
        median_absolute_error,
        r2_score,
    )

    non_negative = (y_true_a >= 0).all() and (y_pred_a >= 0).all()

    # Compute every metric first, then print them in a fixed order.
    mse_value = mean_squared_error(y_true_a, y_pred_a)
    report = [
        ('explained_variance: ', explained_variance_score(y_true_a, y_pred_a)),
        ('r2: ', r2_score(y_true_a, y_pred_a)),
        ('MAE: ', mean_absolute_error(y_true_a, y_pred_a)),
        ('MSE: ', mse_value),
        ('RMSE: ', np.sqrt(mse_value)),
        ('median_absolute_error: ', median_absolute_error(y_true_a, y_pred_a)),
    ]
    for label, value in report:
        print(label, round(value, 4))

    if non_negative:
        msle_value = mean_squared_log_error(y_true_a, y_pred_a)
        print('mean_squared_log_error: ', round(msle_value, 4))
        
def result_plot(y_test_b, y_pred_b, name):
    """Plot predictions against true values and print regression metrics.

    Draws two figures: a wide overview with predictions (red) and true values
    (green) in sample order, then a predictions-vs-reality scatter plot with a
    dashed y = x reference line. Between the two figures the metrics are
    printed via ``regression_results``.

    Parameters:
        y_test_b: true target values (array-like exposing ``.min()``/``.max()``).
        y_pred_b: predicted target values (same requirements).
        name: label printed above the metrics.
    """
    import matplotlib
    import matplotlib.pyplot as plt

    # Overview of both series in sample order.
    plt.figure(figsize=(40, 10))
    plt.plot(y_pred_b, 'ro')
    plt.plot(y_test_b, 'go')
    plt.show()

    print(f'\n {name}:')
    regression_results(y_test_b, y_pred_b)

    matplotlib.rc('xtick', labelsize=15)
    matplotlib.rc('ytick', labelsize=15)
    # Select the style before the figure is created so this figure uses it too.
    plt.style.use('ggplot')

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(y_pred_b, y_test_b, 'ro')
    ax.set_xlabel('Predictions', fontsize=15)
    ax.set_ylabel('Reality', fontsize=15)
    ax.set_title('Predictions x Reality on dataset', fontsize=15)

    # Perfect predictions lie on y = x, so the reference line must use the
    # same bounds on both axes (not pred-min/max on x and test-min/max on y).
    low = min(y_pred_b.min(), y_test_b.min())
    high = max(y_pred_b.max(), y_test_b.max())
    ax.plot([low, high], [low, high], 'k--')
    plt.show()
    
def train_and_evaluate_keras_model(model, X_train_c, X_test_c, y_train_c, y_test_c, BATCH_SIZE, EPOCHS):
    """Fit a compiled Keras model, plot its training curves and print its scores.

    The model is expected to be compiled with the metrics ``'mse'`` and
    ``'mae'`` (in that order): the history keys and the unpacking of
    ``model.evaluate`` below rely on it.

    Parameters:
        model: compiled Keras model.
        X_train_c, y_train_c: training features and targets.
        X_test_c, y_test_c: test features and targets.
        BATCH_SIZE: batch size passed to ``model.fit``.
        EPOCHS: number of epochs passed to ``model.fit``.

    Returns:
        The same ``model`` object after training.
    """
    import matplotlib.pyplot as plt

    print("\n")
    print("Stats for Keras model:")
    history = model.fit(X_train_c, y_train_c, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

    # Per-epoch training metrics, labelled so the two curves can be told apart.
    plt.plot(history.history['mse'], label='mse')
    plt.plot(history.history['mae'], label='mae')
    plt.xlabel('epoch')
    plt.legend()

    # evaluate() returns the loss first, followed by the compiled metrics;
    # the first value is therefore the loss, not an r2 score.
    _loss, mse, mae = model.evaluate(X_train_c, y_train_c, verbose=1)
    print('Training stats:')
    print(f'mean_squared_error: {mse}')
    print(f'mean_absolute_error: {mae}')

    _loss, mse, mae = model.evaluate(X_test_c, y_test_c, verbose=1)
    print('Test stats:')
    print(f'mean_squared_error: {mse}')
    print(f'mean_absolute_error: {mae}')
    return model
    
        

First we load the avocado prices CSV file into a pandas DataFrame.

In [None]:
import pandas as pd
# Load the raw avocado prices CSV into a DataFrame.
avocado = pd.read_csv("../input/avocado-prices/avocado.csv")

We need to check whether any missing values are present in the dataset

In [None]:
# Preview the first 100 rows, then count the missing values in each column.
# NOTE(review): a notebook cell presumably renders only its last expression,
# so the head(100) preview is likely not displayed - confirm.
avocado.head(100)
avocado.isnull().sum()

Set the float display format for pandas and show descriptive statistics

In [None]:
# Display floats with the '{:f}' (fixed-point) format, then show summary statistics.
pd.set_option('float_format', '{:f}'.format)
avocado.describe()

**Unnamed: 0** - not useful, we do not know what it is.

**Date** - dropped for now: first we are going to see how regression performs without time series (we only leave year as a reference).

In [None]:
# Drop the two columns that are not used in the first (non-windowed) experiment
# and print the remaining schema.
avocado = avocado.drop(['Unnamed: 0', 'Date'], axis = 1)
avocado.info()

In [None]:
# Preview the first 100 rows after dropping the unused columns.
avocado.head(100)

Correlation coefficients between the numeric columns; non-numeric columns are left out of the correlation matrix

In [None]:
f, ax = plt.subplots()
# Restrict the frame to numeric columns explicitly: recent pandas versions no
# longer drop non-numeric columns silently in DataFrame.corr().
corr = avocado.select_dtypes(include='number').corr()
print(corr)
# Draw on the axes created above so the title lands on the same plot.
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.1g', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

Conclusion - most bags sold are small bags, then large and x-large. They are correlated with total volume in this order. Labels 4046 and 4225 are more correlated with volume than 4770. 4046 are usually in smaller bags and not common in x-large ones.

We can split the dataset into features and labels. **AveragePrice** is what we want to predict with regression. The data needs to be normalized and the object-type features encoded. We will use LabelEncoder for that.

In [None]:
# Features are every column except the target; the target is AveragePrice.
y_1 = avocado['AveragePrice'].values
X_1 = avocado.drop(['AveragePrice'], axis = 1).values

# Column 10 before encoding (all rows).
print('region', X_1[:, 10] )

# Encoding categorical data: columns 8, 9 and 10 each get their own encoder.
from sklearn.preprocessing import LabelEncoder
labelencoder_X_1, labelencoder_X_2, labelencoder_X_3 = (
    LabelEncoder(), LabelEncoder(), LabelEncoder())
for column_index, encoder in zip(
        (8, 9, 10), (labelencoder_X_1, labelencoder_X_2, labelencoder_X_3)):
    X_1[:, column_index] = encoder.fit_transform(X_1[:, column_index])

# Column 10 after encoding, followed by the target vector.
print('region', X_1[:, 10] )
print(y_1)

The data needs to be scaled in order to perform well in our ML process. StandardScaler will do all the work for us

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the full feature matrix and transform it.
# NOTE(review): the scaler is fitted before the train/test split that follows,
# so the test rows contribute to the scaling statistics - consider fitting on
# the training rows only.
scaler = StandardScaler()
scaler.fit(X_1)
scaled_features = scaler.transform(X_1)
# Wrap the scaled array in a DataFrame (no column names are passed).
df_feat = pd.DataFrame(scaled_features)

df_feat.head(100)

The data needs to be split into training and test sets. Models will be trained on the training set and scored on the test set

In [None]:
# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split
# Shuffled 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_feat, y_1, test_size = 0.2, random_state = 42, shuffle=True)

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib as matplotlib

# Ordinary least squares baseline: fit on the training set, predict the test set.
linReg = LinearRegression().fit(X_train, y_train)
pred_linreg = linReg.predict(X_test)

print(pred_linreg)

In [None]:
# Plot and score the linear regression predictions on the test set.
result_plot(y_test, pred_linreg, 'LinearRegression')


In [None]:
# Joint distribution of predicted (x) and true (y) prices for the linear model.
sns.jointplot(x=pred_linreg, y=y_test, color= 'g')

plt.show()

In [None]:
# Report the training matrix shape and the number of distinct target values.
print('X_train.shape: ', X_train.shape)
print('len(np.unique(y_train)): ', len(np.unique(y_train)))


In [None]:
#https://playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle&regDataset=reg-plane&learningRate=0.03&regularizationRate=0&noise=0&networkShape=3,2&seed=0.40488&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adadelta, Adam

# NOTE(review): X_train comes from a DataFrame split, so X_train[1] selects the
# column labelled 1 rather than one sample - confirm whether a row was intended.
print(X_train[1])
print(y_train)

# One-element tuple holding the number of input features.
observation_shape = X_train.shape[1],
# The stray '$' before the placeholder was printed literally; removed.
print(f'observation_shape: {observation_shape}')

BATCH_SIZE = 10
EPOCHS = 1000
# NOTE(review): VALIDATION_SPLIT is not used anywhere in this cell.
VALIDATION_SPLIT = 0.1

# Model: a stack of ReLU hidden layers followed by one linear output unit.
hidden_units = [200, 200, 200, 100, 100, 50, 25]
keras_model = Sequential()
keras_model.add(Dense(hidden_units[0], input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
for units in hidden_units[1:]:
    keras_model.add(Dense(units, kernel_initializer='normal', activation='relu'))
keras_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compile model
# optimizer_adam: EPOCHS = 200 BATCH_SIZE = 10
# explained_variance:  0.7678
# r2:  0.7661
# MAE:  0.1359
# MSE:  0.0376
# RMSE:  0.1939
# median_absolute_error:  0.0938
# mean_squared_log_error:  0.0056
optimizer_adam=Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.995, amsgrad=False)

# optimizer_adadelta: EPOCHS = 200 BATCH_SIZE = 10
# explained_variance:  0.7856
# r2:  0.7855
# MAE:  0.1265
# MSE:  0.0345
# RMSE:  0.1856
# median_absolute_error:  0.0833
# mean_squared_log_error:  0.0052
optimizer_adadelta=Adadelta(learning_rate=1.0, rho=0.90)
# The metric order ('mse', 'mae') is relied upon by train_and_evaluate_keras_model.
keras_model.compile(loss='mean_squared_error', optimizer=optimizer_adadelta, metrics=['mse', 'mae'])

#Train
keras_model = train_and_evaluate_keras_model(keras_model, X_train, X_test, y_train, y_test, BATCH_SIZE, EPOCHS)
pred_keras = keras_model.predict(X_test)

#print(prediction)
# model.evaluate(X_train, y_train)

#print(pred_keras[:100])
#print(y_test[:100])

In [None]:
# Plot and score the Keras model predictions on the test set.
result_plot(y_test, pred_keras, 'Keras Model')

In [None]:
from sklearn.linear_model import Lasso
import matplotlib as matplotlib

# L1-regularised linear model restricted to non-negative coefficients.
lasso_params = dict(
    alpha=0.0001,
    fit_intercept=True,
    tol=0.000001,
    max_iter=100000000,
    positive=True,
    warm_start=True,
)
model_lasso = Lasso(**lasso_params).fit(X_train, y_train)
pred_lasso = model_lasso.predict(X_test)

result_plot(y_test, pred_lasso, 'Lasso')

In [None]:
from sklearn.linear_model import Ridge
import matplotlib as matplotlib
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it fails on current releases. The features were already
# standardised with StandardScaler earlier in the notebook, so it is dropped.
# NOTE(review): the effective strength of `alpha` differs slightly from the old
# normalize=True behaviour - re-tune alpha if the exact old scores matter.
model_ridge = Ridge(alpha=0.0001, max_iter=100000000, solver="auto")
model_ridge.fit(X_train, y_train)
pred_ridge = model_ridge.predict(X_test)

result_plot(y_test, pred_ridge, 'Ridge')

In [None]:
# https://laurenscoster.com/blog/xgboost/
# https://xgboost.readthedocs.io/en/latest/parameter.html
from xgboost import XGBRegressor

# Small gradient-boosted tree ensemble trained on the non-windowed features.
xgb_params = dict(
    objective ='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.5,
    max_depth=5,
    alpha=0.01,
    n_estimators=10,
    verbosity=2,
)
model_xgb = XGBRegressor(**xgb_params)
model_xgb.fit(X_train, y_train)
pred_xgb = model_xgb.predict(X_test)

result_plot(y_test, pred_xgb, 'XGBRegressor')

Now we will check whether taking the time series into account (windowing) results in better performance of our models.
[Read about time series prediction](https://machinelearningmastery.com/time-series-prediction-with-deep-learning-in-python-with-keras/)
This time we will take the date into account. Looking at the data, the price is recorded every week. We will split the data per region and type, sort it by week and create windows for the time series.
> For example, given the current time (t) we want to predict the value at the next time in the sequence (t + 1), we can use the current time (t) as well as the two prior times (t-1 and t-2).

> When phrased as a regression problem the input variables are t-2, t-1, t and the output variable is t+1

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import sys
import csv
import os

import pandas as pd
import xgboost as xgb
import numpy as np
from numpy import loadtxt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pickle

def read_file(file_name):
    """Load the comma-separated file ``file_name`` into a pandas DataFrame."""
    frame = pd.read_csv(file_name, sep=',')
    return frame

def clean_and_fill_data(data):
    """Prepare the raw avocado frame for time windowing.

    Steps:
      * drop the ``'Unnamed: 0'`` column,
      * add a ``'weeks'`` column holding the ISO week number of ``'Date'``,
      * encode ``'region'`` and ``'type'`` as integer codes assigned in sorted
        label order (the same numbering LabelEncoder produces),
      * sort rows by region, type and the actual date,
      * drop ``'Date'`` and reorder the columns so ``'AveragePrice'`` is last.

    Parameters:
        data: DataFrame with the raw avocado columns, including ``'Date'``.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # drop() returns a new frame, so the assignments below do not touch the input.
    data = data.drop(columns='Unnamed: 0')

    dates = pd.to_datetime(data['Date'])
    # Series.dt.week was removed in pandas 2.0; isocalendar().week gives the
    # same ISO week numbers.
    data['weeks'] = dates.dt.isocalendar().week.astype('int64')

    # Encoding categorical data (each column encoded exactly once).
    data['region'] = pd.factorize(data['region'], sort=True)[0]
    data['type'] = pd.factorize(data['type'], sort=True)[0]

    # Sort chronologically inside every (region, type) series. Sorting by
    # calendar year + ISO week misplaces early-January rows that belong to ISO
    # week 52/53 of the previous year, which would break the time windows.
    data = data.assign(_sort_date=dates).sort_values(
        ['region', 'type', '_sort_date'], ascending=True)

    # Final column order; 'Date' and the helper sort column are left out.
    new_order_column = ['year', 'weeks', 'Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags',
                        'Large Bags', 'XLarge Bags', 'type', 'region', 'AveragePrice']
    return data[new_order_column]

def split_data_by_region_type(data):
    """Split ``data`` into one DataFrame per (region, type) combination.

    Regions and types are taken in order of first appearance; every region is
    paired with every type, so a combination without rows yields an empty frame.

    Returns:
        List of DataFrames ordered region-major, type-minor.
    """
    region_values = data['region'].unique()
    type_values = data['type'].unique()

    pieces = []
    for region_value in region_values:
        region_rows = data.loc[data['region'] == region_value]
        pieces.extend(
            region_rows.loc[region_rows['type'] == type_value]
            for type_value in type_values
        )
    return pieces

def prepare_data_for_model(data_splited, window_size):
    """Flatten sliding windows of consecutive rows into single sample rows.

    For every frame in ``data_splited`` a window of ``window_size`` consecutive
    rows is moved down one row at a time; the rows inside a window are
    concatenated, oldest first, into one flat list. Windows never span two
    frames, and frames shorter than ``window_size`` contribute nothing.

    Returns:
        List of flat lists, one per window.
    """
    samples = []
    for frame in data_splited:
        rows = frame.to_numpy()
        row_count = rows.shape[0]

        for start in range(row_count):
            # Stop as soon as a full window no longer fits in this frame.
            if start + window_size > row_count:
                break
            flat_window = []
            for offset in range(window_size):
                flat_window.extend(rows[start + offset])
            samples.append(flat_window)
    return samples

def save_data(model, file_name):
    """Print "save" and write every row of ``model`` to ``file_name`` as CSV.

    The file is opened in write mode, so existing content is replaced.
    """
    print("save")
    # The with-block closes the file; no explicit close is needed afterwards.
    with open(file_name, 'w', newline='') as out_file:
        csv.writer(out_file).writerows(model)

In [None]:
import os

# Load the raw CSV and run the cleaning / encoding / sorting step in one go.
data = clean_and_fill_data(read_file("../input/avocado-prices/avocado.csv"))

# save data
file_name = "01_transform.csv"
data.to_csv(file_name, index=False)

In [None]:
# Number of consecutive rows that are flattened into one sample.
for_window = 5
# One frame per (region, type) pair, then sliding windows inside each frame.
data_splited = split_data_by_region_type(data)
for_model = prepare_data_for_model(data_splited, for_window)

# Persist the windowed samples (written without a header row).
save_data(for_model,"02_data_for_model.csv")

In [None]:
file_name = "02_data_for_model.csv"
dataset = pd.read_csv(file_name, header=None)
dataset = dataset.values  # the file has no header row; keep the values only
X_time = dataset[:, 0:-1]  # all rows, every column except the last one
Y_time = dataset[:, -1]  # last column as a flat 1-D target vector

test_size_time = 0.2
# Fixed seed, matching the first experiment, so the scores are reproducible.
# NOTE(review): consecutive windows overlap, so a random split places nearly
# identical rows in both train and test - consider a chronological split.
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(
    X_time, Y_time, test_size=test_size_time, random_state=42)

model_xgb_window = XGBRegressor(objective ='reg:squarederror', colsample_bytree=0.3, learning_rate=0.5, max_depth=5, alpha=0.01, n_estimators=10, verbosity=2)
model_xgb_window.fit(X_train_time, y_train_time)
y_pred_xgb_widnowing = model_xgb_window.predict(X_test_time)

result_plot(y_test_time, y_pred_xgb_widnowing, 'XGBRegressor with windowing (trend)')

In [None]:
from sklearn.linear_model import Lasso
import matplotlib as matplotlib

# Lasso on the windowed features, restricted to non-negative coefficients.
lasso_time_params = dict(
    alpha=0.01,
    fit_intercept=True,
    tol=0.00000001,
    max_iter=1000000,
    positive=True,
    warm_start=True,
)
model_lasso = Lasso(**lasso_time_params).fit(X_train_time, y_train_time)
pred_lasso_time = model_lasso.predict(X_test_time)

result_plot(y_test_time, pred_lasso_time, 'Lasso')



In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib as matplotlib

# Ordinary least squares on the windowed features.
linReg = LinearRegression().fit(X_train_time, y_train_time)
pred_linreg_time = linReg.predict(X_test_time)

result_plot(y_test_time, pred_linreg_time, 'LinearRegression')

Conclusion: Windowing the time series gave us very good LinearRegression prediction scores, with or without L1/L2 regularization, and better scores with the other regressions. The decision tree regressor (XGB) and the Keras neural network regression achieved results before time windowing that are comparable to the ones after time windowing.