In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")
sns.set(style="darkgrid")

%matplotlib inline

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
# `_subdirs` is used for the ignored directory list so IPython's `_`
# (last output) is not rebound.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the avocado price dataset and display the resulting frame.
csv_path = '../input/avocado-prices/avocado.csv'
df = pd.read_csv(csv_path)
df

## Columns of interest

- Date - The date of the observation
- AveragePrice - the average price of a single avocado
- type - conventional or organic
- year - the year
- region - the city or region of the observation
- Total Volume - Total number of avocados sold
- 4046 - Total number of avocados with PLU 4046 sold
- 4225 - Total number of avocados with PLU 4225 sold
- 4770 - Total number of avocados with PLU 4770 sold

In [None]:
# Remove the 'Unnamed: 0' column and the bag-size columns, none of which is
# read anywhere else in this notebook. Rebinding `df` with errors='ignore'
# (instead of inplace=True) keeps the cell safe to re-run: a second run is a
# no-op rather than a KeyError on the already-removed columns.
df = df.drop(
    columns=['Unnamed: 0', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags'],
    errors='ignore',
)

In [None]:
# Prefix the numeric PLU code columns with 'PLU_' so they read as identifiers.
plu_names = {code: 'PLU_' + code for code in ('4046', '4225', '4770')}
df = df.rename(columns=plu_names)

In [None]:
# Preview the first rows to confirm the column drop and rename above.
df.head()

In [None]:
# Column dtypes and non-null counts; 'Date' has not been parsed yet at this point.
df.info()

In [None]:
# Parse the 'Date' column into datetimes so it can drive time-series indexing.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [None]:
# Number of distinct observation dates in the dataset.
df['Date'].nunique()

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

# Analysis

In [None]:
# Number of records for each avocado type.
# Uses Series.value_counts(); the top-level pd.value_counts() is deprecated.
df['type'].value_counts()

#### Avocado sold per type

In [None]:
# Total volume sold for each avocado type: printed, then drawn as a bar chart.
avocado_type = df.groupby('type')['Total Volume'].sum()
print(avocado_type)
avocado_type.plot(kind='bar');

#### Avocado prices per type in each year

In [None]:
# Mean price for every (type, year) pair, kept as a flat frame for plotting.
price = df.groupby(['type', 'year'], as_index=False)['AveragePrice'].mean()

In [None]:
# Grouped bars: one cluster per year, one bar per avocado type.
plt.figure(figsize=(10, 7))
ax = sns.barplot(data=price, x='year', y='AveragePrice', hue='type')

#### Avocado sold in each region

In [None]:
# Rank regions by total volume sold and plot the ten largest.
region = df.groupby('region', as_index=False)['Total Volume'].sum()
top_region = region.nlargest(n=10, columns='Total Volume')
plt.figure(figsize=(12, 5))
ax = sns.barplot(data=top_region, x='region', y='Total Volume', color='b')

In [None]:
# Index labels of the rows whose region is the 'TotalUS' aggregate.
total_us = df.index[df['region'] == 'TotalUS']

In [None]:
# Remove the 'TotalUS' rows collected in `total_us`. Rebinding `df` with
# errors='ignore' (instead of inplace=True) makes the cell idempotent:
# re-running it after the rows are gone is a no-op rather than a KeyError.
df = df.drop(index=total_us, errors='ignore')

In [None]:
# Same ranking as before, recomputed on the current `df`.
volume_by_region = df.groupby(['region'])['Total Volume'].sum()
region = volume_by_region.reset_index()
top_region = region.nlargest(10, 'Total Volume')
plt.figure(figsize=(12, 5))
ax = sns.barplot(x='region', y='Total Volume', color='b', data=top_region)

#### Avocado records per year

In [None]:
# Bar chart of how many records the dataset holds for each year.
plt.figure(figsize=(8, 5))
records_per_year = df['year'].value_counts().sort_index()
lt = records_per_year.plot(kind='bar')
lt.set_title('Avocado records Per Year')
lt.set_xlabel('Year', fontsize='13')
lt.set_ylabel('Count', fontsize='13')
plt.show()

#### Avocado sold per year

In [None]:
# Total volume sold per year, shown as bars (left) and as a line (right)
# on a shared y-axis.
sold_per_year = df.groupby('year', as_index=False)['Total Volume'].sum()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharey=True)
fig.suptitle('Avocado sold per year')
bar_ax, line_ax = axes

sns.barplot(data=sold_per_year, x='year', y='Total Volume', color='b', ax=bar_ax)
sns.lineplot(x=sold_per_year['year'], y=sold_per_year['Total Volume'], ax=line_ax);

#### Average price of Avocado per year

In [None]:
# Line plot of average price against year.
plt.figure(figsize=(12, 5))
sns.lineplot(x=df['year'], y=df['AveragePrice'])
# The x-axis carries the years and the y-axis the price; the two labels were
# previously attached to the wrong axes.
plt.xlabel('years')
plt.ylabel('Avg price')
plt.show()

# Time Series
A time series is a sequence of numerical data points in successive order.

### Average price weekly

In [None]:
# Mean price per observation date, kept as a one-column frame indexed by Date.
df_date = df.groupby('Date')[['AveragePrice']].mean()

# Price over time
plt.figure(figsize=(12, 5))
df_date['AveragePrice'].plot();

Let's now look at each year separately.

In [None]:
# One panel per calendar year of the per-date average price.
fig, axes = plt.subplots(4, 1, figsize=(15, 10), sharex=False)
# These panels plot AveragePrice, so the title names price rather than the
# "sold per year" text it carried before.
fig.suptitle('Average avocado price per year')

# 2015-2017 are plotted against the month number of each date.
df_2015 = df_date.loc['2015']
sns.lineplot(x=df_2015.index.month, y=df_2015['AveragePrice'], ax=axes[0])

df_2016 = df_date.loc['2016']
sns.lineplot(x=df_2016.index.month, y=df_2016['AveragePrice'], ax=axes[1])

df_2017 = df_date.loc['2017']
sns.lineplot(x=df_2017.index.month, y=df_2017['AveragePrice'], ax=axes[2])

# 2018 is plotted against the dates themselves rather than the month number.
df_2018 = df_date.loc['2018']
sns.lineplot(x=df_2018.index, y=df_2018['AveragePrice'], ax=axes[3]);

In [None]:
# Mean 'Total Volume' per observation date, plotted over time.
df_date_sold = df.groupby('Date')[['Total Volume']].mean()
plt.figure(figsize=(12, 5))
df_date_sold['Total Volume'].plot();

# Stationary time series
Stationarity means that the statistical properties of the process generating a time series do not change over time:
- Constant mean
- Constant Standard deviation
- No seasonality (a regularly repeating pattern of highs and lows related to calendar time)

### Organic Avocados

In [None]:
# Per-date mean price restricted to organic avocados.
organic_rows = df.loc[df['type'] == 'organic']
df_date_organic = organic_rows.groupby('Date')[['AveragePrice']].mean()

# Price over time
plt.figure(figsize=(12, 5))
df_date_organic['AveragePrice'].plot();

### Unit root tests
Tests for presence of unit root (Non stationary)

### Augmented Dickey Fuller 
The null hypothesis of this test is that the time series is non-stationary (it has a unit root).

- Null Hypothesis (H0): It is non-stationary. It has some time dependent structure.
- Alternate Hypothesis (H1): It is stationary. It does not have time-dependent structure.
- p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
- p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary. 

In [None]:
from statsmodels.tsa.stattools import adfuller

# ADF test on the organic price series (lag length chosen by AIC).
X = df_date_organic['AveragePrice'].values
result = adfuller(X, autolag='AIC')
adf_stat, adf_pvalue, adf_critical = result[0], result[1], result[4]

print(f'ADF Statistic: {adf_stat:f}')
print(f'p-value: {adf_pvalue:f}')
print('Critical Values:')
for level, threshold in adf_critical.items():
    print(f'\t{level}: {threshold:.3f}')

# A p-value above 0.05 means the unit-root null cannot be rejected.
print("series is non-stationary" if adf_pvalue > 0.05 else "series is stationary")

### Kwiatkowski-Phillips-Schmidt-Shin (KPSS)
The null and alternate hypothesis for the KPSS test are opposite that of the ADF test.

- Null Hypothesis (H0): series is stationary.
- Alternate Hypothesis (H1): series is non-stationary.
- p-value > 0.05: Fail to reject the null hypothesis (H0), and series is stationary.
- p-value <= 0.05: Reject the null hypothesis (H0), series is non-stationary. 

In [None]:
# KPSS test on the organic price series (null hypothesis: stationary).
from statsmodels.tsa.stattools import kpss

X = df_date_organic['AveragePrice'].values
statistic, p_value, n_lags, critical_values = kpss(X, nlags='auto')
# This is the KPSS statistic; the label previously said "ADF" by copy-paste.
print('KPSS Statistic: %f' % statistic)
print('p-value: %f' % p_value)
# The lag count is an integer, so format it as one.
print('num lags: %d' % n_lags)
print('Critical Values:')
for key, value in critical_values.items():
    print('\t%s: %.3f' % (key, value))

# Failing to reject the null (p > 0.05) indicates stationarity.
if p_value > 0.05:
    print("series is stationary")
else:
    print("series is not stationary")

In [None]:
# Positional 75/25 split: the first 75% of rows train, the remainder test.
train_size = int(len(df_date_organic) * 0.75)
train_organic = df_date_organic.iloc[:train_size]
test_organic = df_date_organic.iloc[train_size:]

In [None]:
# Display the training portion of the organic price series.
train_organic

# Arima

- AR: Autoregression
- I: Integrated
- MA: Moving Average

In [None]:
# Autocorrelation of the organic training prices out to 50 lags.
sm.graphics.tsa.plot_acf(train_organic['AveragePrice'], lags=50)
plt.show()

In [None]:
# Partial autocorrelation of the organic training prices out to 50 lags.
sm.graphics.tsa.plot_pacf(train_organic['AveragePrice'], lags=50)
plt.show()

In [None]:
# Rebuild the training index with the frequency pandas infers from its dates,
# so the index carries an explicit `freq`.
organic_freq = train_organic.index.inferred_freq
train_organic.index = pd.DatetimeIndex(train_organic.index.values, freq=organic_freq)

In [None]:
# Fit ARIMA(5, 0, 7) on the organic training prices.
model = ARIMA(train_organic, order=(5, 0, 7))
results = model.fit()

# Observed training prices (default colour) against in-sample fitted values (red).
plt.figure(figsize=(12, 5))
plt.plot(train_organic)
plt.plot(results.fittedvalues, color='red');

In [None]:
# In-sample error of the organic model; the MSE is computed once and reused.
train_mse = mean_squared_error(train_organic['AveragePrice'], results.fittedvalues)
print("\tMean Squared Error:", train_mse)
print("\tRoot Mean Squared Error:", np.sqrt(train_mse))

In [None]:
# Residuals of the organic fit: first over time, then as a density estimate.
residuals = pd.DataFrame(results.resid)

residuals.plot(kind='line')
plt.show()

residuals.plot.kde()
plt.show()

In [None]:
# Walk-forward validation: refit on everything seen so far, forecast one step
# ahead, then append the true observation before the next iteration.
# NOTE(review): this loop uses order (6,0,12) while the in-sample organic fit
# earlier in the notebook uses (5,0,7) - confirm the difference is intended.
history = train_organic['AveragePrice'].tolist()
test_data = test_organic['AveragePrice'].tolist()
predictions = []

for obs in test_data:
    model = ARIMA(history, order=(6,0,12))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]  # single one-step-ahead forecast
    predictions.append(yhat)
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

In [None]:
# Out-of-sample error of the walk-forward forecasts; the MSE is computed once.
error = mean_squared_error(test_organic, predictions)
print("\tMean Squared Error:", error)
print("\tRoot Mean Squared Error:", np.sqrt(error))

In [None]:
# Actual organic test prices (default colour) against the forecasts (red),
# both plotted by position.
plt.figure(figsize=(12, 5))
plt.plot(test_organic.to_numpy())
plt.plot(predictions, color='red');

## conventional

In [None]:
# Per-date mean price restricted to conventional avocados.
conventional_rows = df.loc[df['type'] == 'conventional']
df_date_conventional = conventional_rows.groupby('Date')[['AveragePrice']].mean()

# Price over time
plt.figure(figsize=(12, 5))
df_date_conventional['AveragePrice'].plot();

### Augmented Dickey Fuller 
The null hypothesis of this test is that the time series is non-stationary (it has a unit root).

- Null Hypothesis (H0): It is non-stationary. It has some time dependent structure.
- Alternate Hypothesis (H1): It is stationary. It does not have time-dependent structure.
- p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
- p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary. 

In [None]:
from statsmodels.tsa.stattools import adfuller

# ADF test on the conventional price series (lag length chosen by AIC).
X = df_date_conventional['AveragePrice'].values
result = adfuller(X, autolag='AIC')
adf_stat, adf_pvalue, adf_critical = result[0], result[1], result[4]

print(f'ADF Statistic: {adf_stat:f}')
print(f'p-value: {adf_pvalue:f}')
print('Critical Values:')
for level, threshold in adf_critical.items():
    print(f'\t{level}: {threshold:.3f}')

# A p-value above 0.05 means the unit-root null cannot be rejected.
print("series is non-stationary" if adf_pvalue > 0.05 else "series is stationary")

### Kwiatkowski-Phillips-Schmidt-Shin (KPSS)
The null and alternate hypothesis for the KPSS test are opposite that of the ADF test.

- Null Hypothesis (H0): series is stationary.
- Alternate Hypothesis (H1): series is non-stationary.
- p-value > 0.05: Fail to reject the null hypothesis (H0), and series is stationary.
- p-value <= 0.05: Reject the null hypothesis (H0), series is non-stationary. 

In [None]:
# KPSS test on the conventional price series (null hypothesis: stationary).
from statsmodels.tsa.stattools import kpss

X = df_date_conventional['AveragePrice'].values
statistic, p_value, n_lags, critical_values = kpss(X, nlags='auto')
# This is the KPSS statistic; the label previously said "ADF" by copy-paste.
print('KPSS Statistic: %f' % statistic)
print('p-value: %f' % p_value)
# The lag count is an integer, so format it as one.
print('num lags: %d' % n_lags)
print('Critical Values:')
for key, value in critical_values.items():
    print('\t%s: %.3f' % (key, value))

# Failing to reject the null (p > 0.05) indicates stationarity.
if p_value > 0.05:
    print("series is stationary")
else:
    print("series is not stationary")

In [None]:
# First difference of the conventional price (each value minus the previous
# one); the first row is NaN and is dropped for plotting.
df_date_conventional['AveragePriceDiff'] = df_date_conventional['AveragePrice'].diff()
price_diff = df_date_conventional['AveragePriceDiff'].dropna()
price_diff.plot(figsize=(10, 5));

In [None]:
from statsmodels.tsa.stattools import adfuller

# ADF test on the differenced conventional price series.
X = df_date_conventional['AveragePriceDiff'].dropna().values
result = adfuller(X, autolag='AIC')
adf_stat, adf_pvalue, adf_critical = result[0], result[1], result[4]

print(f'ADF Statistic: {adf_stat:f}')
print(f'p-value: {adf_pvalue:f}')
print('Critical Values:')
for level, threshold in adf_critical.items():
    print(f'\t{level}: {threshold:.3f}')

# A p-value above 0.05 means the unit-root null cannot be rejected.
print("series is non-stationary" if adf_pvalue > 0.05 else "series is stationary")

In [None]:
# KPSS test on the differenced conventional price series
# (null hypothesis: stationary).
from statsmodels.tsa.stattools import kpss

X = df_date_conventional['AveragePriceDiff'].dropna().values
statistic, p_value, n_lags, critical_values = kpss(X, nlags='auto')
# This is the KPSS statistic; the label previously said "ADF" by copy-paste.
print('KPSS Statistic: %f' % statistic)
print('p-value: %f' % p_value)
# The lag count is an integer, so format it as one.
print('num lags: %d' % n_lags)
print('Critical Values:')
for key, value in critical_values.items():
    print('\t%s: %.3f' % (key, value))

# Failing to reject the null (p > 0.05) indicates stationarity.
if p_value > 0.05:
    print("series is stationary")
else:
    print("series is not stationary")

In [None]:
# Positional 75/25 split of the conventional frame: first 75% of rows train,
# the remainder test.
train_size = int(len(df_date_conventional) * 0.75)
train_conventional = df_date_conventional.iloc[:train_size]
test_conventional = df_date_conventional.iloc[train_size:]

# Arima

- AR: Autoregression
- I: Integrated
- MA: Moving Average

In [None]:
# Autocorrelation of the conventional training prices out to 50 lags.
sm.graphics.tsa.plot_acf(train_conventional['AveragePrice'], lags=50)
plt.show()

In [None]:
# Partial autocorrelation of the conventional training prices out to 50 lags.
sm.graphics.tsa.plot_pacf(train_conventional['AveragePrice'], lags=50)
plt.show()

In [None]:
# Rebuild the training index with the frequency pandas infers from its dates,
# so the index carries an explicit `freq`.
conventional_freq = train_conventional.index.inferred_freq
train_conventional.index = pd.DatetimeIndex(train_conventional.index.values, freq=conventional_freq)

In [None]:
# Fit ARIMA(10, 1, 6) on the conventional training prices.
model = ARIMA(train_conventional['AveragePrice'], order=(10, 1, 6))
results_conventional = model.fit()

# Observed prices (default colour) against fitted values (red); the first
# fitted value is left out of the plot.
plt.figure(figsize=(12, 5))
plt.plot(train_conventional['AveragePrice'])
plt.plot(results_conventional.fittedvalues.iloc[1:], color='red');

In [None]:
# In-sample error of the conventional ARIMA(10, 1, 6) fit.
# With d=1 the first fitted value has no preceding observation to build on,
# so it is excluded here - the same way the fitted-vs-actual plot skips it.
# Including it would inflate the error with one artificial, large residual.
fitted_conventional = results_conventional.fittedvalues.iloc[1:]
actual_conventional = train_conventional['AveragePrice'].iloc[1:]
train_mse_conventional = mean_squared_error(actual_conventional, fitted_conventional)
print("\tMean Squared Error:", train_mse_conventional)
print("\tRoot Mean Squared Error:", np.sqrt(train_mse_conventional))

In [None]:
# Residuals of the conventional fit: first over time, then as a density estimate.
residuals = pd.DataFrame(results_conventional.resid)

residuals.plot(kind='line')
plt.show()

residuals.plot.kde()
plt.show()

In [None]:
# Walk-forward validation: refit on everything seen so far, forecast one step
# ahead, then append the true observation before the next iteration.
history = train_conventional['AveragePrice'].tolist()
test_data = test_conventional['AveragePrice'].tolist()
predictions = []

for obs in test_data:
    model = ARIMA(history, order=(10,1,6))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]  # single one-step-ahead forecast
    predictions.append(yhat)
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

In [None]:
# Out-of-sample error of the conventional walk-forward forecasts; the MSE is
# computed once and reused.
test_mse_conventional = mean_squared_error(test_conventional['AveragePrice'], predictions)
print("\tMean Squared Error:", test_mse_conventional)
print("\tRoot Mean Squared Error:", np.sqrt(test_mse_conventional))

In [None]:
# Actual conventional test prices (default colour) against the forecasts
# (red), both plotted by position.
plt.figure(figsize=(12, 5))
plt.plot(test_conventional['AveragePrice'].to_numpy())
plt.plot(predictions, color='red');

## Extra

In [None]:
from statsmodels.tsa.stattools import adfuller

# ADF test on the combined per-date price series (`df_date`).
X = df_date['AveragePrice'].dropna().values
result = adfuller(X)
adf_stat, adf_pvalue, adf_critical = result[0], result[1], result[4]

print(f'ADF Statistic: {adf_stat:f}')
print(f'p-value: {adf_pvalue:f}')
print('Critical Values:')
for level, threshold in adf_critical.items():
    print(f'\t{level}: {threshold:.3f}')

# A p-value above 0.05 means the unit-root null cannot be rejected.
print("series is non-stationary" if adf_pvalue > 0.05 else "series is stationary")

In [None]:
# KPSS test on the combined per-date price series (null hypothesis: stationary).
from statsmodels.tsa.stattools import kpss

X = df_date['AveragePrice'].dropna().values
# nlags='auto' is passed explicitly so this cell selects lags the same way as
# the other KPSS cells in this notebook, independent of the library default.
statistic, p_value, n_lags, critical_values = kpss(X, nlags='auto')
# This is the KPSS statistic; the label previously said "ADF" by copy-paste.
print('KPSS Statistic: %f' % statistic)
print('p-value: %f' % p_value)
# The lag count is an integer, so format it as one.
print('num lags: %d' % n_lags)
print('Critical Values:')
for key, value in critical_values.items():
    print('\t%s: %.3f' % (key, value))

# Failing to reject the null (p > 0.05) indicates stationarity.
if p_value > 0.05:
    print("series is stationary")
else:
    print("series is not stationary")

- Case 1: Both tests conclude that the series is not stationary - The series is not stationary
- Case 2: Both tests conclude that the series is stationary - The series is stationary
- Case 3: KPSS indicates stationarity and ADF indicates non-stationarity - The series is trend stationary. The trend needs to be removed to make the series strictly stationary. The detrended series is then checked for stationarity.
- Case 4: KPSS indicates non-stationarity and ADF indicates stationarity - The series is difference stationary. Differencing is to be used to make series stationary. The differenced series is checked for stationarity.