In [None]:
!pip install nb_black -q

In [None]:
# Load the nb_black IPython extension that the previous cell installs.
%load_ext nb_black

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk discovers them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk("/kaggle/input")
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw file once and never modify it, so this cell can be re-run safely.
avocado_raw = pd.read_csv("../input/avocado-prices/avocado.csv")

# Remove the columns the analysis does not use, parse the dates and keep only
# the rows for conventional avocados. Every step returns a new frame.
avocado_conventional = (
    avocado_raw.drop(
        columns=[
            "Unnamed: 0",
            "Total Volume",
            "4046",
            "4225",
            "4770",
            "Total Bags",
            "Small Bags",
            "Large Bags",
            "XLarge Bags",
            "year",
        ]
    )
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .loc[lambda frame: frame["type"] == "conventional"]
)

# Average per date. `numeric_only=True` is required because string columns such
# as `type` are still present: a plain `.mean()` raises a TypeError on
# pandas >= 2.0 and only worked before by silently dropping those columns.
mean_by_date = avocado_conventional.groupby("Date").mean(numeric_only=True)
if mean_by_date.shape[1] != 1:
    raise ValueError(
        "Expected exactly one numeric column (the price) after the groupby, "
        f"found: {list(mean_by_date.columns)}"
    )

# Final layout is unchanged: columns ["Price", "Date"] on a fresh 0..n-1 index.
avocado_price = pd.DataFrame(
    {"Price": mean_by_date.iloc[:, 0].to_numpy(), "Date": mean_by_date.index}
)
avocado_price.head()


In [None]:
# Load the Brent oil quotes, reading the "Price" column as float, then report
# the size of the frame and preview its first rows.
price_oil = pd.read_csv(
    filepath_or_buffer="../input/brent-oil-prices/BrentOilPrices.csv",
    dtype={"Price": float},
)
print("price_oil shape ->", price_oil.shape)
price_oil.head()

There are a lot of lines here and, to be honest, we can only analyse one item at a time... If we turn this into a process or a function, everything will keep working perfectly when we change the item id...

## Translate the item names

# Simplify data

In [None]:
def transform_dataset(df):
    """Return a copy of ``df`` with ``Date`` parsed and rows sorted by date.

    The input frame is left untouched. The previous implementation converted
    and sorted ``df`` in place and returned that same object, so the caller's
    original variable was silently modified and aliased the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Date`` as datetimes, sorted ascending by ``Date`` and
        re-indexed from 0.
    """
    return (
        df.assign(Date=pd.to_datetime(df["Date"]))  # assign() works on a copy
        .sort_values("Date")
        .reset_index(drop=True)
    )


# Normalise both price tables the same way: parse Date, sort by it, reset the index.
price_oil = transform_dataset(price_oil)
price_avd = transform_dataset(avocado_price)

# EDA

In [None]:
import plotly.graph_objects as go
import plotly.express as px


def analysi_basic_statistical(data, title):
    """Display a Plotly table with the descriptive statistics of ``Price``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Price`` column.
    title : str
        Title shown above the table.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    summary = data.describe()
    stat_names = list(summary.index)
    stat_values = list(summary.round(2)["Price"])

    table = go.Table(
        header={"values": ["Parameter", "Price ($)"], "font": {"size": 20}},
        cells={
            "values": [stat_names, stat_values],
            "align": "left",
            "height": 30,
            "font": {"size": 15},
        },
    )

    fig = go.Figure(data=[table])
    fig.update_layout(width=600, showlegend=False, title_text=title)
    fig.show()


# Summary-statistics table (describe() of the Price column) for the oil series.
analysi_basic_statistical(price_oil, "Statistical information for Oil")

In [None]:
# Summary-statistics table (describe() of the Price column) for the avocado series.
analysi_basic_statistical(price_avd, "Statistical information for Avocado")

In [None]:
import plotly.graph_objects as go
import plotly.express as px


def analysi_historical(df, title):
    """Plot ``Price`` against ``Date`` as an interactive Plotly line chart.

    The x axis gets a range slider plus quick-zoom buttons for the last
    1/3/6 months and 1/2/4 years, and an "all" button.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Date`` and ``Price`` columns.
    title : str
        Chart title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # (count, unit) pairs for the backward-looking zoom buttons.
    lookbacks = [
        (1, "month"),
        (3, "month"),
        (6, "month"),
        (1, "year"),
        (2, "year"),
        (4, "year"),
    ]
    buttons = [
        {
            "count": count,
            "label": f"{count} {unit}" if count == 1 else f"{count} {unit}s",
            "step": unit,
            "stepmode": "backward",
        }
        for count, unit in lookbacks
    ]
    buttons.append({"step": "all"})

    fig = px.line(df, x="Date", y="Price")
    fig.update_xaxes(rangeslider_visible=True, rangeselector={"buttons": buttons})
    fig.update_layout(title_text=title, title_font_size=20)
    fig.show()


# Interactive Price-vs-Date line chart for the oil series.
analysi_historical(price_oil, "Historical oil price in american dollars ($)")

In [None]:
# Interactive Price-vs-Date line chart for the avocado series.
analysi_historical(price_avd, "Historical avocado in american dollars ($)")

## Autocorrelation and Partial Autocorrelation

### Autocorrelation
Autocorrelation, also known as serial correlation, is the correlation of a signal with a delayed copy of itself as a function of delay. Informally, it is the similarity between observations as a function of the time lag between them. The analysis of autocorrelation is a mathematical tool for finding repeating patterns, such as the presence of a periodic signal obscured by noise, or identifying the missing fundamental frequency in a signal implied by its harmonic frequencies. It is often used in signal processing for analyzing functions or series of values, such as time domain signals. 

[Source - Wikipedia](https://en.wikipedia.org/wiki/Autocorrelation)

### Partial Correlation
In probability theory and statistics, partial correlation measures the degree of association between two random variables, with the effect of a set of controlling random variables removed. If we are interested in finding to what extent there is a numerical relationship between two variables of interest, using their correlation coefficient will give misleading results if there is another, confounding, variable that is numerically related to both variables of interest. This misleading information can be avoided by controlling for the confounding variable, which is done by computing the partial correlation coefficient. This is precisely the motivation for including other right-side variables in a multiple regression; but while multiple regression gives unbiased results for the effect size, it does not give a numerical value of a measure of the strength of the relationship between the two variables of interest.

[Source - Wikipedia](https://en.wikipedia.org/wiki/Partial_correlation)


[Suggested reading](https://dzone.com/articles/autocorrelation-in-time-series-data)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib import pyplot as plt


def plot_correlations(serie):
    """Draw the ACF and then the PACF of ``serie``, each on its own wide figure.

    Parameters
    ----------
    serie : array-like
        Values passed unchanged to ``plot_acf`` and ``plot_pacf``.
    """
    for draw in (plot_acf, plot_pacf):
        # One fresh 20x5 figure per plot; the figure returned by statsmodels
        # is intentionally discarded.
        _, axis = plt.subplots(figsize=(20, 5))
        draw(serie, ax=axis)

### Autocorrelation and Partial Autocorrelation for Oil

In [None]:
# ACF and PACF plots for the oil price series.
plot_correlations(price_oil["Price"])

### Autocorrelation and Partial Autocorrelation for Avocado

In [None]:
# ACF and PACF plots for the avocado price series.
plot_correlations(price_avd["Price"])

## Seasonality

In time series data, seasonality is the presence of variations that occur at specific regular intervals less than a year, such as weekly, monthly, or quarterly. Seasonality may be caused by various factors, such as weather, vacation, and holidays and consists of periodic, repetitive, and generally regular and predictable patterns in the levels of a time series.

[Source - Wikipedia](https://en.wikipedia.org/wiki/Seasonality)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose


def seasonal_decompose_and_graph(
    data,
    seasonal_type="additive",
    period=52,
    date_column="Date",
    value_column="Price",
    title="Seasonal analysis",
):
    """Decompose a series with ``seasonal_decompose`` and plot its four parts.

    The observed, trend, seasonal and residual components are drawn as four
    stacked panels sharing the x axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the date and value columns.
    seasonal_type : str, default "additive"
        Passed to ``seasonal_decompose`` as ``model``.
    period : int, default 52
        Passed to ``seasonal_decompose`` as ``period``.
    date_column : str, default "Date"
        Column used for the x axis.
    value_column : str, default "Price"
        Column that is decomposed.
    title : str, default "Seasonal analysis"
        Figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Kept as local imports: this function is the only user of make_subplots in
    # the notebook and must stay runnable on its own.
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    decomposition = seasonal_decompose(
        data[value_column], model=seasonal_type, period=period
    )

    # (y-axis label, component) for each panel, top to bottom.
    panels = [
        ("Observed", decomposition.observed),
        ("Trend", decomposition.trend),
        ("Seasonal", decomposition.seasonal),
        ("Resid/Noise", decomposition.resid),
    ]

    fig = make_subplots(shared_xaxes=True, rows=len(panels), cols=1)
    dates = data[date_column]
    for row, (label, component) in enumerate(panels, start=1):
        # add_trace(..., row=, col=) replaces the deprecated append_trace; naming
        # the trace gives the legend a real label instead of "trace N".
        fig.add_trace(go.Scatter(x=dates, y=component, name=label), row=row, col=1)
        fig.update_yaxes(title_text=label, row=row, col=1)

    fig.update_layout(height=700, width=1400, title_text=title)
    fig.update_xaxes(title_text="Date", row=len(panels), col=1)
    fig.show()


# Decompose the oil price with a 5-observation seasonal period.
# NOTE(review): presumably one trading week of daily quotes — confirm the sampling frequency.
seasonal_decompose_and_graph(
    price_oil, period=5, title="Seasonal decomposition for oil price."
)

In [None]:
# Decompose the avocado price with a 52-observation seasonal period.
# NOTE(review): presumably a yearly cycle on weekly observations — confirm the sampling frequency.
seasonal_decompose_and_graph(
    price_avd, period=52, title="Seasonal decomposition for avocado price."
)

## Moving average

In statistics, a moving average (rolling average or running average) is a calculation to analyze data points by creating a series of averages of different subsets of the full data set. It is also called a moving mean (MM) or rolling mean and is a type of finite impulse response filter. Variations include: simple, and cumulative, or weighted forms (described below).

[Source - Wikipedia](https://en.wikipedia.org/wiki/Moving_average)

In [None]:
import plotly.graph_objects as go


def moving_averange(
    data, delay=51, date_column="Date", value_column="Price", title="Price"
):
    """Plot rolling means of a series with a slider that selects the window size.

    One trace is built per window size ``1 .. delay - 1``; only the trace
    selected by the slider is visible.

    Fixes over the previous version:
      * the initially visible trace and the slider position now agree (before,
        trace index 5 was shown while the slider sat on step 10, which does not
        even exist when ``delay=7``);
      * the title reports the real window size instead of the trace index
        (they differ by one because windows start at 1);
      * slider steps are labelled with the window size;
      * a ``delay`` too small to produce any window raises a clear error
        instead of an IndexError.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the date and value columns.
    delay : int, default 51
        Exclusive upper bound of the window sizes to plot.
    date_column : str, default "Date"
        Column used for the x axis.
    value_column : str, default "Price"
        Column whose rolling mean is plotted.
    title : str, default "Price"
        Text appended to the figure title.

    Raises
    ------
    ValueError
        If ``delay`` is smaller than 2, so no window size would be plotted.
    """
    windows = [int(size) for size in np.arange(1, delay, 1)]
    if not windows:
        raise ValueError("delay must be at least 2 so that one window size is plotted")

    # Default slider position is step 10, clamped to the last available step.
    active = min(10, len(windows) - 1)

    def _figure_title(window):
        return "Moving average with window size: " + str(window) + ", for " + title

    fig = go.Figure()
    x = data[date_column]

    # One trace per window size; only the active one starts visible.
    for position, window in enumerate(windows):
        fig.add_trace(
            go.Scatter(
                visible=(position == active),
                line=dict(color="#3C5074", width=1),
                name="WS=" + str(window),
                x=x,
                y=data[value_column].rolling(window).mean().values,
            )
        )

    # One slider step per trace: show exactly that trace and update the title.
    steps = []
    for position, window in enumerate(windows):
        visibility = [False] * len(windows)
        visibility[position] = True
        steps.append(
            dict(
                method="update",
                label=str(window),
                args=[{"visible": visibility}, {"title": _figure_title(window)}],
            )
        )

    sliders = [
        dict(
            active=active,
            currentvalue={"prefix": "Window Size : "},
            pad={"t": 111},
            steps=steps,
        )
    ]

    fig.update_layout(sliders=sliders, title_text=_figure_title(windows[active]))

    fig.show()


# Uses the default delay=51, i.e. rolling windows of 1 through 50 observations.
moving_averange(price_oil, title="oil price")

In [None]:
# delay=7 builds rolling windows of 1 through 6 observations.
moving_averange(price_avd, delay=7, title="avocado price")

## Machine Learning - Regression

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### Prophet by Facebook

Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

[Site - Link](https://facebook.github.io/prophet/)

In [None]:
from fbprophet import Prophet


def format_to_prophet(serie_ds, serie_y):
    """Build the two-column frame Prophet expects: ``ds`` (dates) and ``y`` (values).

    ``serie_ds`` defines the index of the result; ``serie_y`` is then attached
    as column ``y`` using the usual pandas column-assignment alignment.
    """
    return pd.DataFrame({"ds": serie_ds}).assign(y=serie_y)


def train_predict(
    data, periods, kind, freq="W", plot=False, yearly_seasonality=False, cps=1
):
    """Fit Prophet on all but the last ``periods`` rows and score the fit.

    The model is trained on ``data[:-periods]`` and then asked to predict at
    every timestamp present in ``data``, so each prediction lines up with the
    real observation it is compared to.

    Fixes over the previous version:
      * predictions were generated with ``make_future_dataframe`` and compared
        with the real rows by position; if the real dates are not perfectly
        regular at ``freq``, the hold-out predictions belonged to different
        dates than the observations they were scored against;
      * ``kind`` was ignored and the plot was titled "car sales" / "Sales";
      * nothing was returned when ``plot=True`` although callers assign the
        result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with columns ``ds`` (dates) and ``y`` (values).
    periods : int
        Number of trailing rows held out from training.
    kind : str
        Name of the series, used in the plot title.
    freq : str, default "W"
        Kept for backward compatibility; no longer used because predictions
        are made at the actual dates in ``data``.
    plot : bool, default False
        When True, plot prediction, trend and real values and print the metrics.
    yearly_seasonality : bool, default False
        Passed to ``Prophet``.
    cps : float, default 1
        Passed to ``Prophet`` as ``changepoint_prior_scale``.

    Returns
    -------
    dict
        ``{"CPS", "R2", "MSE", "MAE"}``. The metrics are computed over the whole
        series (training rows plus hold-out), each rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``periods`` is not strictly between 0 and ``len(data)``.
    """
    if not 0 < periods < len(data):
        raise ValueError(
            f"periods must be between 1 and {len(data) - 1}, got {periods}"
        )

    # Chronological order so the last `periods` rows really are the hold-out and
    # so rows line up with Prophet's output by position.
    # NOTE(review): Prophet is expected to return its forecast sorted by `ds`
    # with a fresh index; confirm against the installed version.
    data = data.sort_values("ds").reset_index(drop=True)

    model = Prophet(yearly_seasonality=yearly_seasonality, changepoint_prior_scale=cps)
    model.fit(data[:-periods])

    # Predict at the observed timestamps (history + hold-out).
    forecast = model.predict(data[["ds"]])

    # NOTE: these scores include the training rows, so they mostly reflect the
    # in-sample fit rather than hold-out accuracy.
    r2 = round(r2_score(data["y"], forecast["yhat"]), 3)
    mse = round(mean_squared_error(data["y"], forecast["yhat"]), 3)
    mae = round(mean_absolute_error(data["y"], forecast["yhat"]), 3)
    metrics = {"CPS": cps, "R2": r2, "MSE": mse, "MAE": mae}

    if plot:
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=forecast["ds"],
                y=forecast["yhat"],
                mode="lines",
                name="Predict Values",
            )
        )
        fig.add_trace(
            go.Scatter(
                x=forecast["ds"], y=forecast["trend"], mode="lines", name="Trend"
            )
        )
        fig.add_trace(
            go.Scatter(x=data["ds"], y=data["y"], mode="lines", name="Real Values")
        )
        fig.update_layout(
            title_text=f"Comparing the real x predicted values for {kind} price",
            yaxis_title="Price",
            xaxis_title="Date",
        )

        fig.show()
        print("R2: ", r2)
        print("MSE: ", mse)
        print("MAE: ", mae)

    return metrics




### Oil

In [None]:
from joblib import Parallel, delayed

# Candidate changepoint_prior_scale values: 50 points from 0.1 to 10, rounded to 1 decimal.
cps_options = [round(x, 1) for x in np.linspace(start=0.1, stop=10, num=50)]

# Hold out the last 50 rows of the oil series.
prediction_size = 50
data_fb = format_to_prophet(price_oil.Date, price_oil.Price)

# Fit one Prophet model per candidate, in parallel on all cores.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(train_predict)(
        data=data_fb,
        periods=prediction_size,
        freq="D",
        kind="Oil",
        plot=False,
        cps=i,
        yearly_seasonality=True,
    )
    for i in cps_options
)

# Keep the single best candidate: highest R2, ties broken by lowest MSE.
# Sorting replaces the builtin max()/min() + isin() filter, which is unreliable
# when a score is NaN and could leave several tied rows; sort_values puts NaN
# scores last.
results = (
    pd.DataFrame(results)
    .sort_values(["R2", "MSE"], ascending=[False, True])
    .head(1)
)
results

In [None]:
# Refit on the oil series with the changepoint_prior_scale kept in `results`
# by the search cell, this time with plot=True to draw the fit and print the metrics.
forecast = train_predict(
    data=data_fb,
    periods=prediction_size,
    freq="D",
    kind="Oil",
    plot=True,
    cps=results.CPS.iloc[0],
    yearly_seasonality=True,
)

### Avocado

In [None]:
# Hold out the last 10 rows of the avocado series.
# Reuses `cps_options`, `Parallel` and `delayed` from the oil search cell.
prediction_size = 10
data_fb = format_to_prophet(price_avd.Date, price_avd.Price)

# Fit one Prophet model per candidate, in parallel on all cores.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(train_predict)(
        data=data_fb,
        periods=prediction_size,
        freq="W",
        kind="Avocado",  # was "Oil": copy-paste leftover from the oil search
        plot=False,
        cps=i,
        yearly_seasonality=True,
    )
    for i in cps_options
)

# Keep the single best candidate: highest R2, ties broken by lowest MSE.
# Sorting replaces the builtin max()/min() + isin() filter, which is unreliable
# when a score is NaN and could leave several tied rows; sort_values puts NaN
# scores last.
results = (
    pd.DataFrame(results)
    .sort_values(["R2", "MSE"], ascending=[False, True])
    .head(1)
)
results

In [None]:
# Refit on the avocado series with the changepoint_prior_scale kept in `results`
# by the search cell, this time with plot=True to draw the fit and print the metrics.
forecast = train_predict(
    data=data_fb,
    periods=prediction_size,
    freq="W",
    kind="Avocado",
    plot=True,
    cps=results.CPS.iloc[0],
    yearly_seasonality=True,
)