In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then list them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predict Trends using Google Searches 

<img src="https://camo.githubusercontent.com/f8b517246281add898287d629b4cbda295686248/68747470733a2f2f676f6f676c65646973636f766572792e636f6d2f77702d636f6e74656e742f75706c6f6164732f676f6f676c652d73686f7070696e672e6a7067">

## Libraries used to implement the solution
List of the resources we use to design this solution:

- pandas: https://pandas.pydata.org/
- pytrends: https://pypi.org/project/pytrends/
- matplotlib: https://matplotlib.org/
- statsmodels: https://www.statsmodels.org/stable/index.html
- numpy: https://numpy.org/

In [None]:
!pip install pytrends

In [None]:
from pytrends.request import TrendReq
from pylab import rcParams
from ipywidgets import interact, interactive, fixed, interact_manual

import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt

import ipywidgets as widgets
import pandas as pd
import statsmodels.api as sm
import matplotlib
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns

# Apply the seaborn theme first: sns.set() rewrites matplotlib's rcParams
# (font.size included), so an rcParams tweak made before it is silently lost.
sns.set(style="darkgrid")
plt.rcParams.update({'font.size': 9})

# Shared Google Trends client used for every request in this notebook.
pytrend = TrendReq()

In [None]:
# Search terms to request from Google Trends, grouped under a sample name.
test_samples = {
    'sample': ["peanut butter", "pizza", "cookie"],
}

def get_google_trends_data(list_product, time_start, time_end, state=None, country='US'):
    """Request interest-over-time data for a list of keywords via the shared ``pytrend`` client.

    Args:
        list_product: Keywords forwarded to the client as ``kw_list``.
        time_start: Start of the window; joined with ``time_end`` by a single
            space to form the ``timeframe`` value.
        time_end: End of the window.
        state: Optional region code. When truthy, the geo value becomes
            ``"<country>-<state>"``; otherwise only ``country`` is used.
        country: Country code used as (the prefix of) the geo value.

    Returns:
        Whatever ``pytrend.interest_over_time()`` returns (the rest of the
        notebook treats it as a pandas DataFrame).
    """
    geo_code = f'{country}-{state}' if state else f'{country}'
    pytrend.build_payload(
        kw_list=list_product,
        geo=geo_code,
        cat=0,
        timeframe=f'{time_start} {time_end}',
    )
    return pytrend.interest_over_time()

# Keyword list reused by later cells of the notebook.
list_product= test_samples['sample']

## Downloading dataset

In [None]:
# Download the interest-over-time table for the sample keywords; with no state
# given, the request uses the function's default country ('US').
dataset = get_google_trends_data(
    list_product=list_product,
    time_start='2014-01-01',
    time_end='2020-08-01',
)

## Visualizing Google Trends dataset

In [None]:
def export_data_csv(dataset, save_output):
    """Write ``dataset`` as comma-separated text to ``save_output``.

    Args:
        dataset: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        save_output: Destination handed to ``to_csv`` (path or writable buffer).
    """
    dataset.to_csv(path_or_buf=save_output, sep=",")
    
# Drop the 'isPartial' column without mutating in place: unlike `del`, this
# stays safe when the cell is re-run or when the column is already absent.
dataset = dataset.drop(columns=['isPartial'], errors='ignore')
dataset.style.background_gradient(cmap='Greens')

## Describe dataset content

In [None]:
# Summary statistics per column, rendered with a green background gradient.
describe = dataset.describe()
describe.style.background_gradient(cmap='Greens')

In [None]:
def show_point_compare(df):
    """Plot every column of ``df`` against its index as a dashed line on one figure.

    Args:
        df: DataFrame-like object; each column is drawn as one series and the
            column labels are used for the legend.
    """
    plt.figure(figsize=(15, 6))
    for col in df.columns:
        plt.plot(df.index, df[col], '--')

    plt.title("Search")
    plt.legend(df.columns)
    plt.xlabel("data")
    plt.ylabel("frequency")
    # The first argument of grid() is the on/off flag, not a line-format string;
    # the former "b--" only worked because a non-empty string is truthy.
    plt.grid(True)
    plt.show()

## Total search frequency

In [None]:
# Plot every column of the downloaded dataset over its whole index.
show_point_compare(dataset)

## Total search frequency by year

In [None]:
def show_line_compare(df):
    """Plot every column of ``df`` against its index as a solid line on one figure.

    Args:
        df: DataFrame-like object; each column is drawn as one series and the
            column labels are used for the legend.
    """
    plt.figure(figsize=(15, 6))
    for col in df.columns:
        plt.plot(df.index, df[col])

    plt.title("Search")
    plt.legend(df.columns)
    plt.xlabel("data")
    plt.ylabel("frequency")
    # Request the grid explicitly: a bare grid() call *toggles* visibility, which
    # switches the grid off when the active style (seaborn darkgrid) enables it.
    plt.grid(True)
    plt.show()
    
def get_media_year(ano):
    """Plot the rows of the global ``dataset`` that fall inside one period.

    Args:
        ano: Date string used to select rows by label (the notebook passes
            years such as "2018"). Assumes ``dataset`` has a DatetimeIndex.
    """
    # Select rows with .loc: `dataset["2018"]` is treated as a column lookup by
    # pandas >= 2.0 and raises KeyError there.
    y_index = dataset.loc[ano]
    show_line_compare(y_index)

In [None]:
# Search series restricted to 2018.
get_media_year("2018")

In [None]:
# Search series restricted to 2019.
get_media_year("2019")

In [None]:
# Search series restricted to 2020.
get_media_year("2020")

In [None]:
def frequency_total(ano=None):
    """Draw a donut chart with each column's share of the summed search values.

    Args:
        ano: Optional date string (e.g. "2018") used to restrict the rows of the
            global ``dataset`` before summing. When falsy, all rows are used.
            Assumes ``dataset`` has a DatetimeIndex.
    """
    plt.figure(figsize=(15, 6))
    if ano:
        # Row selection must go through .loc; `dataset["2018"]` is a column
        # lookup in pandas >= 2.0 and raises KeyError.
        produtos_sum = dataset.loc[ano].sum()
        title = f"Product by search-{ano}"
    else:
        produtos_sum = dataset.sum()
        title = "Product by search"

    # Label the wedges from the summed Series itself so labels always line up
    # with the values. The drop shadow is drawn only for the per-period chart,
    # exactly as the two original branches did.
    plt.pie(produtos_sum, labels=produtos_sum.index,
            autopct='%1.1f%%', startangle=90, pctdistance=0.85,
            shadow=bool(ano))

    # A white disc over the centre turns the pie into a donut.
    centre_circle = plt.Circle((0, 0), 0.65, fc='white')
    plt.gca().add_artist(centre_circle)

    plt.axis('equal')
    plt.tight_layout()
    plt.title(title)
    plt.show()

## Comparing search by items

In [None]:
# Share of the summed search values per column for 2017.
frequency_total("2017")

In [None]:
# Share of the summed search values per column for 2018.
frequency_total("2018")

In [None]:
# Share of the summed search values per column for 2020.
frequency_total("2020")

## Tendency using Sarimax Model:
By definition, tendency is what makes someone follow a certain path or act in a certain way; predisposition, propensity.

To predict future drops or increases in searches for a product, we use the SARIMAX model. SARIMAX is a statistical model widely used for time series, which is the kind of data we have here. It has been applied to many problems; for example, it is commonly used in financial markets to forecast inflation.

If you want to know more about SARIMAX, we recommend this scientific article: https://www.researchgate.net/publication/313251716_Modelling_the_demand_for_cement_The_case_of_Poland_and_Spain

About the Python statsmodels library:
statsmodels is a Python module that provides classes and functions for estimating many different statistical models, as well as for performing statistical tests and exploring statistical data. An extensive list of outcome statistics is available for each estimator. The results are tested against existing statistical packages to ensure they are correct.

https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_sarimax_stata.html

## Sarimax Model Training

The `period` parameter is the number of observations in one seasonal cycle (the values below assume daily observations):

- period = 365, for yearly trends
- period = 30, for monthly trends
- period = 7, for weekly trends

In [None]:
# Number of observations per seasonal cycle handed to seasonal_decompose.
# NOTE(review): the series is resampled with 'MS' (monthly) further down, so a
# period of 7 may not correspond to a weekly cycle here — confirm the data granularity.
frequencia = 7

for col in dataset.columns:
    # Decompose one keyword at a time, ignoring missing values.
    result = sm.tsa.seasonal_decompose(dataset[col].dropna(), period=frequencia)
    fig = result.plot()
    fig.set_size_inches(14, 9)
    plt.show()

In [None]:
# Monthly mean of every series (month-start bins); the models are fit on this frame.
y = dataset.resample('MS').mean()

# Orders shared by all per-column models.
parametros = {
    "order": (1, 1, 1),
    "seas_order": (1, 1, 0, 12),
}

# One fitted result per column of `y`, kept in column order.
predict_model = []
for col in y.columns:
    print(f"Training SARIMAX model ... product: {col}")
    modelo = sm.tsa.statespace.SARIMAX(
        y[col],
        order=parametros["order"],
        seasonal_order=parametros["seas_order"],
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    results = modelo.fit()

    # Show the second table of the fit summary and the diagnostic plots.
    print(results.summary().tables[1])
    results.plot_diagnostics(figsize=(16, 8))
    predict_model.append(results)

## Testing Sarimax Model

In [None]:
# Start date passed to get_prediction() when the fitted models are evaluated below.
predict_after = '2020-01-01'

def bar_graph_porcent(predict_i):
    """Draw a bar chart of the step-to-step percentage change of ``predict_i['upper']``.

    Bars are green for increases (or no change) and red for decreases. The
    figure is created but not shown; the caller is expected to call
    ``plt.show()``.

    Args:
        predict_i: DataFrame-like object with an ``'upper'`` column; its index
            values (from the second one on) are used as tick labels.
    """
    upper = np.array(predict_i['upper'])

    # Change of each value relative to the one before it, in percent.
    variation = [
        (current - previous) * 100 / previous
        for previous, current in zip(upper[:-1], upper[1:])
    ]
    data_var = list(predict_i.index[1:])
    colors_var = ['red' if pct < 0 else 'green' for pct in variation]

    ind = np.arange(len(variation))
    width = 0.60

    fig, ax = plt.subplots(figsize=(12, 7))
    # Centre each bar on its tick; the former `ind - width/2` offset (a leftover
    # of grouped-bar layouts) put every tick at the right edge of its bar.
    rects1 = ax.bar(ind, variation, width)
    ax.axhline(y=0, color='black', linestyle='-')

    for rect, pct, color in zip(rects1, variation, colors_var):
        rect.set_color(color)
        ax.text(rect.get_x() - .03, rect.get_height() + .5, "%.2f" % pct + "%",
                fontsize=15, color='black')

    ax.set_ylabel('Porcent predict - comparision by month')
    ax.set_xticks(ind)
    ax.set_xticklabels(data_var, rotation=90)
    # No ax.legend() here: none of the artists carries a label, so the call
    # could only emit a "no artists with labels" warning.
    
    
# Pair each fitted model with the column it was trained on (the models were
# appended while iterating over y.columns).
for model_i, col in zip(predict_model, y.columns):
    # Predictions from `predict_after` onwards (dynamic=False) and their confidence band.
    predict_m = model_i.get_prediction(start=pd.to_datetime(predict_after), dynamic=False)
    predict_i = predict_m.conf_int()

    # Draw on an explicit axes. Styling through pyplot's "current axes" went
    # wrong before: bar_graph_porcent() opens a new figure first, so the grid,
    # legend and title ended up on the bar chart instead of the forecast plot.
    fig, test_model = plt.subplots(figsize=(20, 7))
    y['2014':][col].plot(ax=test_model, label='real', color='green')
    predict_m.predicted_mean.plot(ax=test_model, color='red', label='predict', alpha=.9)
    test_model.fill_between(predict_i.index, predict_i.iloc[:, 0], predict_i.iloc[:, 1],
                            color='k', alpha=.2)

    test_model.set_xlabel('data')
    test_model.set_ylabel('frequecy')
    test_model.grid(True)
    test_model.legend()
    test_model.set_title(f"Predict model: {col}")

    # Rename the bounds to the column names bar_graph_porcent() expects.
    predict_i.columns = ['low', 'upper']
    print(predict_i)
    bar_graph_porcent(predict_i)
    plt.show()