# ทำนายผลผลิตข้าวโพด โดยใช้ผลผลิตปีก่อน ๆ ด้วย Prophet

https://facebook.github.io/prophet/docs/quick_start.html

In [1]:
import numpy as np
import pandas as pd
from fbprophet import Prophet

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

from fbprophet.plot import plot_plotly, plot_components_plotly

# Initialise plotly.offline's notebook mode before any figure is drawn.
py.init_notebook_mode(connected=True)

Get data

In [2]:
# Load the corn-yield records. Later cells read the `area`, `date` and
# `value` columns of this frame.
yield_file = 'OAE-process/OAE-ผลผลิตข้าวโพดทั้งหมด.xlsx'
df = pd.read_excel(yield_file)

# Total per area, largest first. `numeric_only=True` restricts the sum to
# numeric columns explicitly: recent pandas versions no longer drop
# non-summable columns (such as a datetime `date` column) silently and may
# raise instead.
areas = (
    df.groupby(['area'])
    .sum(numeric_only=True)
    .sort_values(by='value', ascending=False)
)

In [3]:
# Fraction of the overall `value` total covered by the first 30 rows of
# `areas` (which is sorted by `value`, largest first).
areas['value'].iloc[:30].sum() / areas['value'].sum()

0.9948643526098973

In [4]:
def data_in_area(df, area):
    """Return one area's observations in Prophet's ``ds``/``y`` layout.

    Args:
        df: DataFrame with ``date``, ``area`` and ``value`` columns.
        area: Value of the ``area`` column to keep.

    Returns:
        A new DataFrame with columns ``ds`` (from ``date``) and ``y`` (from
        ``value``), ordered by date descending and indexed 0..n-1. The input
        frame is not modified.
    """
    newest_first = df.sort_values(by='date', ascending=False)
    selected = newest_first.loc[newest_first.area == area, ['date', 'value']]
    selected = selected.reset_index(drop=True)
    return selected.rename(columns={'date': 'ds', 'value': 'y'})

In [5]:
def resample_year2month(df):
    """Upsample a frame keyed by a ``date`` column to month-start frequency.

    The ``date`` column becomes the index (named ``ds``), the index is
    expanded to every month start between its first and last entry, and
    missing values are forward-filled from the closest earlier row.

    Args:
        df: DataFrame with a datetime-like ``date`` column (presumably one
            row per year, judging by the function name).

    Returns:
        A new DataFrame indexed by ``ds`` at ``'MS'`` frequency. The input
        frame is not modified.
    """
    monthly = df.set_index('date').rename_axis('ds').resample('MS').asfreq()
    # `fillna(method='ffill')` is deprecated in pandas; `ffill()` is the
    # direct equivalent.
    return monthly.ffill()

In [6]:
# Locations of the auxiliary spreadsheets.
cost_file = 'OAE-process/OAE-ต้นทุนรวมต่อไร่ข้าวโพดเลี้ยงสัตว์.xlsx'
precipitation_file = 'OAE-process/OAE-ปริมาณน้ำฝน-ฝนตก.xlsx'
rainday_file = 'OAE-process/OAE-ปริมาณน้ำฝน-จำนวนวันฝนตก.xlsx'
price_file = 'OAE-process/OAE-ราคาข้าวโพดเลี้ยงสัตว์.xlsx'

# Raw frames, one per spreadsheet.
df_cost = pd.read_excel(cost_file)
df_precipitation = pd.read_excel(precipitation_file)
df_rainday = pd.read_excel(rainday_file)
df_price = pd.read_excel(price_file)

# Lookup tables used by add_features, each indexed by date.
# Cost is resampled to month starts and forward-filled.
df_cost_spl = resample_year2month(df_cost)
# Both rainfall tables keep only the rows whose area is 'Nan'.
df_precipitation_nan = data_in_area(df_precipitation, 'Nan').set_index('ds')
df_rainday_nan = data_in_area(df_rainday, 'Nan').set_index('ds')
# Price is used as loaded, keyed by its `date` column.
df_price_spl = df_price.set_index('date')

Util functions

In [7]:
def add_features(df):
    """Attach the external regressor columns to a frame that has a ``ds`` column.

    Looks up, by ``ds``, the module-level tables ``df_cost_spl``,
    ``df_precipitation_nan``, ``df_rainday_nan`` and ``df_price_spl`` and adds
    them as the columns ``cost``, ``precipitation``, ``rainday`` and
    ``price`` (in that order).

    Args:
        df: DataFrame with a ``ds`` column whose values match the indexes of
            the lookup tables.

    Returns:
        A new DataFrame sorted by ``ds`` descending. Missing values are
        back-filled, which after the descending sort means each gap takes
        the value of the nearest earlier date that has one.
    """
    regressors = {
        'cost': df_cost_spl['value'],
        'precipitation': df_precipitation_nan['y'],
        'rainday': df_rainday_nan['y'],
        'price': df_price_spl['value'],
    }
    for column, series in regressors.items():
        # Rename before joining so every lookup lands under its final name.
        df = df.join(series.rename(column), on='ds')

    # `fillna(method='bfill')` is deprecated in pandas; `bfill()` is the
    # direct equivalent.
    return df.sort_values(by='ds', ascending=False).bfill()

def is_harvest_season(ds):
    """Return True when ``ds`` falls in October, November or December.

    Args:
        ds: Anything ``pd.to_datetime`` can convert to a single timestamp.
    """
    month = pd.to_datetime(ds).month
    return 10 <= month <= 12

# Tuning using Grid Search

In [8]:
def one_province_predict(Province, **kwags):
    """Fit Prophet on one province and score it on its 12 newest observations.

    The 12 most recent rows of the province's series are held out, a Prophet
    model with the extra regressors and a conditional ``on_season``
    seasonality is fitted on the remaining rows, and the 12-step forecast
    (month-start frequency) is compared with the held-out values.

    Args:
        Province: Value of the ``area`` column of the module-level ``df``
            selecting the series to model.
        **kwags: Hyper-parameters. Required keys:
            ``changepoint_prior_scale``, ``seasonality_prior_scale``,
            ``holidays_prior_scale``, ``fourier_order`` and ``prior_scale``
            (the last is applied to every added regressor).

    Returns:
        Tuple ``(rmse, mae)`` of the forecast on the held-out rows.

    Raises:
        KeyError: If a required hyper-parameter is missing from ``kwags``.
        AssertionError: If the forecast dates do not line up with the
            held-out dates.
    """
    horizon = 12  # number of newest observations held out and forecast

    # data_in_area returns rows newest first, so the head is the hold-out.
    df_yield = data_in_area(df, Province)
    df_train, df_test = df_yield[horizon:], df_yield[:horizon]
    df_train = add_features(df_train)
    df_test = add_features(df_test)
    df_train['on_season'] = df_train['ds'].apply(is_harvest_season)

    model = Prophet(weekly_seasonality=False,
                    daily_seasonality=False,
                    changepoint_prior_scale=kwags['changepoint_prior_scale'],
                    seasonality_prior_scale=kwags['seasonality_prior_scale'],
                    holidays_prior_scale=kwags['holidays_prior_scale'])
    # Extra seasonality that is only active on rows flagged `on_season`.
    model.add_seasonality(name='monthly_on_season',
                          period=30.5,
                          fourier_order=kwags['fourier_order'],
                          condition_name='on_season')

    for col in ['cost', 'precipitation', 'rainday', 'price']:
        model.add_regressor(col,
                            prior_scale=kwags['prior_scale'],
                            mode='multiplicative')

    model.fit(df_train)

    # The future frame needs the same regressor and condition columns as
    # the training frame.
    future = model.make_future_dataframe(periods=horizon, freq="MS")
    future = add_features(future)
    future['on_season'] = future['ds'].apply(is_harvest_season)

    forecast = model.predict(future)
    # Clamp negative forecasts to zero. Assign the result back explicitly:
    # a chained `forecast.yhat.clip(..., inplace=True)` is not guaranteed to
    # modify `forecast` (it is a no-op under pandas copy-on-write).
    forecast['yhat'] = forecast['yhat'].clip(lower=0)
    # Newest first, so the head lines up with df_test.
    forecast = forecast.sort_values(by='ds', ascending=False).reset_index(drop=True)
    predicted = forecast[:horizon]

    # Sanity check that we are evaluating the forecast at the same dates.
    assert np.all(predicted['ds'].to_numpy() == df_test['ds'].to_numpy()), \
        'forecast dates do not match the held-out dates'

    errors = predicted['yhat'].to_numpy() - df_test['y'].to_numpy()
    # Root mean squared error and mean absolute error on the hold-out.
    rmse = np.sqrt(np.mean(errors ** 2))
    mae = np.mean(np.abs(errors))
    return rmse, mae

In [9]:
def all_provinces_predict(**kwags):
    """Score the first 30 entries of ``areas.index`` with one parameter set.

    Args:
        **kwags: Hyper-parameters forwarded unchanged to
            ``one_province_predict``.

    Returns:
        Tuple ``(rmses, maes)`` of two lists, one element per province, in
        the order of ``areas.index``.
    """
    scores = [
        one_province_predict(Province=name, **kwags)
        for name in areas.index[:30]
    ]
    rmses = [rmse for rmse, _ in scores]
    maes = [mae for _, mae in scores]
    return rmses, maes

In [10]:
# Python
import itertools
import numpy as np
import pandas as pd

# Candidate values per hyper-parameter. The keys are exactly the keyword
# arguments that one_province_predict reads from its **kwags.
param_grid = {
    'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
    'holidays_prior_scale': [5, 10, 20],
    'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
    'fourier_order': [3, 5, 7],
    'prior_scale': [0.1, 1, 5]
}

# Cartesian product of the grid: one dict per parameter combination.
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []  # RMSE summed over provinces, one entry per parameter set
province_rmses = []  # per-province RMSE lists, one list per parameter set
maes = []  # MAE summed over provinces, one entry per parameter set
province_maes = []  # per-province MAE lists, one list per parameter set

# Evaluate every parameter set on all provinces. The scoring is the hold-out
# done inside one_province_predict (via all_provinces_predict), not Prophet's
# cross-validation utilities.
for params in all_params:
    # `%time` is an IPython line magic, so this cell only runs in IPython/Jupyter.
    %time province_rmse, province_mae = all_provinces_predict(**params)
    province_rmses.append(province_rmse)
    rmses.append(sum(province_rmse))
    province_maes.append(province_mae)
    maes.append(sum(province_mae))
    # The progress line prints the mean over provinces, whereas the lists
    # above store the sum over provinces.
    print(f'Param={params}, \t RMSE={sum(province_rmse)/len(province_rmse):.3f}, MAE={sum(province_mae)/len(province_mae):.3f}')

# Collect all parameter sets and their summed scores in one table.
# No best row is selected here; the table is only printed.
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
tuning_results['mae'] = maes
print(tuning_results)

CPU times: user 46.5 s, sys: 788 ms, total: 47.3 s
Wall time: 47.3 s
Param={'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 5, 'seasonality_prior_scale': 0.01, 'fourier_order': 3, 'prior_scale': 0.1}, 	 RMSE=16673.077, MAE=13141.966
CPU times: user 1min 24s, sys: 1.2 s, total: 1min 25s
Wall time: 1min 25s
Param={'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 5, 'seasonality_prior_scale': 0.01, 'fourier_order': 3, 'prior_scale': 1}, 	 RMSE=18908.053, MAE=11822.249
CPU times: user 1min 46s, sys: 752 ms, total: 1min 46s
Wall time: 1min 46s
Param={'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 5, 'seasonality_prior_scale': 0.01, 'fourier_order': 3, 'prior_scale': 5}, 	 RMSE=19299.130, MAE=12022.899
CPU times: user 1min 7s, sys: 712 ms, total: 1min 7s
Wall time: 1min 7s
Param={'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 5, 'seasonality_prior_scale': 0.01, 'fourier_order': 5, 'prior_scale': 0.1}, 	 RMSE=16676.669, MAE=13094.330
CPU times: user 1m

In [12]:
# Persist the grid-search table (parameters plus summed RMSE/MAE) to a CSV file.
tuning_results.to_csv('corn_yield_prophet-tuning_results.csv')