In [7]:
from pprint import pprint
from pathlib import Path
import requests
import json
import pandas as pd
import csv
from dotenv import load_dotenv
import os
import hvplot.pandas
import calendar
import datetime
import bokeh

In [8]:
# read in the quarterly corn stocks (inventory) data .csv
# NOTE: the path is relative to the notebook's working directory
inventorycsv = Path('../data/clean_data/corn_stocks_seasonal.csv')

# 'date' is intentionally left as 'YYYY-MM-DD' text; the quarterly breakdown
# further down parses it explicitly. The previous parse_dates=True /
# infer_datetime_format=True arguments were dropped: without an index column
# parse_dates=True has nothing to parse, and infer_datetime_format is
# deprecated in recent pandas releases. ',' is already the default delimiter.
inventory_df = pd.read_csv(inventorycsv)

FileNotFoundError: [Errno 2] File b'..\\data\\clean_data\\corn_stocks_seasonal.csv' does not exist: b'..\\data\\clean_data\\corn_stocks_seasonal.csv'

In [3]:
# preview the last five rows to check the column names and the values they hold
inventory_df.tail()

Unnamed: 0,date,stock_report
78,2001-12-01,1040485000
79,2000-03-01,689660000
80,2000-06-01,437336000
81,2000-09-01,230059000
82,2000-12-01,966809000


In [4]:
# count the nulls per column to decide whether any rows or empty columns need to be dropped
inventory_df.isnull().sum()

# result: no null data, so nothing has to be dropped

date            0
stock_report    0
dtype: int64

In [5]:
# Bar chart of the reported inventory level for every stock report in the data set
base_inv_plot = inventory_df.hvplot(
    kind='bar',
    x='date',
    y='stock_report',
    title='Corn Inventory Volumes over Time',
    xlabel='Date',
    ylabel='Inventory, Bu',
    xticks=25,
    yformatter='%.0f',
    rot=45,
)
base_inv_plot

In [6]:
# build inventory data exponential moving averages (1 year and 5 year)
# spans are counted in rows: 4 rows per year, 20 rows per 5 years
window_1year = 4
window_5year = 20


def _inventory_ema(span):
    """Return a copy of inventory_df whose 'stock_report' column is its EMA.

    Only the numeric 'stock_report' column is smoothed. Calling .ewm().mean()
    on the whole frame also hits the text 'date' column, which pandas either
    drops or rejects depending on the version, so the 'date' values needed
    for the x axis would be lost.
    """
    # NOTE(review): the EMA follows the frame's current row order -- confirm
    # the rows are in chronological order before reading trends off the lines.
    smoothed = inventory_df['stock_report'].ewm(span=span).mean()
    return inventory_df.assign(stock_report=smoothed)


# plot options shared by both moving-average lines
ema_plot_opts = dict(
    title='Corn Inventory Volumes over Time',
    x='date',
    xlabel='Date',
    xticks=25,
    y='stock_report',
    ylabel='Inventory, Bu',
    yformatter='%.0f',
    rot=45,
    legend=True,
)

ewma1_plot = _inventory_ema(window_1year).hvplot(color='lightgreen', **ema_plot_opts)
ewma5_plot = _inventory_ema(window_5year).hvplot(**ema_plot_opts)

# overlay the raw bars with both moving averages
base_inv_plot * ewma1_plot * ewma5_plot

Not a great measure: a moving average across all reports mixes the different quarters together. It will be statistically more relevant to look at each quarter separately and compare it with the same quarter in previous years.

In [6]:
# build 4 dataframes (one per quarterly report month) so each report can be
# compared with the same report in previous years

# Derive the report month name and year with a single vectorised parse.
# 'date' itself is left untouched; only the helper columns are added.
# dt.month_name() returns English names by default, which is what the
# filters below compare against.
report_dates = pd.to_datetime(inventory_df['date'], format='%Y-%m-%d')
inventory_df['month'] = report_dates.dt.month_name()
inventory_df['year'] = report_dates.dt.year


def _report_month_frame(month_name):
    """Return the rows of inventory_df for one report month, sorted by 'date' ascending."""
    in_month = inventory_df['month'] == month_name
    return inventory_df[in_month].sort_values('date', ascending=True)


def _report_month_plot(month_df, month_name):
    """Return the line plot of one report month's inventory through time."""
    return month_df.hvplot(
        title=f'{month_name} Inventory Report Through Time',
        x='date',
        xlabel='Date',
        y='stock_report',
        ylabel='Inventory, Bu',
        xformatter=bokeh.models.formatters.DatetimeTickFormatter(),
        yformatter='%.0f',
        rot=45,
        attr_labels=True,
    )


sept_df = _report_month_frame('September')
mar_df = _report_month_frame('March')
jun_df = _report_month_frame('June')
dec_df = _report_month_frame('December')

sept_plot = _report_month_plot(sept_df, 'September')

mar_plot = _report_month_plot(mar_df, 'March')

jun_plot = _report_month_plot(jun_df, 'June')

dec_plot = _report_month_plot(dec_df, 'December')

NameError: name 'inventory_df' is not defined

In [8]:
# calculate the moving averages for each season

# EMA spans, counted in rows of the seasonal frames; edit these two values to
# try other lengths.
# NOTE: with span=1 the smoothing factor alpha = 2 / (span + 1) equals 1, so
# the "1-year" EMA reproduces the reported values exactly.
window_1year = 1
window_5year = 5


def _seasonal_ema_plot(season_df, span, color):
    """Return the EMA line plot of 'stock_report' for one seasonal frame.

    Only the numeric 'stock_report' column is smoothed. Calling .ewm().mean()
    on the whole frame also hits the text 'date'/'month' columns, which
    pandas either drops or rejects depending on the version, so the 'date'
    values needed for the x axis would be lost.
    """
    smoothed = season_df.assign(
        stock_report=season_df['stock_report'].ewm(span=span).mean()
    )
    return smoothed.hvplot(
        color=color,
        x='date',
        xlabel='Date',
        y='stock_report',
        ylabel='Inventory, Bu',
        yformatter='%.0f',
        rot=45,
        legend=True,
    )


# March
mar_ewma1_plot = _seasonal_ema_plot(mar_df, window_1year, 'red')
mar_ewma5_plot = _seasonal_ema_plot(mar_df, window_5year, 'lightgreen')

# June
jun_ewma1_plot = _seasonal_ema_plot(jun_df, window_1year, 'red')
jun_ewma5_plot = _seasonal_ema_plot(jun_df, window_5year, 'lightgreen')

# September
sept_ewma1_plot = _seasonal_ema_plot(sept_df, window_1year, 'red')
sept_ewma5_plot = _seasonal_ema_plot(sept_df, window_5year, 'lightgreen')

# December
dec_ewma1_plot = _seasonal_ema_plot(dec_df, window_1year, 'red')
dec_ewma5_plot = _seasonal_ema_plot(dec_df, window_5year, 'lightgreen')


In [9]:
# March plot: the March reports overlaid with their two moving-average lines
mar_plot * mar_ewma1_plot * mar_ewma5_plot

In [10]:
# June plot: the June reports overlaid with their two moving-average lines
jun_plot * jun_ewma1_plot * jun_ewma5_plot

In [11]:
# September plot: the September reports overlaid with their two moving-average lines
sept_plot * sept_ewma1_plot * sept_ewma5_plot

In [12]:
# December plot: the December reports overlaid with their two moving-average lines
dec_plot * dec_ewma1_plot * dec_ewma5_plot

# Cleaning the Excel files for monthly production numbers

In [14]:
# list the directory which contains the raw WASDE reports
wasde_dir = Path('../data/raw_data/wasde_data')
wasde_data = sorted(os.listdir(wasde_dir))

# columns of the consolidated output
wasde_columns = ['date', 'beginning_stocks', 'production', 'ending_stocks']


def _extract_wasde_record(wasde):
    """Pull the date and three figures out of one WASDE workbook.

    The 'Page 12' sheet is read with default settings first; whether the cell
    at row 0 / column 2 is text decides which layout the workbook uses. The
    header rows and cell positions below are specific to each layout.

    Parameters
    ----------
    wasde : path-like
        Location of one WASDE Excel workbook.

    Returns
    -------
    dict
        Keys 'date' (ISO-8601 text), 'beginning_stocks', 'production' and
        'ending_stocks'.
    """
    sheet = 'Page 12'
    probe_df = pd.read_excel(wasde, sheet_name=sheet)

    if isinstance(probe_df.iloc[0, 2], str):
        # the data is in the older format (pre June 2012)
        # (the former skiprow=[0] argument is not a read_excel parameter, so
        # it is removed rather than renamed -- no rows are skipped here)
        wasde_df = pd.read_excel(wasde, sheet_name=sheet, header=[1, 31])
        if 'Unnamed' in str(wasde_df.columns[2][1]):
            # format 1 (pre September 2010): second header level sits on row 30
            wasde_df = pd.read_excel(wasde, sheet_name=sheet, header=[1, 30])
        # otherwise format 2 (September 2010-May 2012): header rows 1 and 31
        date_col = 2
        begin_row, prod_row, end_row = 7, 8, 17
    else:
        # the data is in the new format
        # (skipfooter is the current name of the former skip_footer argument)
        wasde_df = (
            pd.read_excel(wasde, sheet_name=sheet, header=[0, 29], skipfooter=2)
            .tail(17)
            .reset_index(drop=True)
        )
        date_col = 0
        begin_row, prod_row, end_row = 5, 6, 15

    # the top header level holds the report month, e.g. 'June 2012'
    report_date = wasde_df.columns[date_col][0]
    report_real_date = datetime.datetime.strptime(report_date, '%B %Y').isoformat()

    report_block = wasde_df[report_date]
    return {
        'date': report_real_date,
        'beginning_stocks': report_block.iloc[begin_row, 4],
        'production': report_block.iloc[prod_row, 4],
        'ending_stocks': report_block.iloc[end_row, 4],
    }


# loop through the excel files and collect one record per report
records = []
for data in wasde_data:
    # skip stray non-Excel files (e.g. .DS_Store, Thumbs.db) that read_excel cannot open
    if Path(data).suffix.lower() not in ('.xls', '.xlsx', '.xlsm'):
        continue
    records.append(_extract_wasde_record(wasde_dir / data))

# Build the frame once instead of appending row by row: DataFrame.append was
# removed in pandas 2.0, and re-sorting after every append was wasted work.
# ISO-8601 date strings sort chronologically, so newest reports come first.
clean_data = (
    pd.DataFrame(records, columns=wasde_columns)
    .sort_values('date', ascending=False)
    .set_index('date')
    .dropna()
)

# write the df to a new .csv file in the clean_data directory
df_writer = Path('../data/clean_data/monthly_wasde_reports.csv')
clean_data.to_csv(df_writer)

# Graph the cleaned production data

In [178]:
def wasde_monthly_ma(clean_data_path=None):
    """Plot WASDE monthly production with 1-year and 5-year EMAs.

    Parameters
    ----------
    clean_data_path : path-like, optional
        CSV with a 'date' and a 'production' column. Defaults to the
        consolidated monthly WASDE report file in the clean_data directory.

    Returns
    -------
    The overlay of the production line and its two exponential moving
    averages, with the legend hidden.
    """
    if clean_data_path is None:
        # lower-case 'data' matches the path the cleaning step writes to;
        # the old '../Data/...' spelling fails on case-sensitive filesystems
        clean_data_path = Path('../data/clean_data/monthly_wasde_reports.csv')

    # pull in the data from the consolidated wasde monthly reports
    clean_df = pd.read_csv(clean_data_path)

    # convert the date to datetimes and set as index
    clean_df['date'] = pd.to_datetime(clean_df['date'])
    # The cleaning step writes the file newest-first. An EMA must run
    # oldest -> newest, otherwise every point is weighted towards the future.
    clean_df = clean_df.set_index('date').sort_index()

    # scaled by 1/1000 to match the 'Billion Bu' axis label
    # NOTE(review): presumably the file stores million bushels -- confirm
    production = clean_df['production'] / 1000

    # create our base-line production plot
    prod_plot = production.hvplot(
        title='WASDE Monthly Predicted Production',
        ylabel='Production, Billion Bu',
        rot=45,
        legend='bottom_right',
        xticks=9,
    )

    # monthly data, so 1 year = 12 * 1 and 5 year = 12 * 5
    window1 = 12 * 1
    window5 = 12 * 5

    # create the exponential moving average data
    ewm1 = production.ewm(span=window1).mean().rename('EMA 1-year')
    ewm5 = production.ewm(span=window5).mean().rename('EMA 5-year')

    # create the ema plots
    ewm1_plot = ewm1.hvplot(
        title='WASDE Monthly Predicted Production',
        ylabel='Production, Billion Bu',
        line_dash='dashed',
        line_width=2.5,
        legend='bottom_right',
        color='darkorange',
    )
    ewm5_plot = ewm5.hvplot(
        title='WASDE Monthly Predicted Production',
        ylabel='Production, Billion Bu',
        color='magenta',
        line_dash='dashed',
        line_width=2.5,
        legend='bottom_right',
    )

    # return the combined plot of moving averages
    return (prod_plot * ewm1_plot * ewm5_plot).opts(frame_height=300, show_legend=False)


In [179]:
# render the production plot together with its 1-year and 5-year EMA overlays
wasde_monthly_ma()