## CSX Weekly Freight Reports

In [1]:
from tabula import read_pdf
import pandas as pd
import altair as alt
import numpy as np

import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--headless")
    
def fetchVolumeReportUrls(page=1):
    baseUrl = 'https://www.csx.com/index.cfm/investors/volume-trends/?page={}'
    
    try:
        driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver',
                                  options=chrome_options)
    except WebDriverException:
        driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver',
                          options=chrome_options)
        
    driver.get(baseUrl.format(page))
        
    pageString = driver.page_source
    
    driver.close()
    
    # <a href="http://csx.gcs-web.com/static-files/02ae9432-408a-4537-97c2-d0e0d9716e1b" target="_blank" class="new_window">2019 Week 27 AAR</a>
    return re.findall("\<a href=\"([^\"]*)\" target=\"_blank\" class=\"new_window\"\>([^<]*)", pageString)

def parseReport(urlAndWeek):
    try:
        url, weekAndYear = urlAndWeek
    
        # '2013 Week 29'
        year, week = weekAndYear.split(' Week ')
        
    except ValueError:
        print(urlAndWeek)
        return None
    
    #            WEEK   28                QUARTER TO DATE           YEAR TO DATE
    baseCols = ['Cargo', 'Week {0} {1}', 'Week {0} {2}', 'Week {0} pct-change',
                'Week {0} {1} QTD', 'Week {0} {2} QTD', 'Week {0} QTD pct-change',
                'Week {0} {1} YTD', 'Week {0} {2} YTD', 'Week {0} YTD pct-change']

    cols = [c.format(week.split(' ')[0], int(year), int(year)-1) for c in baseCols]
    
    try:
        tmp = read_pdf(url, pages='all', pandas_options={ 'names': cols })
        df = pd.concat(tmp) if isinstance(tmp, list) else tmp
        return df.iloc[1:, :].set_index('Cargo').applymap(lambda v: float(re.sub('[^0-9.]', '', v)))
    except AttributeError:
        print(urlAndWeek)
        return None
    except ValueError:
        print(urlAndWeek)
        return None
        
reportUrls = [fetchVolumeReportUrls(page=p) for p in range(1, 9)]

flatten = lambda l: [item for sublist in l for item in sublist]

reportUrls = flatten(reportUrls)
    
#parseReport(('http://csx.gcs-web.com/static-files/f2baf981-9658-41f8-9ec0-400e9fdbde11', '2013 Week 29')).head(30)

#parseReport(('http://csx.gcs-web.com/static-files/2d4f737e-1314-46aa-9e8c-a29b40be2a2b', '2019 Week 49 AAR')).head(30)

In [2]:
%%time

dfs = map(parseReport, reportUrls)

merged = pd.concat(dfs, axis=1, sort=False).T

merged.head()

The output file is empty.


('http://csx.gcs-web.com/static-files/942ad9b6-158b-4db6-a5fd-9dea04c0373c', '2016 Week 25 AAR')
CPU times: user 8.12 s, sys: 2.81 s, total: 10.9 s
Wall time: 12min 17s


Unnamed: 0,Grain,Grain Mill Products,Farm Products (excl. Grain),Food Products,Chemicals,Petroleum & Petroleum Products,Primary Metal Products,Primary Forest Products,Lumber & Wood Products,Pulp & Paper Products,...,Non‐Metallic Minerals (incl. Phosphates),Primary Forest Products*,"Farm Products, Ex. Grain",Food & Kindred Products,Petroleum Products,Metals & Products,"Pulp, Paper & Allied Products",Phosphate (Phospate Rock),Automotive (Motor Vehicles & Equip.),"Military, Machinery & Transportation Equipment"
Week 5 2020,3443.0,2255.0,202.0,2191.0,9634.0,3886.0,2828.0,649.0,1321.0,3305.0,...,,,,,,,,,,
Week 5 2019,2603.0,1723.0,205.0,1827.0,8744.0,2965.0,2345.0,652.0,1174.0,2929.0,...,,,,,,,,,,
Week 5 pct-change,32.3,30.9,1.5,19.9,10.2,31.1,20.6,0.5,12.5,12.8,...,,,,,,,,,,
Week 5 2020 QTD,15901.0,10385.0,964.0,10027.0,48867.0,17658.0,12604.0,3136.0,5366.0,16136.0,...,,,,,,,,,,
Week 5 2019 QTD,14513.0,9819.0,1211.0,9489.0,47304.0,16430.0,13599.0,2804.0,5289.0,15319.0,...,,,,,,,,,,


In [3]:
merged.to_csv('./csx-data.csv', sep='|')

#merged.tail()

In [4]:
subset = merged.filter(axis='index', regex='[0-9]$').copy()
#subset.head()

In [5]:
subset['dt'] = subset.index.map(lambda v: pd.to_datetime(v.replace(' AAR', '') + ' 6', format='Week %W %Y %w'))

#subset.tail()

In [6]:
print(subset.dtypes)

Grain                                                    float64
Grain Mill Products                                      float64
Farm Products (excl. Grain)                              float64
Food Products                                            float64
Chemicals                                                float64
Petroleum & Petroleum Products                           float64
Primary Metal Products                                   float64
Primary Forest Products                                  float64
Lumber & Wood Products                                   float64
Pulp & Paper Products                                    float64
Non-Metallic Minerals (incl. Phosphates)                 float64
Crushed Stone, Sand & Gravel                             float64
Stone, Clay & Glass Products                             float64
Iron & Steel Scrap                                       float64
Waste & Nonferrous Scrap                                 float64
Motor Vehicles and Parts 

In [7]:
def doChart(metric, df, color='grey'):
    pmp = df[['dt', metric]].sort_values('dt').copy()
    pmp2 = pmp.set_index('dt')[metric].resample('M').mean().reset_index().dropna()
    pmp2['Change'] = pmp2[metric].pct_change(12).apply(lambda v: 100 * v)

    c = alt.Chart(pmp2).mark_bar(color=color).encode(
        alt.X('dt:T', axis=alt.Axis(title='')),
        alt.Y('Change:Q', axis=alt.Axis(title='52-Week Growth [%]')),
        tooltip=[alt.Tooltip('dt:T', format='%B %Y', title='Period'), alt.Tooltip('Change:Q')]
    ).properties(
        title='CSX Freight Reports: {}'.format(metric),
        height=450,
        width=750,
        background='white'
    )
    
    return c.display()
    
doChart('Primary Metal Products', subset)

In [8]:
metric = 'Lumber & Wood Products'
doChart(metric, subset, color='tan')

In [9]:
metric = 'Total Traffic'
doChart(metric, subset, color='navy')

In [10]:
metric = 'Total Carloads'
doChart(metric, subset, color='navy')

In [11]:
metric = 'Total Intermodal'
doChart(metric, subset, color='navy')

In [12]:
metric = 'Automotive (Motor Vehicles & Equip.)'
metric = 'Motor Vehicles and Parts'
doChart(metric, subset, color='purple')

In [13]:
metric = 'Containers'
doChart(metric, subset, color='navy')

In [14]:
metric = 'Coal'
doChart(metric, subset, color='black')

In [15]:
metric = 'Chemicals'
doChart(metric, subset, color='orange')

In [16]:
metric = 'Food Products'
doChart(metric, subset, color='green')

In [17]:
metric = 'Farm Products (excl. Grain)'
doChart(metric, subset, color='brown')

In [18]:
metric = 'Grain'
doChart(metric, subset, color='tan')

In [19]:
metric = 'Grain Mill Products'
doChart(metric, subset, color='tan')

In [20]:
metric = 'Grain Mill Products'
doChart(metric, subset, color='black')