# Download FOMC Data (Template)
Scrape data from the Federal Reserve website and archives. This notebook also serves as a basic Colab environment template.

## Environment

In [1]:
# -*- coding: utf-8 -*-

# ENVIRONMENT CHECK:
# Reports whether the kernel runs on Google Colab and where the Python
# installation and its package directories live.
import sys, os, inspect, site, pprint, platform

# Check whether in Colab:
# Colab is detected by the `google.colab` module already being loaded.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  print('YES, this is a Google Colaboratory environment.')
else:
  print('NO, this is not a Google Colaboratory environment.')
print(' ')

# Python installation files:
stdlib = os.path.dirname(inspect.getfile(os))
# Ask the running interpreter for its version instead of shelling out to
# `python --version`, which reports whichever `python` is first on PATH and
# may not be the interpreter backing this kernel.
# Kept as a one-element list so `python_version[0]` keeps working.
python_version = ['Python {}'.format(platform.python_version())]
print('Python Standard Library is located in:\n' + stdlib)
print(' ')
print('This environment is using {}'.format(str(python_version[0])))
print(' ')
print('Local system packages are located in:')
pprint.pprint(site.getsitepackages())
print(' ')
print('Local user packages are located in:\n' + site.getusersitepackages())

# Installed packages:
#!pip list -v
#!pip list --user -v


YES, this is a Google Colaboratory environment.
 
Python Standard Library is located in:
/usr/lib/python3.6
 
This environment is using Python 3.6.9
 
Local system packages are located in:
['/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/lib/python3.6/dist-packages']
 
Local user packages are located in:
/root/.local/lib/python3.6/site-packages


In [2]:
# Mount Google Drive:
# Only attempted on Colab; the import is kept inside the branch because
# `google.colab` is not expected to be importable elsewhere.
# force_remount=True asks Colab to mount again even if /content/drive is
# already mounted.
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
# Navigate to project folder:
%cd "/content/drive/MyDrive/Colab Notebooks/proj2/src"
!ls -al

/content/drive/MyDrive/Colab Notebooks/proj2/src
total 11881
-rw------- 1 root root   30563 Jan 25 08:42 0_FOMC_Analysis_Download_Data.ipynb
-rw------- 1 root root 1297662 Jan 25 08:42 1_FOMC_Analysis_Preliminary.ipynb
-rw------- 1 root root 1881863 Jan 25 06:22 2_FOMC_Analysis_Preprocess_NonText.ipynb
-rw------- 1 root root   98359 Jan 25 07:09 3_FOMC_Analysis_Preprocess_Text.ipynb
-rw------- 1 root root 2659457 Jan 25 00:59 4_FOMC_Analysis_EDA_FE_NonText.ipynb
-rw------- 1 root root 1282658 Jan 25 00:59 5_FOMC_Analysis_Baseline.ipynb
-rw------- 1 root root 4138442 Jan 25 06:30 6_FOMC_Analysis_Model_Train.ipynb
-rw------- 1 root root  358128 Jan 24 19:04 7_FOMC_Analysis_Sentence.ipynb
-rw------- 1 root root  355706 Jan 25 08:41 7_FOMC_Corpora.ipynb
drwx------ 2 root root    4096 Jan 25 07:52 C:
drwx------ 2 root root    4096 Nov  6 04:45 data
drwx------ 2 root root    4096 Jan 25 06:54 final
-rw------- 1 root root    7138 Jan 25 08:27 FomcGetCalendar.py
drwx------ 2 root root    4096 

In [4]:
# Define Path Variables:
# Every data folder lives under one project data root; only that root differs
# between Colab (mounted Google Drive) and the local Windows checkout.
if IN_COLAB:
  data_root = '/content/drive/My Drive/Colab Notebooks/proj2/src/data/'
else:
  data_root = 'C:/Users/theon/GDrive/Colab Notebooks/proj2/src/data/'

# Each directory keeps its trailing '/' because file paths are later built by
# plain string concatenation (directory + file name).
market_data_root = data_root + 'MarketData/'

# Market data, one folder per source:
employment_data_dir = market_data_root + 'Employment/'
cpi_data_dir = market_data_root + 'CPI/'
fed_rates_dir = market_data_root + 'FEDRates/'
fx_rates_dir = market_data_root + 'FXRates/'
gdp_data_dir = market_data_root + 'GDP/'
ism_data_dir = market_data_root + 'ISM/'
sales_data_dir = market_data_root + 'Sales/'
treasury_data_dir = market_data_root + 'Treasury/'

# Text data, intermediate artefacts and outputs:
fomc_dir = data_root + 'FOMC/'
preprocessed_dir = data_root + 'preprocessed/'
train_dir = data_root + 'train_data/'
output_dir = data_root + 'result/'
keyword_lm_dir = data_root + 'LoughranMcDonald/'
glove_dir = data_root + 'GloVe/'
model_dir = data_root + 'models/'
# Also defined here so this cell matches the later copy of the path settings.
graph_dir = data_root + 'graphs/'

## Packages

In [None]:
if IN_COLAB:
  # Uninstall existing versions:
  !pip uninstall bs4 -y
  !pip uninstall textract -y
  !pip uninstall numpy -y
  !pip uninstall pandas -y
  !pip uninstall requests -y
  !pip uninstall tqdm -y
  !pip uninstall nltk -y
  !pip uninstall quandl -y
  !pip uninstall scikit-plot -y
  !pip uninstall seaborn -y
  !pip uninstall sklearn -y
  !pip uninstall torch -y
  !pip uninstall transformers -y
  !pip uninstall wordcloud -y
  !pip uninstall xgboost -y
  
  # Install packages:
  !pip install bs4==0.0.1
  !pip install textract==1.6.3
  !pip install numpy==1.19.4
  !pip install pandas==1.1.4
  !pip install requests==2.24.0
  !pip install tqdm==4.51.0
  !pip install nltk==3.5
  !pip install quandl==3.5.3
  !pip install scikit-plot==0.3.7
  !pip install seaborn==0.11.0
  !pip install sklearn==0.0
  !pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
  !pip install transformers==3.5.0
  !pip install wordcloud==1.8.0
  !pip install xgboost==1.2.1
#  os.kill(os.getpid(), 9)


Uninstalling bs4-0.0.1:
  Successfully uninstalled bs4-0.0.1
Uninstalling numpy-1.19.5:
  Successfully uninstalled numpy-1.19.5
Uninstalling pandas-1.1.5:
  Successfully uninstalled pandas-1.1.5
Uninstalling requests-2.23.0:
  Successfully uninstalled requests-2.23.0
Uninstalling tqdm-4.41.1:
  Successfully uninstalled tqdm-4.41.1
Uninstalling nltk-3.2.5:
  Successfully uninstalled nltk-3.2.5
Uninstalling seaborn-0.11.1:
  Successfully uninstalled seaborn-0.11.1
Uninstalling sklearn-0.0:
  Successfully uninstalled sklearn-0.0
Uninstalling torch-1.7.0+cu101:
  Successfully uninstalled torch-1.7.0+cu101
Uninstalling wordcloud-1.5.0:
  Successfully uninstalled wordcloud-1.5.0
Uninstalling xgboost-0.90:
  Successfully uninstalled xgboost-0.90
Collecting bs4==0.0.1
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l

Collecting numpy==1.19.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/86/753182c9085ba4936c0076269a571613387cdb77ae2bf537448bfd63472c/numpy-1.19.4-cp36-cp36m-manylinux2010_x86_64.whl (14.5MB)
[K     |████████████████████████████████| 14.5MB 165kB/s 
[31mERROR: xarray 0.15.1 requires pandas>=0.25, which is not installed.[0m
[31mERROR: torchvision 0.8.1+cu101 requires torch==1.7.0, which is not installed.[0m
[31mERROR: torchtext 0.3.1 requires requests, which is not installed.[0m
[31mERROR: torchtext 0.3.1 requires torch, which is not installed.[0m
[31mERROR: torchtext 0.3.1 requires tqdm, which is not installed.[0m
[31mERROR: thinc 7.4.0 requires tqdm<5.0.0,>=4.10.0, which is not installed.[0m
[31mERROR: tensorflow-datasets 4.0.1 requires requests>=2.19.0, which is not installed.[0m
[31mERROR: tensorflow-datasets 4.0.1 requires tqdm, which is not installed.[0m
[31mERROR: tensorboard 2.4.0 requires requests<3,>=2.21.0, which is not installed.[0m
[31m

Collecting pandas==1.1.4
[?25l  Downloading https://files.pythonhosted.org/packages/4d/51/bafcff417cd857bc6684336320863b5e5af280530213ef8f534b6042cfe6/pandas-1.1.4-cp36-cp36m-manylinux1_x86_64.whl (9.5MB)
[K     |████████████████████████████████| 9.5MB 5.1MB/s 
[31mERROR: pymc3 3.7 requires tqdm>=4.8.4, which is not installed.[0m
[31mERROR: pandas-datareader 0.9.0 requires requests>=2.19.0, which is not installed.[0m
[31mERROR: google-colab 1.0.0 requires requests~=2.23.0, which is not installed.[0m
[31mERROR: fix-yahoo-finance 0.0.22 requires requests, which is not installed.[0m
[31mERROR: fbprophet 0.7.1 requires tqdm>=4.36.1, which is not installed.[0m
[31mERROR: fastai 1.0.61 requires requests, which is not installed.[0m
[31mERROR: fastai 1.0.61 requires torch>=1.0.0, which is not installed.[0m
[31mERROR: google-colab 1.0.0 has requirement six~=1.15.0, but you'll have six 1.12.0 which is incompatible.[0m
Installing collected packages: pandas
Successfully installed

Collecting requests==2.24.0
[?25l  Downloading https://files.pythonhosted.org/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61kB)
[K     |█████▎                          | 10kB 8.8MB/s eta 0:00:01[K     |██████████▋                     | 20kB 12.3MB/s eta 0:00:01[K     |████████████████                | 30kB 6.0MB/s eta 0:00:01[K     |█████████████████████▏          | 40kB 6.3MB/s eta 0:00:01[K     |██████████████████████████▌     | 51kB 3.9MB/s eta 0:00:01[K     |███████████████████████████████▉| 61kB 4.5MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.7MB/s 
[31mERROR: torchtext 0.3.1 requires torch, which is not installed.[0m
[31mERROR: torchtext 0.3.1 requires tqdm, which is not installed.[0m
[31mERROR: tensorflow-datasets 4.0.1 requires tqdm, which is not installed.[0m
[31mERROR: spacy 2.2.4 requires tqdm<5.0.0,>=4.38.0, which is not installed.[0m
[31mERROR: kaggle 1.5.10 re

### Inspect Packages

In [None]:
# List installed packages verbosely (-v): first everything visible to this
# environment, then only those installed in the user site (--user).
!pip list -v
!pip list --user -v


### Import Packages

In [None]:
# Python libraries
import pprint
import datetime as dt
import re
import io
import os
import pickle
from tqdm.notebook import tqdm
import time
import logging
import random
from collections import defaultdict, Counter
import xgboost as xgb
import codecs
pprint.pprint(sys.path)

# Data Science modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="darkgrid")
#plt.style.use('ggplot')

# Import Scikit-learn models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, plot_confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, StratifiedKFold, learning_curve, RandomizedSearchCV
import scikitplot as skplt

# Import nltk modules and download dataset
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize

# Import Pytorch modules
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.autograd import Variable
from torch.optim import Adam, AdamW

# Import Transformers
#from transformers import *
from transformers import BertTokenizer, BertForSequenceClassification, BertModel


### Select Hardware Accelerator

In [None]:
## Use TPU Runtime:
#if IN_COLAB:
#  assert os.environ['COLAB_TPU_ADDR'], 'Make sure to select TPU from Edit > Notebook setting > Hardware accelerator'  
#  VERSION = "20200220"
#  !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#  !python pytorch-xla-env-setup.py --version $VERSION

In [None]:
## Use GPU Runtime:
#if IN_COLAB:
#  if torch.cuda.is_available():
#    torch.cuda.get_device_name(0)
#    gpu_info = !nvidia-smi
#    gpu_info = '\n'.join(gpu_info)
#    print(gpu_info)
#  else:
#    print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, and then re-execute this cell.')
#    #os.kill(os.getpid(), 9)  


### Finalize Setup

In [None]:
# Fetch the NLTK resources this notebook relies on (stop words, the punkt
# tokenizer models and WordNet).
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
  nltk.download(nltk_resource)

# English stop words, held as a set for fast membership checks.
stop = set(stopwords.words('english'))


In [None]:
# Set logger
logger = logging.getLogger('mylogger')
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise attach a second pair of handlers and
# every message would be written twice. Detach and close any handlers left
# over from a previous run before adding fresh ones.
for stale_handler in list(logger.handlers):
  logger.removeHandler(stale_handler)
  stale_handler.close()

timestamp = time.strftime("%Y.%m.%d_%H.%M.%S", time.localtime())
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] ## %(message)s')

# File handler: appends to log_model.txt in the current working directory.
# Its DEBUG threshold only takes effect if the logger level above is lowered,
# because the logger itself already filters out records below INFO.
fh = logging.FileHandler('log_model.txt')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Stream handler: echoes INFO and above into the notebook output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

In [None]:
# Set Random Seed
# A single seed value is shared by Python's `random`, NumPy and PyTorch
# (default generator and CUDA).
rand_seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
  seed_fn(rand_seed)

In [None]:
# Set Seaborn Style
# Replaces the "darkgrid" style that was applied right after importing seaborn.
sns.set(style='white', context='notebook', palette='deep')

## Load preprocessed data

In [None]:
# Path settings: every data folder hangs off a single project data root, and
# only that root differs between Colab (mounted Google Drive) and the local
# Windows checkout.
if IN_COLAB:
  data_root = '/content/drive/My Drive/Colab Notebooks/proj2/src/data/'
else:
  data_root = 'C:/Users/theon/GDrive/Colab Notebooks/proj2/src/data/'

# Directories keep their trailing '/' because file paths are built by plain
# string concatenation (directory + file name).
market_data_root = data_root + 'MarketData/'

# Market data, one folder per source:
employment_data_dir = market_data_root + 'Employment/'
cpi_data_dir = market_data_root + 'CPI/'
fed_rates_dir = market_data_root + 'FEDRates/'
fx_rates_dir = market_data_root + 'FXRates/'
gdp_data_dir = market_data_root + 'GDP/'
ism_data_dir = market_data_root + 'ISM/'
sales_data_dir = market_data_root + 'Sales/'
treasury_data_dir = market_data_root + 'Treasury/'

# Text data, intermediate artefacts and outputs:
fomc_dir = data_root + 'FOMC/'
preprocessed_dir = data_root + 'preprocessed/'
train_dir = data_root + 'train_data/'
output_dir = data_root + 'result/'
keyword_lm_dir = data_root + 'LoughranMcDonald/'
glove_dir = data_root + 'GloVe/'
model_dir = data_root + 'models/'
graph_dir = data_root + 'graphs/'

## Save Data

In [None]:
def save_data(df, file_name, dir_name=None, index_csv=True):
  """Save `df` as both `<file_name>.pickle` and `<file_name>.csv`.

  One definition serves Colab and local runs alike, so the signature no longer
  depends on the environment.

  Args:
    df: Object to persist. It is pickled as-is and must provide `to_csv`
      (the notebook passes pandas DataFrames).
    file_name: Base file name, without extension.
    dir_name: Target directory. When omitted, the notebook-level `train_dir`
      is used, looked up at call time. Missing directories (including
      parents) are created.
    index_csv: Whether the index is written to the CSV file. Previously this
      flag was accepted but ignored; the default keeps the old output.
  """
  if dir_name is None:
    dir_name = train_dir
  if dir_name:
    # makedirs also handles nested targets, which a plain mkdir cannot.
    os.makedirs(dir_name, exist_ok=True)

  # os.path.join leaves a directory that already ends in '/' untouched, so the
  # resulting paths match the former `dir_name + file_name` concatenation.
  pickle_path = os.path.join(dir_name, file_name + '.pickle')
  csv_path = os.path.join(dir_name, file_name + '.csv')

  # Save results to a pickle file; the context manager closes the handle even
  # if pickling fails.
  with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(df, pickle_file)
  print('Successfully saved {}.pickle. in {}'.format(file_name, pickle_path))

  # Save results to a csv file
  df.to_csv(csv_path, index=index_csv)
  print('Successfully saved {}.csv. in {}'.format(file_name, csv_path))

# FomcGetData Testing:

## Packages

### Inspect Packages

In [None]:
!cat fomc_get_data/FomcBase.py

In [None]:
!cat fomc_get_data/FomcMeetingScript.py

In [None]:
!cat fomc_get_data/FomcMinutes.py

In [None]:
!cat fomc_get_data/FomcSpeech.py

In [None]:
!cat fomc_get_data/FomcStatement.py

In [None]:
!cat fomc_get_data/FomcTestimony.py

In [None]:
!cat pdf2text.py

### Import Packages

In [None]:
from fomc_get_data.FomcStatement import FomcStatement
from fomc_get_data.FomcMinutes import FomcMinutes
from fomc_get_data.FomcMeetingScript import FomcMeetingScript
from fomc_get_data.FomcPresConfScript import FomcPresConfScript
from fomc_get_data.FomcSpeech import FomcSpeech
from fomc_get_data.FomcTestimony import FomcTestimony


## Download text data

### Inspect Script

In [None]:
# Inspect script:
!cat FomcGetData.py

### Execution arguments

In [None]:
# Execute Script:
# Runs FomcGetData.py from the current working directory in a subprocess.
# NOTE(review): the arguments presumably mean "all document types, starting
# from 1980" — confirm against FomcGetData.py.
!python FomcGetData.py all 1980
#!python FomcGetData.py all 1980 > FomcGetData_debug.txt # Save output for debugging

# FomcGetCalendar Testing

## Packages

### Inspect Script

In [None]:
# Inspect script:
!cat FomcGetCalendar.py

## Download calendar data

### Execution arguments

In [None]:
# Execute Script:
# Runs FomcGetCalendar.py from the current working directory in a subprocess.
# NOTE(review): the argument is presumably the first year to fetch — confirm
# against FomcGetCalendar.py.
!python FomcGetCalendar.py 1980
#!python FomcGetCalendar.py 1980 > FomcGetCalendar_debug.txt # Save output for debugging