# Introduction
## Using Google Search Volume and News Sentiment to Predict Natural Gas Prices with LSTMs
by Quinn Murphey, Adrian Ramos, and Gabriel Soliz

In [16]:
import pandas as pd
import math
from datetime import date, datetime
from dateutil.relativedelta import relativedelta

# Data Fetch Stage

## US EIA NYMEX Dataset

In [43]:
def _load_nymex_csv(path,
                    start=datetime(year=2013, month=1, day=1),
                    end=datetime(year=2019, month=6, day=30)):
    """Load one US EIA NYMEX CSV and trim it to the study window.

    Args:
        path: Location of a CSV that has a ``Date`` column written in
            ``"%b %d, %Y"`` form.
        start: First date to keep (inclusive). Defaults to 1 Jan 2013.
        end: Last date to keep (inclusive). Defaults to 30 Jun 2019.

    Returns:
        A DataFrame with ``Date`` parsed to datetimes, restricted to
        ``start <= Date <= end`` and re-indexed from 0.
    """
    frame = pd.read_csv(path)

    # Turn the Date column into datetimes rather than strings. A single
    # vectorized call replaces the per-row strptime lambda (whose parameter
    # shadowed the builtin ``str``).
    frame["Date"] = pd.to_datetime(frame["Date"], format="%b %d, %Y")

    # Trim to the requested window, then reset the index so it is contiguous.
    in_window = (frame["Date"] >= start) & (frame["Date"] <= end)
    return frame[in_window].reset_index(drop=True)


# Read csvs to dataframes (Jan 2013 - Jun 2019); both files share one pipeline.
nymex_spot_df = _load_nymex_csv("data/US_EIA_NYMEX_spot.csv")
nymex_futures_df = _load_nymex_csv("data/US_EIA_NYMEX_futures.csv")


In [44]:
# Preview the trimmed, re-indexed spot-price frame built in the previous cell.
nymex_spot_df

Unnamed: 0,Date,Henry Hub Natural Gas Spot Price (Dollars per Million Btu)
0,2013-01-02,3.30
1,2013-01-03,3.19
2,2013-01-04,3.20
3,2013-01-07,3.30
4,2013-01-08,3.21
...,...,...
1648,2019-06-24,2.31
1649,2019-06-25,2.31
1650,2019-06-26,2.34
1651,2019-06-27,2.32


In [45]:
# Preview the trimmed, re-indexed futures frame built alongside the spot frame.
nymex_futures_df

Unnamed: 0,Date,Natural Gas Futures Contract 1 (Dollars per Million Btu),Natural Gas Futures Contract 2 (Dollars per Million Btu),Natural Gas Futures Contract 3 (Dollars per Million Btu),Natural Gas Futures Contract 4 (Dollars per Million Btu)
0,2013-01-02,3.233,3.255,3.294,3.349
1,2013-01-03,3.198,3.214,3.250,3.306
2,2013-01-04,3.287,3.303,3.337,3.393
3,2013-01-07,3.266,3.281,3.318,3.374
4,2013-01-08,3.218,3.234,3.275,3.330
...,...,...,...,...,...
1633,2019-06-24,2.303,2.284,2.256,2.291
1634,2019-06-25,2.308,2.286,2.260,2.294
1635,2019-06-26,2.291,2.268,2.244,2.275
1636,2019-06-27,2.324,2.298,2.325,2.413


## Google Trends Scraper

In [6]:
import pytrends
from pytrends.request import TrendReq
import time
# Module-level pytrends client; get_daily_trends_as_df reads this global directly.
pytrend = TrendReq()

def _chunk_boundaries(timeframe, months=6):
    """Split a ``"YYYY-MM-DD YYYY-MM-DD"`` timeframe into chunk boundaries.

    Returns a list ``b`` of dates where chunk ``k`` runs from ``b[k]`` through
    ``b[k + 1] - 1 day`` inclusive; the final entry is one day past the end date.
    Raises ValueError if the start date is after the end date.
    """
    timeframe_start, timeframe_end = timeframe.split(" ")
    start_date = date.fromisoformat(timeframe_start)
    end_date = date.fromisoformat(timeframe_end)
    if start_date > end_date:
        raise ValueError(f"timeframe start {start_date} is after end {end_date}")

    boundaries = []
    cursor = start_date
    while cursor < end_date:
        boundaries.append(cursor)
        cursor = cursor + relativedelta(months=months)
    if not boundaries:
        # start == end: still fetch that single day instead of zero chunks.
        boundaries.append(start_date)
    boundaries.append(end_date + relativedelta(days=1))
    return boundaries


def _print_progress(done, total, width=20):
    """Redraw a one-line ``[====>   ] xx.xx% Complete!`` progress bar in place."""
    filled = math.floor(width * done / total)
    # "=" * (filled - 1) is empty while filled is 0 or 1; ">" appears once filled >= 1.
    bar = "=" * (filled - 1) + ">" * min(filled, 1) + " " * (width - filled)
    percent = math.floor(10000 * done / total) / 100
    print(f"[{bar}] {percent}% Complete!", end='\r')


def get_daily_trends_as_df(keywords, categories, timeframe, countries, search_type,
                           chunk_months=6, pause_seconds=15):
    """Fetch Google Trends interest for several keywords into one DataFrame.

    Each keyword is resolved to the ``mid`` of its first pytrends suggestion and
    then queried chunk by chunk over ``timeframe`` using the module-level
    ``pytrend`` client.

    Args:
        keywords: Search terms; also used as the output column names.
        categories: Google Trends category ids, one per keyword (same order).
        timeframe: ``"YYYY-MM-DD YYYY-MM-DD"`` start and end dates, inclusive.
        countries: Geo codes passed as ``geo`` to ``build_payload``.
        search_type: Value passed as ``gprop`` to ``build_payload``.
        chunk_months: Length of each request window in months (default 6).
        pause_seconds: Seconds to sleep per request to avoid being throttled.

    Returns:
        DataFrame whose first column is ``date`` followed by one column per
        keyword, in the order given.

    Raises:
        ValueError: If ``keywords`` and ``categories`` differ in length, if a
            keyword has no suggestion, or if the timeframe is reversed.
    """
    # NOTE(review): each chunk is a separate request; if Google Trends scales
    # every request independently, values may not be comparable across chunk
    # boundaries -- confirm before modelling on the raw numbers.
    keywords = list(keywords)
    categories = list(categories)
    if len(keywords) != len(categories):
        # zip() would otherwise silently drop the unmatched tail.
        raise ValueError(
            f"got {len(keywords)} keywords but {len(categories)} categories"
        )

    # Resolve each keyword to the topic id ("mid") of its first suggestion.
    exact_keywords = []
    for keyword in keywords:
        suggestions = pytrend.suggestions(keyword=keyword)
        if not suggestions:
            raise ValueError(f"no Google Trends suggestion found for {keyword!r}")
        exact_keywords.append(suggestions[0]['mid'])

    # Split timeframe into chunks of `chunk_months` months.
    boundaries = _chunk_boundaries(timeframe, months=chunk_months)

    # Compute number of fetches required (used only for the progress bar).
    max_fetches = len(countries) * len(keywords) * (len(boundaries) - 1)
    curr_fetches = 0

    trend_dict = {}
    for country in countries:
        for topic_id, category in zip(exact_keywords, categories):
            # NOTE: results are keyed by topic id only, so with more than one
            # country each later country replaces the earlier one's data.
            chunk_frames = []
            for chunk_start, next_start in zip(boundaries, boundaries[1:]):
                chunk_end = next_start - relativedelta(days=1)
                pytrend.build_payload(kw_list=[topic_id],
                                      timeframe=chunk_start.isoformat() + " " + chunk_end.isoformat(),
                                      geo=country,
                                      cat=category,
                                      gprop=search_type)
                curr_fetches += 1
                _print_progress(curr_fetches, max_fetches)
                time.sleep(pause_seconds)  # sleep to prevent google shutting us down
                chunk_frames.append(pytrend.interest_over_time())
            # Concatenate once per keyword instead of growing a frame in the loop.
            trend_dict[topic_id] = pd.concat(chunk_frames, axis=0)
    df_trends = pd.concat(trend_dict, axis=1)

    df_trends.columns = df_trends.columns.droplevel(0)  # drop outside header
    df_trends = df_trends.drop('isPartial', axis=1)  # drop "isPartial"
    df_trends = df_trends.reset_index(level=0)  # move the date index into a column
    df_trends.columns = ['date'] + keywords

    return df_trends

In [7]:
# Google Trends query configuration. KEYWORDS_CATEGORIES is aligned
# position-by-position with KEYWORDS.
# NOTE(review): 904 and 0 are Google Trends category ids; their exact meaning
# is not stated here -- presumably 0 is "all categories", confirm.
KEYWORDS            = ["Natural Gas","Oil","Coal","Nuclear Power","Wind Power","Hydroelectric","Solar Power","Gold","Silver","Platinum","Copper","Biofuel","Recession","CPI"]
KEYWORDS_CATEGORIES = [904,          904,  904,   0,               0,          0,               0,           904,   904,     904,       904,     0,        0,          0]
TIMEFRAME='2013-01-01 2019-06-30' # Jan 2013 - June 2019
COUNTRIES=["US"] # ISO country code
SEARCH_TYPE='' #default is 'web searches',others include 'images','news','youtube','froogle' (google shopping)
GOOGLE_TRENDS_CSV = "data/google_trends_dataset.csv" # on-disk cache of the scrape

try:
    # Reuse the cached scrape when it exists. Parse `date` so this branch
    # yields the same dtype as a fresh fetch.
    google_trends_df = pd.read_csv(GOOGLE_TRENDS_CSV, parse_dates=["date"])
    # The first column is the index that to_csv wrote out; discard it.
    google_trends_df = google_trends_df.drop(google_trends_df.columns[0], axis=1)
except FileNotFoundError:
    # Only a missing cache triggers the slow re-scrape; any other failure
    # (corrupt file, interrupt) surfaces instead of being swallowed.
    google_trends_df = get_daily_trends_as_df(KEYWORDS, KEYWORDS_CATEGORIES, TIMEFRAME, COUNTRIES, SEARCH_TYPE)
    google_trends_df.to_csv(GOOGLE_TRENDS_CSV)

google_trends_df

Unnamed: 0,date,Natural Gas,Oil,Coal,Nuclear Power,Wind Power,Hydroelectric,Solar Power,Gold,Silver,Platinum,Copper,Biofuel,Recession,CPI
0,2013-01-01,8,37,0,18,43,15,40,18,35,33,27,34,53,16
1,2013-01-02,62,95,0,20,60,28,52,23,50,31,72,47,49,55
2,2013-01-03,54,81,0,23,52,18,56,23,45,15,42,55,41,45
3,2013-01-04,68,78,27,19,54,23,51,23,43,100,58,37,51,45
4,2013-01-05,7,68,0,15,33,11,46,23,34,36,54,15,25,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2367,2019-06-26,25,44,0,41,20,6,67,87,75,43,79,31,35,59
2368,2019-06-27,28,56,0,35,30,23,66,77,74,68,68,22,44,60
2369,2019-06-28,30,55,0,38,20,10,66,79,67,79,73,31,38,50
2370,2019-06-29,8,28,59,37,18,15,69,59,58,44,67,14,27,25


## Financial Times Scraper

# Data Preprocessing Stage

## NYMEX

### Fill spot price voids 
For each void, fill it with the average of the preceding and following data points.

### Join dataframes (by date)

### Normalize columns
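First take the log of every value: $x' = \log{x}$, then standardize the logged values: $x'' = (x' - \mu)/\sigma$, where $\mu$ and $\sigma$ are the mean and standard deviation of $x'$.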

### Fill weekend voids linearly
Saturday = Friday + 1/3 (Monday - Friday)\
Sunday   = Friday + 2/3 (Monday - Friday)

## Google Trends

### Normalize Columns 
$x' = (x- \mu)/\sigma$

# Model Training Stage

# Model Testing Stage