# Market Leader Analysis (Energy Only Offers/Awards)

In [1]:
# Import necessary packages
import pandas as pd
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import numpy as np
import calendar
import glob
from datetime import datetime
from dateutil.parser import parse
import os
import urllib.request
import seaborn as sns
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
# Handle date time conversions between pandas and matplotlib
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


#Regression & LDA
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math
%matplotlib inline

#see all columns/rows
# None removes pandas' display limit, so any frame shown without .head()/.sample()
# is rendered in full -- keep displayed frames small further down the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


## Import and Clean Data
### Strip/clean column headers and convert datetimes for:

##### Environmental: Market Data, Locational Marginal Pricing
*Third Party Environmental: NOAA Hourly Weather Data*
##### Transactional: Energy Only Offers/Awards
*Third Party Transactional: Daily NASDAQ, DOWJONES, ETF prices*

In [2]:
#ERCOT
def _clean_columns(columns, strip_hyphens=True):
    """Return normalized column labels for a DataFrame.

    Labels are trimmed, lower-cased, spaces become underscores and
    parentheses are removed. Every replacement is a literal (regex=False)
    substitution so that '(' and ')' are never interpreted as regex syntax,
    whatever the installed pandas version's default is.

    Parameters
    ----------
    columns : pandas.Index
        The frame's current column labels (all strings).
    strip_hyphens : bool, default True
        When True, '-' characters are removed as well.

    Returns
    -------
    pandas.Index
        The cleaned labels, in the same order.
    """
    cleaned = (columns.str.strip()
                      .str.lower()
                      .str.replace(' ', '_', regex=False)
                      .str.replace('(', '', regex=False)
                      .str.replace(')', '', regex=False))
    if strip_hyphens:
        cleaned = cleaned.str.replace('-', '', regex=False)
    return cleaned


market_df = pd.read_excel('OneDrive_1_10-22-2019/ercot_market_data.xlsx', sheet_name = 'ercot_market_data')
lmp_df = pd.read_csv('OneDrive_1_10-22-2019/ercotlmp.csv')
nodes_df = pd.read_excel('OneDrive_1_10-22-2019/ercot_nodes.xlsx')
# glob.glob returns paths in arbitrary order; sorting makes the row order of the
# concatenated awards table reproducible across machines and runs.
award_files = sorted(glob.glob('OneDrive_1_10-22-2019/EnergyOnlyOfferAwards/*.csv'))
all_awards_df = pd.concat([pd.read_csv(f) for f in award_files], ignore_index = True)

# The awards headers previously ended with a replace of ' - ' that could never match
# (all spaces had already been turned into underscores), so hyphens were kept.
# strip_hyphens=False preserves those existing column names.
all_awards_df.columns = _clean_columns(all_awards_df.columns, strip_hyphens=False)
market_df.columns = _clean_columns(market_df.columns)
lmp_df.columns = _clean_columns(lmp_df.columns)
nodes_df.columns = _clean_columns(nodes_df.columns)

#drop unnecessary columns
market_df = market_df.drop(columns=['datetime','year'])
lmp_df = lmp_df.drop(columns=['datetime','year'])
nodes_df = nodes_df.drop(columns=['iso','weatherstationid','first_dart_date','last_dart_date','equipment','voltage',
                                    'substation','nodetype','zoneid','objectid'])

#Convert marketday feature from datetime type to string type
# (the later merges join on 'marketday' as an 'mm/dd/YYYY' string)
market_df['marketday'] = market_df['marketday'].dt.strftime('%m/%d/%Y')

#3rd party data
weather_df_1 = pd.read_csv('additional_data/weather_data_1.csv')
weather_df_2 = pd.read_csv('additional_data/weather_data_2.csv')
weather_df_3 = pd.read_csv('additional_data/weather_data_3.csv')
nasdaq_df = pd.read_csv('additional_data/nasdaq_data.csv')
etf_df = pd.read_csv('additional_data/etf_data.csv')
dowjones_df = pd.read_csv('additional_data/dow_jones_data.csv')

# Normalize headers: trim, lower-case, spaces -> underscores, drop parentheses and hyphens.
# regex=False keeps every replacement literal, so '(' / ')' are never parsed as regex
# syntax regardless of the pandas version's default.
for third_party_frame in (weather_df_1, weather_df_2, weather_df_3, nasdaq_df, etf_df, dowjones_df):
    third_party_frame.columns = (third_party_frame.columns.str.strip()
                                                          .str.lower()
                                                          .str.replace(' ', '_', regex=False)
                                                          .str.replace('(', '', regex=False)
                                                          .str.replace(')', '', regex=False)
                                                          .str.replace('-', '', regex=False))

#Select hourly weather data
HOURLY_WEATHER_COLUMNS = ['station','date','hourlydrybulbtemperature','hourlyrelativehumidity',
                          'hourlystationpressure','hourlywinddirection','hourlywindspeed']
weather_df_1_trim = weather_df_1[HOURLY_WEATHER_COLUMNS]
weather_df_2_trim = weather_df_2[HOURLY_WEATHER_COLUMNS]
weather_df_3_trim = weather_df_3[HOURLY_WEATHER_COLUMNS]


#rename columns, drop 'close' column and use adjusted close column 'adj_close'
def _prefix_price_columns(prices, prefix):
    """Prefix the standard price columns with `prefix` and drop the raw close.

    The columns date/open/high/low/close/adj_close/volume are renamed to
    '<prefix>_<name>'; the resulting '<prefix>_close' column is then dropped
    so only the adjusted close remains.
    """
    price_fields = ('date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')
    renamed = prices.rename(columns={field: prefix + '_' + field for field in price_fields})
    return renamed.drop(columns=prefix + '_close')


nasdaq_df = _prefix_price_columns(nasdaq_df, 'nasdaq')
etf_df = _prefix_price_columns(etf_df, 'etf')
dowjones_df = _prefix_price_columns(dowjones_df, 'dowjones')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Displaying Content of ERCOT Energy Only Awards

In [4]:
# Preview the first rows of the concatenated awards table (column headers already cleaned)
all_awards_df.head()

Unnamed: 0,delivery_date,hour_ending,settlement_point,qse_name,energy_only_offer_award_in_mw,settlement_point_price,offer_id
0,07/21/2019,1,AEEC,QAUTO2,12.0,11.0,NPvWjz5dECAn
1,07/21/2019,1,AEEC,QJACIN,0.0,11.0,Macq9896
2,07/21/2019,1,AEEC,QREUEL,24.0,11.0,jb1
3,07/21/2019,1,AMISTAD_ALL,QAUTO2,10.0,20.55,uB67RehzUeWd
4,07/21/2019,1,AMISTAD_ALL,QJACIN,2.2,20.55,Macq9896


## Select Top 10 Leaders

In [5]:
#Select Leader Data from QSE Names of top 10 leaders
LEADER_QSE_NAMES = ['QREUEL','QFPLE4','QLEPPM','QRAID',
                    'QSHELL','QEXELO','QTRAIL','QWAKE','QEDF26','QTTPAL']
# .copy() gives lead_awd its own data, so adding the 'date' column below no longer
# triggers pandas' SettingWithCopyWarning (assignment onto a slice of all_awards_df).
lead_awd = all_awards_df.loc[all_awards_df['qse_name'].isin(LEADER_QSE_NAMES)].copy()
#Select Data within timeline presented in Jeff's Power BI Dashboard (Jan 3, 2019 - July 12, 2019)
lead_awd['date'] = pd.to_datetime(lead_awd['delivery_date'])
# NOTE(review): the lower bound below also admits 2019-01-02, one day earlier than the
# Jan 3 start quoted above -- confirm which is intended. The upper bound is exclusive,
# so July 12 is the last delivery date kept.
mask = (lead_awd['date'] >= '2019-01-02') & (lead_awd['date'] < '2019-07-13')
awards = lead_awd.loc[mask]

#awards (the date-filtered leader rows) is the base of the merging section to create a model-ready dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## Join Tables, Create 'Data' Table for Models

In [8]:
#Join Market Condition & Location Pricing & Node Data
market_keys = ['marketday', 'hourending', 'peaktype', 'month']
market_lmp = lmp_df.merge(market_df, how='left', on=market_keys)
market_lmp_nodes = market_lmp.merge(nodes_df, how='left',
                                    left_on='settlementpoint', right_on='nodename')

# Align the awards' key names with the market/LMP tables before joining
awards = awards.rename(columns={'delivery_date': 'marketday',
                                'hour_ending': 'hourending',
                                'settlement_point': 'settlementpoint'})

data_ercot = (
    awards
    .merge(market_lmp_nodes, how='left', on=['marketday', 'hourending', 'settlementpoint'])
    .drop(columns=['nodename'])
)

# Relabel the Nuevo Laredo station as the Laredo station
is_nuevo_laredo = data_ercot['nearest_weatherstation'] == 'TM - Nuevo Laredo/Intl'
data_ercot.loc[is_nuevo_laredo, 'nearest_weatherstation'] = 'TX - Laredo/Intl'

# Missing values become 0.0 everywhere; a 0.0 zone is then relabelled 'Other'
data_ercot = data_ercot.fillna(0.0)
zone_missing = data_ercot['zone'] == 0.0
data_ercot.loc[zone_missing, 'zone'] = 'Other'

In [9]:
# Attach ETF and Dow Jones prices to each NASDAQ trading date, keeping a single 'date' key
stocks_df = (
    nasdaq_df
    .merge(etf_df, how='left', left_on='nasdaq_date', right_on='etf_date')
    .merge(dowjones_df, how='left', left_on='nasdaq_date', right_on='dowjones_date')
    .drop(columns=['dowjones_date', 'etf_date'])
    .rename(columns={'nasdaq_date': 'date'})
)
# Re-format the date as an 'mm/dd/YYYY' string
parsed_stock_dates = pd.to_datetime(stocks_df['date'], infer_datetime_format=True)
stocks_df['date'] = parsed_stock_dates.dt.strftime('%m/%d/%Y')

In [10]:
# Station id -> station label (labels match 'nearest_weatherstation' used in the final merge)
STATION_NAMES = {72267023042: 'TX - Lubbock/Intl',
                 72251012924: 'TX - Corpus Christi/Intl',
                 72266013962: 'TX - Abilene/Municipal',
                 72250012919: 'TX - Brownsville/Intl',
                 72351013966: 'TX - Wichita Falls/Sheppard AFB',
                 72261022010: 'TX - Del Rio/Intl',
                 72265023023: 'TX - Midland-Odessa',
                 72253012921: 'TX - San Antonio/Intl',
                 72363023047: 'TX - Amarillo/Intl',
                 72248013957: 'LA - Shreveport/Regional',
                 72263023034: 'TX - San Angelo/Mathis',
                 72265623040: 'TX - Wink/Winkler County',
                 72258013960: 'TX - Dallas/Love Field',
                 72243012960: 'TX - Houston/Intercontinental',
                 72261823091: 'TX - Fort Stockton',
                 72252012907: 'TX - Laredo/Intl',
                 74641013975: 'OK - Gage/Shattuck',
                 72259303985: 'TX - Dallas-Fort Worth/Intl'}

weather_df = pd.concat([weather_df_1_trim, weather_df_2_trim,weather_df_3_trim])

# Coerce readings to numbers; anything unparseable becomes NaN.
# NOTE(review): 'hourlyrelativehumidity' is not coerced here, unlike the other
# hourly readings -- confirm whether that is intentional.
for reading_col in ['hourlydrybulbtemperature', 'hourlystationpressure',
                    'hourlywinddirection', 'hourlywindspeed']:
    weather_df[reading_col] = pd.to_numeric(weather_df[reading_col], errors='coerce', downcast=None)

weather_df['date'] = pd.to_datetime(weather_df['date'],infer_datetime_format=True)
# Time-of-day of each observation; used below to pick the latest reading within each hour
weather_df['hourending'] = [d.time() for d in weather_df['date']]
mask = (weather_df['date'] >= '2019-01-02') & (weather_df['date'] < '2019-07-13')
# .copy() after each row filter: columns are assigned right afterwards, and assigning
# onto a filtered slice raises SettingWithCopyWarning.
weather_df = weather_df.loc[mask].copy()
# Round the observation time up to the next whole hour (seconds are ignored),
# e.g. 13:53 -> 14; exactly 00:00 gives 0 and is dropped just below.
hours = [math.ceil((t.hour * 60 + t.minute) / 60) for t in weather_df['hourending']]
weather_df['hour'] = hours
weather_df = weather_df.loc[(weather_df['hour'] > 0)].copy()
weather_df['date'] = weather_df['date'].dt.strftime('%m/%d/%Y')
# Keep only the latest observation per station/date/hour. The string 'max' selects the
# same groupby aggregation as the builtin max without the deprecation warning newer
# pandas emits for builtin callables.
c_maxes = weather_df.groupby(['station', 'date','hour']).hourending.transform('max')
weather_df = weather_df.loc[weather_df.hourending == c_maxes].copy()
# Station ids missing from STATION_NAMES become NaN
weather_df['station'] = weather_df['station'].map(STATION_NAMES)


In [11]:
# Remove fully identical weather rows, keeping the first occurrence (the default)
weather_df = weather_df.drop_duplicates()

In [12]:
# Attach the daily stock prices to every hourly weather row of the same date.
# Weather dates with no stock row (and any other missing values) are filled with 0.0.
stocks_weather_df = (
    weather_df
    .merge(stocks_df, how='left', on='date')
    .fillna(0.00)
    .drop_duplicates(keep='first')
)

In [13]:
# (rows, columns) of the combined weather + stock table
stocks_weather_df.shape

(113588, 24)

In [14]:
# (rows, columns) of the ERCOT awards/market table before weather and stocks are joined
data_ercot.shape

(138748, 24)

In [15]:
# One weather/stock row per (date, hour, station), so the left join below
# cannot add rows to data_ercot
weather_lookup = stocks_weather_df.drop_duplicates(['date', 'hour', 'station'])

ercot_keys = ['marketday', 'hourending', 'nearest_weatherstation']
weather_keys = ['date', 'hour', 'station']
data = data_ercot.merge(weather_lookup, how='left', left_on=ercot_keys, right_on=weather_keys)


In [16]:
# (rows, columns) of the merged modeling table
data.shape

(138748, 48)

### Create Evaluation Criterion: PnL

In [17]:
#Create PnL column for Performance Measurement/Evaluation Criterion
# PnL = (dalmp - rtlmp) * awarded MW
lmp_spread = data['dalmp'] - data['rtlmp']
data['PnL'] = lmp_spread * data['energy_only_offer_award_in_mw']


def _leader_rows(qse_name):
    """Return the rows of `data` whose 'qse_name' equals `qse_name`."""
    return data.loc[data['qse_name'] == qse_name]


#Separate Leaders into dataframes for separate modeling
leaders_QREUEL = _leader_rows('QREUEL')
leaders_QFPLE4 = _leader_rows('QFPLE4')
leaders_QLEPPM = _leader_rows('QLEPPM')
leaders_QRAID = _leader_rows('QRAID')
leaders_QSHELL = _leader_rows('QSHELL')
leaders_QEXELO = _leader_rows('QEXELO')
leaders_QTRAIL = _leader_rows('QTRAIL')
leaders_QWAKE = _leader_rows('QWAKE')
leaders_QEDF26 = _leader_rows('QEDF26')
leaders_QTTPAL = _leader_rows('QTTPAL')

In [None]:
# Reconcile the notebook's PnL per leader against the Power BI dashboard totals.
# Previously ten separate .sum() expressions were evaluated but only the last could be
# displayed, and the cell ended with a bare `PnL` name that is never defined (NameError).
POWER_BI_PNL = {
    'QREUEL': 782010.73,
    'QFPLE4': 544545.72,
    'QLEPPM': 267344.34,
    'QRAID': 193016.14,
    'QSHELL': 179504.23,
    'QEXELO': 155802.45,
    'QTRAIL': 148386.28,
    'QWAKE': 145813.26,
    'QEDF26': 143823.99,
    'QTTPAL': 140899.54,
}

# Sum of data['PnL'] per QSE; a leader with no rows gets 0.0, matching .sum() on an empty frame
notebook_pnl = (
    data.groupby('qse_name')['PnL']
        .sum()
        .reindex(list(POWER_BI_PNL), fill_value=0.0)
)
pnl_summary = pd.DataFrame({
    'notebook_pnl': notebook_pnl,
    'power_bi_pnl': pd.Series(POWER_BI_PNL),
})
pnl_summary['difference'] = pnl_summary['notebook_pnl'] - pnl_summary['power_bi_pnl']
pnl_summary


# Model Test 1: Linear Regression

In [None]:
# NOTE(review): leaders_QREUEL_X and leaders_QREUEL_y are not defined anywhere in the
# visible part of this notebook -- on a fresh kernel this cell raises NameError unless a
# cell building them from leaders_QREUEL runs first. TODO confirm where they are built.
# Replace missing feature values with 0 before fitting
leaders_QREUEL_X = leaders_QREUEL_X.fillna(0)
# Fit ordinary least squares on the QREUEL features/target
reg = LinearRegression().fit(leaders_QREUEL_X,leaders_QREUEL_y)
# Not the last expression of the cell, so the coefficients are not displayed
reg.coef_
# R^2 computed on the same data the model was fitted on (in-sample score)
reg.score(leaders_QREUEL_X, leaders_QREUEL_y)