# Market Leader Analysis (PTP Obligation Bid Offers/Awards)

In [1]:
# Import necessary packages
import pandas as pd
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import numpy as np
import calendar
import glob
from datetime import datetime
from dateutil.parser import parse
import os
import urllib.request
import seaborn as sns
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
# Handle date time conversions between pandas and matplotlib
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.linear_model import LinearRegression
#Logistic Regression & LDA
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math
%matplotlib inline

# Lift pandas' display limits so every column and every row is shown.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)


## Import and Clean Data
### Strip/clean column headers and apply datetime conversion on:

##### Environmental: Market Data, Locational Marginal Pricing
*Third Party Environmental: NOAA Hourly Weather Data*
##### Transactional: Energy Only Offers/Awards
*Third Party Transactional: Daily NASDAQ, Dow Jones, ETF prices*

In [2]:
def _clean_columns(frame, drop_hyphens=True):
    """Normalise the column labels of ``frame`` in place and return it.

    Labels are stripped, lower-cased, spaces become underscores and
    parentheses are removed. Hyphens are removed as well unless
    ``drop_hyphens`` is False.
    """
    labels = frame.columns.str.strip().str.lower()
    # regex=False: '(' and ')' are literal characters here, not patterns.
    for old, new in ((' ', '_'), ('(', ''), (')', '')):
        labels = labels.str.replace(old, new, regex=False)
    if drop_hyphens:
        labels = labels.str.replace('-', '', regex=False)
    frame.columns = labels
    return frame


def _read_csv_folder(pattern):
    """Read every CSV matching ``pattern`` into one frame with a fresh index."""
    # sorted() makes the row order independent of the file-system listing order.
    return pd.concat([pd.read_csv(path) for path in sorted(glob.glob(pattern))],
                     ignore_index=True)


# Price columns (after header clean-up) shared by the three index/ETF files.
_PRICE_COLUMNS = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']


def _prefix_price_columns(frame, prefix):
    """Return ``frame`` with each price column renamed to ``<prefix>_<column>``."""
    return frame.rename(columns={col: prefix + '_' + col for col in _PRICE_COLUMNS})


# ERCOT market conditions, locational marginal prices and node metadata
market_df = pd.read_excel('OneDrive_1_10-22-2019/ercot_market_data.xlsx', sheet_name='ercot_market_data')
lmp_df = pd.read_csv('OneDrive_1_10-22-2019/ercotlmp.csv')
nodes_df = pd.read_excel('OneDrive_1_10-22-2019/ercot_nodes.xlsx')
# PTP Obligation Bids
ptp_bids_df = _read_csv_folder('OneDrive_1_10-22-2019/PTPObligationBids/*.csv')
# Awarded PTP Obligation Bids
ptp_awards_df = _read_csv_folder('OneDrive_1_10-22-2019/PTPObligationBidAwards/*.csv')

# The PTP frames keep their hyphens (e.g. 'ptp_bid_award_-_mw'); the rename
# below relies on those labels. The former .replace(' - ', ' ') step is gone
# because it could never match once spaces had become underscores.
for _frame in (ptp_awards_df, ptp_bids_df):
    _clean_columns(_frame, drop_hyphens=False)
for _frame in (market_df, lmp_df, nodes_df):
    _clean_columns(_frame)

# Convert marketday feature from datetime type to string type
market_df['marketday'] = market_df['marketday'].dt.strftime('%m/%d/%Y')
# NOTE: the '_mv' suffix (rather than '_mw') is kept because later cells
# reference 'ptp_bid_award_mv'.
ptp_awards_df = ptp_awards_df.rename(columns={'ptp_bid_award_-_mw': 'ptp_bid_award_mv',
                                              'ptp_bid_-_price': 'ptp_bid_price'})

# 3rd party data
weather_df_1 = pd.read_csv('additional_data/weather_data_1.csv')
weather_df_2 = pd.read_csv('additional_data/weather_data_2.csv')
weather_df_3 = pd.read_csv('additional_data/weather_data_3.csv')
nasdaq_df = pd.read_csv('additional_data/nasdaq_data.csv')
etf_df = pd.read_csv('additional_data/etf_data.csv')
dowjones_df = pd.read_csv('additional_data/dow_jones_data.csv')

for _frame in (weather_df_1, weather_df_2, weather_df_3, nasdaq_df, etf_df, dowjones_df):
    _clean_columns(_frame)

# Trim the weather files to the station/time keys and the hourly measurements used later.
_WEATHER_COLUMNS = ['station', 'date', 'hourlydewpointtemperature', 'hourlydrybulbtemperature',
                    'hourlyprecipitation', 'hourlypressuretendency', 'hourlyrelativehumidity',
                    'hourlystationpressure', 'hourlywetbulbtemperature', 'hourlywinddirection',
                    'hourlywindgustspeed', 'hourlywindspeed']
weather_df_1_trim = weather_df_1[_WEATHER_COLUMNS]
weather_df_2_trim = weather_df_2[_WEATHER_COLUMNS]
weather_df_3_trim = weather_df_3[_WEATHER_COLUMNS]

# Prefix every price column with its source. The previous rename keys
# 'adj Close' / 'adj close' never matched the cleaned header 'adj_close',
# so the adjusted-close columns were silently left unprefixed.
nasdaq_df = _prefix_price_columns(nasdaq_df, 'nasdaq')
etf_df = _prefix_price_columns(etf_df, 'etf')
dowjones_df = _prefix_price_columns(dowjones_df, 'dowjones')


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


#### Select Top 10 Leader Data

In [3]:
# QSE identifiers of the ten market leaders analysed in this notebook.
LEADER_QSES = ['QLUMN', 'QNRGTX', 'QDCENG', 'QREUEL', 'QSHELL',
               'QDIRE', 'QPREC', 'QMONT', 'QWOLFP', 'QTIOS']

lead_ptp_bids = ptp_bids_df.loc[ptp_bids_df['qse_name'].isin(LEADER_QSES)]
# .copy() gives an independent frame, so adding the 'date' column below no
# longer triggers pandas' SettingWithCopyWarning.
lead_ptp_awards = ptp_awards_df.loc[ptp_awards_df['qse_name'].isin(LEADER_QSES)].copy()

# Select data within the timeline presented in Jeff's Power BI Dashboard (Jan 3, 2019 - July 12, 2019)
# NOTE(review): the lower bound below is 2019-01-02 (inclusive), one day earlier
# than the Jan 3 start quoted above - confirm which is intended.
lead_ptp_awards['date'] = pd.to_datetime(lead_ptp_awards['delivery_date'])
mask = (lead_ptp_awards['date'] >= '2019-01-02') & (lead_ptp_awards['date'] < '2019-07-13')
lead_ptp_awards = lead_ptp_awards.loc[mask]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


## Join Market Data, LMP, & Leader Energy Award Tables = join_energy_df

In [6]:
# --- Market conditions + locational prices + node metadata -------------------
_lmp_market_keys = ['marketday', 'hourending', 'peaktype', 'month', 'year']
market_lmp = pd.merge(lmp_df, market_df, how='left', on=_lmp_market_keys)
# Keep the LMP-side timestamp only; drop the market-side duplicate and 'year'.
market_lmp = market_lmp.drop(columns=['datetime_y', 'year'])
market_lmp = market_lmp.rename(columns={'datetime_x': 'datetime'})
market_lmp_nodes = pd.merge(market_lmp, nodes_df, how='left',
                            left_on='settlementpoint', right_on='nodename')

# Align the award column names with the market/price table before joining.
_award_renames = {
    'delivery_date': 'marketday',
    'hour_ending': 'hourending',
    'settlement_point_source': 'settlementpoint_src',
    'settlement_point_sink': 'settlementpoint_snk',
}
lead_ptp_awards = lead_ptp_awards.rename(columns=_award_renames)


def _awards_with_market(point_column):
    """Left-join the leader awards to market/price rows via ``point_column``."""
    return lead_ptp_awards.merge(
        market_lmp_nodes,
        how='left',
        left_on=['marketday', 'hourending', point_column],
        right_on=['marketday', 'hourending', 'settlementpoint'],
    )


# --- PTP Obligation Bid Awards joined once per leg (source, then sink) -------
awards_Src = _awards_with_market('settlementpoint_src')
awards_Snk = _awards_with_market('settlementpoint_snk')

# Stack both legs and discard node/bookkeeping columns not used for modelling.
_unused_columns = ['iso', 'weatherstationid', 'first_dart_date', 'last_dart_date',
                   'equipment', 'voltage', 'substation', 'nodetype', 'zoneid',
                   'nodename', 'objectid', 'date', 'datetime']
model_dta = pd.concat([awards_Src, awards_Snk]).drop(columns=_unused_columns)

# Relabel the Nuevo Laredo station as the Laredo station.
_is_nuevo_laredo = model_dta['nearest_weatherstation'] == 'TM - Nuevo Laredo/Intl'
model_dta.loc[_is_nuevo_laredo, 'nearest_weatherstation'] = 'TX - Laredo/Intl'

In [7]:
# One row per NASDAQ date, with the ETF and Dow Jones columns attached by date.
stocks_df = (
    nasdaq_df
    .merge(etf_df, how='left', left_on='nasdaq_date', right_on='etf_date')
    .merge(dowjones_df, how='left', left_on='nasdaq_date', right_on='dowjones_date')
    .drop(columns=['dowjones_date', 'etf_date'])
    .rename(columns={'nasdaq_date': 'date'})
)
# Re-express the date as an 'mm/dd/YYYY' string.
_parsed_dates = pd.to_datetime(stocks_df['date'], infer_datetime_format=True)
stocks_df['date'] = _parsed_dates.dt.strftime('%m/%d/%Y')

In [8]:
# Station identifier -> weather-station label.
_STATION_NAMES = {
    72267023042: 'TX - Lubbock/Intl',
    72251012924: 'TX - Corpus Christi/Intl',
    72266013962: 'TX - Abilene/Municipal',
    72250012919: 'TX - Brownsville/Intl',
    72351013966: 'TX - Wichita Falls/Sheppard AFB',
    72261022010: 'TX - Del Rio/Intl',
    72265023023: 'TX - Midland-Odessa',
    72253012921: 'TX - San Antonio/Intl',
    72363023047: 'TX - Amarillo/Intl',
    72248013957: 'LA - Shreveport/Regional',
    72263023034: 'TX - San Angelo/Mathis',
    72265623040: 'TX - Wink/Winkler County',
    72258013960: 'TX - Dallas/Love Field',
    72243012960: 'TX - Houston/Intercontinental',
    72261823091: 'TX - Fort Stockton',
    72252012907: 'TX - Laredo/Intl',
    74641013975: 'OK - Gage/Shattuck',
    72259303985: 'TX - Dallas-Fort Worth/Intl',
}

# Stack the three weather extracts; ignore_index avoids duplicate row labels.
weather_df = pd.concat([weather_df_1_trim, weather_df_2_trim, weather_df_3_trim],
                       ignore_index=True)
weather_df['date'] = pd.to_datetime(weather_df['date'], infer_datetime_format=True)

# Keep 2019-01-02 (inclusive) through 2019-07-13 (exclusive). The .copy()
# calls below make each filtered result an independent frame, so the column
# assignments that follow do not write to a slice.
mask = (weather_df['date'] >= '2019-01-02') & (weather_df['date'] < '2019-07-13')
weather_df = weather_df.loc[mask].copy()

# Time of day of each observation, and its hour rounded up to the next whole
# hour (seconds are ignored): hour + 1 whenever the minute is non-zero.
weather_df['hourending'] = weather_df['date'].dt.time
weather_df['hour'] = (weather_df['date'].dt.hour
                      + (weather_df['date'].dt.minute > 0)).astype('int64')

# Observations stamped exactly 00:00 get hour 0 and are dropped here.
weather_df = weather_df.loc[weather_df['hour'] > 0].copy()
weather_df['date'] = weather_df['date'].dt.strftime('%m/%d/%Y')

# Keep only the latest observation within each station/date/hour group.
c_maxes = weather_df.groupby(['station', 'date', 'hour'])['hourending'].transform('max')
weather_df = weather_df.loc[weather_df['hourending'] == c_maxes].copy()

# Replace station identifiers with labels; identifiers not listed become NaN.
weather_df['station'] = weather_df['station'].map(_STATION_NAMES)


In [9]:
# The time-of-day helper column has served its purpose; remove it, then keep
# the first of any fully identical rows.
_without_time = weather_df.drop(columns='hourending')
weather_df = _without_time.drop_duplicates(keep='first')

In [10]:
# Attach the daily stock columns to every weather row that shares the date.
# NOTE(review): fillna(0.00) zero-fills every missing value in the merged
# frame, weather and stock columns alike - confirm that 0 is the intended
# stand-in for missing data.
stocks_weather_df = (
    weather_df
    .merge(stocks_df, how='left', on='date')
    .fillna(0.00)
    .drop_duplicates(keep='first')
)

In [11]:
# (rows, columns) of the merged weather + stock table.
stocks_weather_df.shape

(124992, 31)

In [12]:
# (rows, columns) of the stacked source/sink award table before weather/stock data is added.
model_dta.shape

(9712380, 25)

In [13]:
# Reduce the weather/stock table to one row per (date, hour, station) so the
# left join below cannot multiply award rows.
_weather_keys = ['date', 'hour', 'station']
_weather_lookup = stocks_weather_df.drop_duplicates(_weather_keys)

# Match each award row to the weather/stock row for its market day, hour and
# nearest weather station.
data = pd.merge(
    model_dta,
    _weather_lookup,
    how='left',
    left_on=['marketday', 'hourending', 'nearest_weatherstation'],
    right_on=_weather_keys,
)


In [14]:
# (rows, columns) after the left join with the weather + stock table.
data.shape

(9712380, 56)

### Create Evaluation Criterion: PnL

In [None]:
# Evaluation criterion: PnL per row = (dalmp - rtlmp) * awarded quantity.
# NOTE(review): join_ptp_df is not defined in the cells shown above (the merged
# table built there is called `data`) - confirm where this frame comes from.
_price_spread = join_ptp_df['dalmp'] - join_ptp_df['rtlmp']
join_ptp_df['PnL'] = _price_spread * join_ptp_df['ptp_bid_award_mv']


def _leader_rows(qse_name):
    """Return the rows of join_ptp_df belonging to the given QSE."""
    return join_ptp_df.loc[join_ptp_df['qse_name'] == qse_name]


# One frame per leader so that each can be modelled separately.
leaders_QLUMN = _leader_rows('QLUMN')
leaders_QNRGTX = _leader_rows('QNRGTX')
leaders_QDCENG = _leader_rows('QDCENG')
leaders_QREUEL = _leader_rows('QREUEL')
leaders_QSHELL = _leader_rows('QSHELL')
leaders_QDIRE = _leader_rows('QDIRE')
leaders_QPREC = _leader_rows('QPREC')
leaders_QMONT = _leader_rows('QMONT')
leaders_QWOLFP = _leader_rows('QWOLFP')
leaders_QTIOS = _leader_rows('QTIOS')


In [None]:
# Total PnL per leader. Previously nine of the ten sums were computed and
# thrown away (only QSHELL's was kept); they are now collected in one Series.
# The trailing comments are the totals noted in the original cell.
_leader_frames = {
    'QLUMN': leaders_QLUMN,    # PnL: 4,938,417.14
    'QNRGTX': leaders_QNRGTX,  # 2,245,426.25
    'QDCENG': leaders_QDCENG,  # 1,370,061.43
    'QREUEL': leaders_QREUEL,  # 1,344,345.49
    'QSHELL': leaders_QSHELL,  # 1,322,784.03
    'QDIRE': leaders_QDIRE,    # 1,228,761.44
    'QPREC': leaders_QPREC,    # 1,024,767.48
    'QMONT': leaders_QMONT,    # 979,167.64
    'QWOLFP': leaders_QWOLFP,  # 958,999.75
    'QTIOS': leaders_QTIOS,    # 790,331.15
}
leader_pnl = pd.Series({qse: frame['PnL'].sum() for qse, frame in _leader_frames.items()},
                       name='PnL')
print(leader_pnl)

# `PnL` keeps its original meaning (QSHELL's total) and is still the cell's
# displayed value.
PnL = leaders_QSHELL['PnL'].sum()
PnL


# Model Test 1: Logistic Regression

In [None]:
# Logistic regression is fitted as a generalized linear model (glm) with a
# Binomial family.
#
# The Binomial family needs a binary (0/1) response. Raw PnL is a continuous
# dollar amount, so the outcome is encoded explicitly here, which also makes
# the meaning of the fitted probabilities unambiguous:
#     profitable = 1 when PnL > 0, else 0.
# NOTE(review): the "PnL > 0" threshold is an assumption - confirm the event
# the model is meant to predict.
# NOTE(review): join_energy_df is not defined in the cells shown above -
# confirm which frame should be modelled.
logit_df = join_energy_df.dropna(subset=['PnL']).copy()  # rows without PnL have no outcome
logit_df['profitable'] = (logit_df['PnL'] > 0).astype(int)

# NOTE(review): in a patsy formula `a/b` is the nesting operator (a + a:b),
# not arithmetic division. If the wind-to-load ratio was intended, write it
# as I(ercot_wind_stwpf_orig / ercot_original_load_forecast).
formula = 'profitable ~ ercot_wind_stwpf_orig + ercot_wind_stwpf_orig/ercot_original_load_forecast + ercot_original_load_forecast'
fit1 = smf.glm(formula=formula, data=logit_df, family=sm.families.Binomial()).fit()
print(fit1.summary())

In [None]:
# Scatter the fitted values of fit1 against df_data['cal_q'].
# NOTE(review): df_data / 'cal_q' and the axis labels below do not appear
# anywhere else in the cells shown and look carried over from a different
# analysis - confirm before relying on this plot.
_x_values = df_data['cal_q']
_fitted = fit1.fittedvalues
plt.plot(_x_values, _fitted, 'o')
plt.xlabel('Calendar Quarter')
plt.ylabel('Probability of Cross Purchase');