In [36]:
import requests
import pandas as pd
import seaborn as sns
from pyspark.sql import functions as F
from pyspark.sql.functions import col


In [37]:
import json

with open('secrets.json', 'r') as file:
    secret_data = json.load(file)



In [38]:
api_data_start = "2020-01-01"

api_key_arg = "&api_key=" + secret_data.get("api_key")
api_start_arg = "&start=" + api_data_start

In [39]:
###data_endpoint = "https://api.eia.gov/v2/electricity/rto/daily-region-sub-ba-data/data/?frequency=daily&data[0]=value&start=2025-03-15&sort[0][column]=period&sort[0][direction]=desc&offset=0"

In [40]:
data_endpoint = "https://api.eia.gov/v2/electricity/rto/daily-region-data/data/?frequency=daily&data[0]=value&facets[respondent][]=NY&facets[timezone][]=Eastern&facets[type][]=D&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000"

In [41]:

response = requests.get(data_endpoint+api_start_arg+api_key_arg)

if response.status_code == 200:
    data = response.json()
    print("request succeeded")

else:
    print(f"request failed, code:  {response.status_code}")

request succeeded


In [42]:
raw_df = pd.DataFrame(data.get("response").get("data"))


In [43]:
raw_df.to_csv('raw-data.csv')

In [45]:
from pyspark.sql import SparkSession
import os

os.system("hdfs dfs -put -f ./raw-data.csv /staging/")


spark = SparkSession.builder \
    .appName("Power Demand Tracker") \
    .getOrCreate()

df_spark = spark.read.csv('hdfs:///staging/raw-data.csv', header=True, inferSchema=True)

df_spark.show()



+---+----------+----------+---------------+----+---------+--------+--------------------+------+-------------+
|_c0|    period|respondent|respondent-name|type|type-name|timezone|timezone-description| value|  value-units|
+---+----------+----------+---------------+----+---------+--------+--------------------+------+-------------+
|  0|2025-03-30|        NY|       New York|   D|   Demand| Eastern|             Eastern|360251|megawatthours|
|  1|2025-03-29|        NY|       New York|   D|   Demand| Eastern|             Eastern|356584|megawatthours|
|  2|2025-03-28|        NY|       New York|   D|   Demand| Eastern|             Eastern|373815|megawatthours|
|  3|2025-03-27|        NY|       New York|   D|   Demand| Eastern|             Eastern|371568|megawatthours|
|  4|2025-03-26|        NY|       New York|   D|   Demand| Eastern|             Eastern|383131|megawatthours|
|  5|2025-03-25|        NY|       New York|   D|   Demand| Eastern|             Eastern|378674|megawatthours|
|  6|2025-

25/03/31 22:13:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , period, respondent, respondent-name, type, type-name, timezone, timezone-description, value, value-units
 Schema: _c0, period, respondent, respondent-name, type, type-name, timezone, timezone-description, value, value-units
Expected: _c0 but found: 
CSV file: hdfs://localhost:9000/staging/raw-data.csv


In [46]:


filtered_spark_df = df_spark.select(
    col("period").alias("Date"),
    col("value").alias("MWh")
)

In [47]:
filtered_spark_df.show()

+----------+------+
|      Date|   MWh|
+----------+------+
|2025-03-30|360251|
|2025-03-29|356584|
|2025-03-28|373815|
|2025-03-27|371568|
|2025-03-26|383131|
|2025-03-25|378674|
|2025-03-24|398827|
|2025-03-23|351612|
|2025-03-22|354942|
|2025-03-21|360106|
|2025-03-20|365904|
|2025-03-19|354325|
|2025-03-18|367109|
|2025-03-17|383316|
|2025-03-16|350585|
|2025-03-15|353508|
|2025-03-14|362794|
|2025-03-13|374367|
|2025-03-12|380035|
|2025-03-11|360839|
+----------+------+
only showing top 20 rows



In [15]:
#df = raw_df[['period', 'value']]

In [52]:
features_df = filtered_spark_df.withColumn("Day", F.dayofweek("Date"))
features_df = features_df.withColumn("Month", F.month("Date"))
features_df.show()

+----------+------+---+-----+
|      Date|   MWh|Day|Month|
+----------+------+---+-----+
|2025-03-30|360251|  1|    3|
|2025-03-29|356584|  7|    3|
|2025-03-28|373815|  6|    3|
|2025-03-27|371568|  5|    3|
|2025-03-26|383131|  4|    3|
|2025-03-25|378674|  3|    3|
|2025-03-24|398827|  2|    3|
|2025-03-23|351612|  1|    3|
|2025-03-22|354942|  7|    3|
|2025-03-21|360106|  6|    3|
|2025-03-20|365904|  5|    3|
|2025-03-19|354325|  4|    3|
|2025-03-18|367109|  3|    3|
|2025-03-17|383316|  2|    3|
|2025-03-16|350585|  1|    3|
|2025-03-15|353508|  7|    3|
|2025-03-14|362794|  6|    3|
|2025-03-13|374367|  5|    3|
|2025-03-12|380035|  4|    3|
|2025-03-11|360839|  3|    3|
+----------+------+---+-----+
only showing top 20 rows



In [48]:

#features_df.write.mode("overwrite").option("header", True).csv("hdfs:///unscaled/")

In [51]:
#df_spark_test = spark.read.option("header", True).option("inferSchema", True).csv("hdfs:///unscaled/")
#df_spark_test.show()


##hdfs dfs -getmerge /unscaled/ ./unscaled-data.csv

In [23]:
## RNN

In [24]:
#RNN_df = features_df.select("Date", "MWh")

In [74]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType

def scale_features(df):
    this_df = df
    
    for index, column in enumerate(df.columns):
        
        if column != "Date":
        
            assembler = VectorAssembler(inputCols=[column], outputCol="feature_"+column)

            df_vec = assembler.transform(this_df)
            
            print(df_vec)


            scaler = MinMaxScaler(inputCol="feature_"+column, outputCol="scaled_"+column)
            scaler_model = scaler.fit(df_vec)
            scaled_df = scaler_model.transform(df_vec)


            extract_element = udf(lambda v: float(v[0]), DoubleType())
            
            print("scaled_"+column)

            this_df = scaled_df.withColumn("scaled_"+column, extract_element(col("scaled_"+column)))

    return this_df


In [75]:
scaled_features_df = scale_features(features_df)

DataFrame[Date: date, MWh: int, Day: int, Month: int, feature_MWh: vector]
scaled_MWh
DataFrame[Date: date, MWh: int, Day: int, Month: int, feature_MWh: vector, scaled_MWh: double, feature_Day: vector]
scaled_Day
DataFrame[Date: date, MWh: int, Day: int, Month: int, feature_MWh: vector, scaled_MWh: double, feature_Day: vector, scaled_Day: double, feature_Month: vector]
scaled_Month


In [77]:
filtered_scaled_feature = scaled_features_df.select("Date", "scaled_MWh", "scaled_Day", "scaled_Month")

In [78]:
filtered_scaled_feature.show()

[Stage 61:>                                                         (0 + 1) / 1]

+----------+-------------------+-------------------+-------------------+
|      Date|         scaled_MWh|         scaled_Day|       scaled_Month|
+----------+-------------------+-------------------+-------------------+
|2025-03-30|0.16638138061417404|                0.0|0.18181818181818182|
|2025-03-29|0.15467081395942978|                1.0|0.18181818181818182|
|2025-03-28|0.20969802258443618| 0.8333333333333333|0.18181818181818182|
|2025-03-27|0.20252222676408974| 0.6666666666666666|0.18181818181818182|
|2025-03-26|0.23944867405855605|                0.5|0.18181818181818182|
|2025-03-25|0.22521524193960452| 0.3333333333333333|0.18181818181818182|
|2025-03-24|0.28957385928159013|0.16666666666666666|0.18181818181818182|
|2025-03-23|0.13879272903786216|                0.0|0.18181818181818182|
|2025-03-22|0.14942708599458382|                1.0|0.18181818181818182|
|2025-03-21|0.16591832302897144| 0.8333333333333333|0.18181818181818182|
|2025-03-20|0.18443423943590007| 0.6666666666666666

                                                                                

In [81]:
filtered_scaled_feature.write.mode("overwrite").option("header", True).csv("hdfs:///transformed/")

                                                                                

In [None]:
########## hdfs dfs -getmerge /transformed/ ./transformed.csv

In [None]:
df_spark_test = spark.read.option("header", True).option("inferSchema", True).csv("hdfs:///transformed/")
df_spark_test.show()

In [35]:
#RNN_df_scaled.write.mode("overwrite").option("header", True).csv("hdfs:///transformed/")

                                                                                

In [None]:
########## hdfs dfs -getmerge /transformed/ ./transformed.csv

To install keras and tensorflow


sudo apt update
sudo apt install python3-venv python3-pip -y

python3 -m venv keras-env
source keras-env/bin/activate

pip install --upgrade pip
pip install keras tensorflow


source keras-env/bin/activate
source /home/hduser/Documents/GitHub/CCT-SEM2-CA1/SEM2-CA--HADOOP-SPARK/keras-env/bin/activate



pip install jupyter ipykernel

python -m ipykernel install --user --name=keras-env --display-name="Python (keras-env)"

In [None]:
# Spark Session, Pipeline, Functions, and Metrics
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics






##pip install keras tensorflow


In [None]:
# Keras / Deep Learning
#from keras.models import Sequential
#from keras.layers.core import Dense, Dropout, Activation
#from keras import optimizers, regularizers
#from keras.optimizers import Adam


In [None]:
# Elephas for Deep Learning on Spark
#from elephas.ml_model import ElephasEstimator

In [None]:
#df.dtypes

In [None]:
#df['value'] = pd.to_numeric(df['value'], errors='coerce')
#df['period'] = pd.to_datetime(df['period'])
#df.set_index('period', inplace=True)

In [None]:
#import matplotlib.pyplot as plt
#import matplotlib.dates as mdates



#fig, ax = plt.subplots()
#df.plot(ax=ax)

#ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))

#ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

#plt.xticks(rotation=45)
#plt.tight_layout()
#plt.show()


In [None]:
#sns.pairplot(df)

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()

In [None]:
#df.shape

In [None]:
#test_df = df.copy()

In [None]:
#training_data = test_df.iloc[:,0].values

In [None]:
#training_data = scaler.fit_transform(training_data.reshape(-1, 1))

In [None]:
#scaler.fit(df)
#scaled_data = scaler.transform(df)

In [None]:
#sns.pairplot(scaled_data)

In [None]:
#training_data.shape

In [None]:
#new_test_df = pd.DataFrame(training_data)

In [None]:
#sns.pairplot(new_test_df)

Save Data to HDFS

In [None]:
## Save df as csv

##df.to_csv('test-data.csv')

In [None]:
## upload to Hadoop

#import os

#os.system("hdfs dfs -put -f ./test-data.csv /testdata/")


In [None]:
#

In [None]:
#from pyspark.sql import SparkSession
#import os

#os.system("hdfs dfs -put -f ./test-data.csv /testdata/")


#park = SparkSession.builder \
   # .appName("Power Demand Tracker") \
 #  .getOrCreate()

#df_spark = spark.read.csv('hdfs:///testdata/test-data.csv', header=True, inferSchema=True)

#df_spark.show()

In [None]:
#df_spark