In [1]:
pip install google-cloud-bigquery

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql.functions import col, min, max, lag, lead, to_json
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

In [3]:
from pyspark.sql import SparkSession

service_account_path = "creds.json"

project_id = "bigdata-421623"

spark = SparkSession.builder \
    .appName("BigQuery Integration") \
    .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.21.0") \
    .config("credentialsFile", service_account_path) \
    .config("parentProject", project_id) \
    .getOrCreate()



:: loading settings :: url = jar:file:/opt/conda/envs/bigdata/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
com.google.cloud.spark#spark-bigquery-with-dependencies_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-418d9dbe-dce7-4c93-8334-d30887956313;1.0
	confs: [default]
	found com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.21.0 in central
:: resolution report :: resolve 399ms :: artifacts dl 14ms
	:: modules in use:
	com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.21.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	----------------------------------------------------------------

In [13]:
table = "bigdata-421623.ForEx_Big_Data.Hourly_Forex"

df = spark.read \
    .format("bigquery") \
    .option("table", table) \
    .load()

In [14]:
df.printSchema()

root
 |-- closing_price: double (nullable = true)
 |-- highest_price: double (nullable = true)
 |-- lowest_price: double (nullable = true)
 |-- transactions: long (nullable = true)
 |-- opening_price: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- volume: long (nullable = true)
 |-- avg_volume_weight: double (nullable = true)
 |-- ticker: string (nullable = true)
 |-- created_at: timestamp (nullable = true)



In [15]:
tickers = df.select("ticker").distinct().rdd.flatMap(lambda x: x).collect()

In [16]:
print(tickers)

['C:JPYUSD', 'C:USDEUR', 'C:EURUSD', 'C:GBPEUR', 'C:USDJPY', 'C:GBPUSD', 'C:EURGBP', 'C:JPYEUR', 'C:EURJPY', 'C:JPYGBP', 'C:GBPJPY', 'C:USDGBP']


In [17]:
df.show(5)

+-------------+-------------+------------+------------+-------------+-------------------+------+-----------------+--------+--------------------+
|closing_price|highest_price|lowest_price|transactions|opening_price|               time|volume|avg_volume_weight|  ticker|          created_at|
+-------------+-------------+------------+------------+-------------+-------------------+------+-----------------+--------+--------------------+
|      1.05685|      1.05772|      1.0564|        6162|      1.05765|2023-02-28 19:00:00|  6162|           1.0568|C:EURUSD|2024-04-28 12:04:...|
|      1.05841|       1.0586|      1.0566|        7986|      1.05684|2023-02-28 20:00:00|  7986|           1.0576|C:EURUSD|2024-04-28 12:04:...|
|      1.05831|      1.05866|       1.058|        4862|      1.05848|2023-02-28 21:00:00|  4862|           1.0584|C:EURUSD|2024-04-28 12:04:...|
|      1.05929|       1.0596|      1.0581|        5067|       1.0583|2023-02-28 22:00:00|  5067|            1.059|C:EURUSD|2024-04

In [18]:
timestamp_extremes = df.select(
    min("time").alias("Earliest Timestamp"),
    max("time").alias("Latest Timestamp")
)

In [19]:
timestamp_extremes.show()

[Stage 8:>                                                          (0 + 1) / 1]

+-------------------+-------------------+
| Earliest Timestamp|   Latest Timestamp|
+-------------------+-------------------+
|2023-02-28 19:00:00|2024-04-26 17:00:00|
+-------------------+-------------------+



                                                                                

# Working on the Top 3 Tickers for Y2024

In [20]:
USDEUR_df = df.filter(col("ticker") == 'C:USDEUR')
USDJPY_df = df.filter(col("ticker") == 'C:USDJPY')
USDGBP_df = df.filter(col("ticker") == 'C:USDGBP')

In [23]:
features = ["closing_price", "highest_price", "lowest_price", "opening_price", "volume", "avg_volume_weight"]

In [28]:
assembler = VectorAssembler(inputCols=features, outputCol="features_vector")
scaler = StandardScaler(inputCol="features_vector", outputCol="scaledFeatures", withMean=True, withStd=True)

In [29]:
pipeline = Pipeline(stages=[assembler, scaler])

## Ticker USDGBP

In [26]:
USDGBP_df.show(10)

+-------------+-------------+------------+------------+-------------+-------------------+------+-----------------+--------+--------------------+
|closing_price|highest_price|lowest_price|transactions|opening_price|               time|volume|avg_volume_weight|  ticker|          created_at|
+-------------+-------------+------------+------------+-------------+-------------------+------+-----------------+--------+--------------------+
|      0.83218|      0.83231|   0.8309098|        6601|        0.831|2023-02-28 19:00:00|  6601|           0.8317|C:USDGBP|2024-04-28 12:12:...|
|       0.8309|      0.83222|      0.8306|        9253|      0.83215|2023-02-28 20:00:00|  9253|           0.8314|C:USDGBP|2024-04-28 12:12:...|
|      0.83151|       0.8322|     0.83076|        3348|      0.83096|2023-02-28 21:00:00|  3348|           0.8315|C:USDGBP|2024-04-28 12:12:...|
|      0.83057|      0.83166|      0.8303|        3407|      0.83152|2023-02-28 22:00:00|  3407|           0.8309|C:USDGBP|2024-04

In [30]:
USDGBP_df = pipeline.fit(USDGBP_df).transform(USDGBP_df)

                                                                                

In [31]:
USDGBP_df.printSchema()

root
 |-- closing_price: double (nullable = true)
 |-- highest_price: double (nullable = true)
 |-- lowest_price: double (nullable = true)
 |-- transactions: long (nullable = true)
 |-- opening_price: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- volume: long (nullable = true)
 |-- avg_volume_weight: double (nullable = true)
 |-- ticker: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- features_vector: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [32]:
USDGBP_df.select("scaledFeatures").show(1)

+--------------------+
|      scaledFeatures|
+--------------------+
|[2.24855329163409...|
+--------------------+
only showing top 1 row



                                                                                

In [33]:
scaledFeatures = USDGBP_df.select("scaledFeatures").collect()
for row in scaledFeatures:
    print(row.scaledFeatures)

[2.2485532916340967,2.2194192093348035,2.208854523993384,2.170771761905083,1.3591289815576844,2.2176304464174397]
[2.163069003084934,2.2134212539523235,2.1881030732322104,2.2475866715438566,2.454042876425777,2.1975866434880262]
[2.2038076093466428,2.2120883749784457,2.198820413844827,2.1680999389611304,0.016084147467976154,2.204267911131164]
[2.1410300849433583,2.176100642683572,2.1680080595835634,2.2055054601765343,0.040443091584573994,2.1641803052723296]
[2.1390265469304817,2.120119725780449,2.165998558218693,2.143385576729526,-0.5066009923898691,2.144136502342916]
[2.1350194709047434,2.1067909360416106,2.141214708052027,2.1320303292177063,0.04993895115545112,2.1307739670566406]
[2.081591790561518,2.1041251780938404,2.080929667106078,2.1367060193696257,0.2699951751240384,2.0906863611978057]
[1.9613795097892621,2.072802522207569,1.9670579230970713,2.079929781810537,1.4140398216849304,2.030554952409558]
[1.9567045877592364,1.9681715227576808,1.9536612473312993,1.9643734394843795,1.8908

In [34]:
windowSpec = Window.orderBy("time")

In [35]:
n_steps_in = 24 
for i in range(1, n_steps_in + 1):
    USDGBP_df = USDGBP_df.withColumn(f"scaledFeatures_t-{i}", lag(col("scaledFeatures"), i).over(windowSpec))

In [36]:
USDGBP_df.printSchema()

root
 |-- closing_price: double (nullable = true)
 |-- highest_price: double (nullable = true)
 |-- lowest_price: double (nullable = true)
 |-- transactions: long (nullable = true)
 |-- opening_price: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- volume: long (nullable = true)
 |-- avg_volume_weight: double (nullable = true)
 |-- ticker: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- features_vector: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)
 |-- scaledFeatures_t-1: vector (nullable = true)
 |-- scaledFeatures_t-2: vector (nullable = true)
 |-- scaledFeatures_t-3: vector (nullable = true)
 |-- scaledFeatures_t-4: vector (nullable = true)
 |-- scaledFeatures_t-5: vector (nullable = true)
 |-- scaledFeatures_t-6: vector (nullable = true)
 |-- scaledFeatures_t-7: vector (nullable = true)
 |-- scaledFeatures_t-8: vector (nullable = true)
 |-- scaledFeatures_t-9: vector (nullable = true)
 |-- scaledFeatures_t-10

In [37]:
n_steps_out = 3

for i in range(1, n_steps_out + 1):
    USDGBP_df = USDGBP_df.withColumn(f"price_t+{i}", lead(col("closing_price"), i).over(windowSpec))

In [38]:
USDGBP_df.printSchema()

root
 |-- closing_price: double (nullable = true)
 |-- highest_price: double (nullable = true)
 |-- lowest_price: double (nullable = true)
 |-- transactions: long (nullable = true)
 |-- opening_price: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- volume: long (nullable = true)
 |-- avg_volume_weight: double (nullable = true)
 |-- ticker: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- features_vector: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)
 |-- scaledFeatures_t-1: vector (nullable = true)
 |-- scaledFeatures_t-2: vector (nullable = true)
 |-- scaledFeatures_t-3: vector (nullable = true)
 |-- scaledFeatures_t-4: vector (nullable = true)
 |-- scaledFeatures_t-5: vector (nullable = true)
 |-- scaledFeatures_t-6: vector (nullable = true)
 |-- scaledFeatures_t-7: vector (nullable = true)
 |-- scaledFeatures_t-8: vector (nullable = true)
 |-- scaledFeatures_t-9: vector (nullable = true)
 |-- scaledFeatures_t-10

In [39]:
USDGBP_df.count()

8315

In [40]:
for i in range(1, n_steps_out + 1):
    USDGBP_df = USDGBP_df.filter(col(f"price_t+{i}").isNotNull())

In [41]:
USDGBP_df.count()

24/04/28 23:07:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:07:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:07:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:07:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

8312

In [42]:
for i in range(1, n_steps_in + 1):
    USDGBP_df = USDGBP_df.filter(col(f"scaledFeatures_t-{i}").isNotNull())

In [43]:
USDGBP_df.count()

24/04/28 23:07:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:07:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:07:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/04/28 23:07:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:07:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

8288

In [44]:
pandas_df = USDGBP_df.toPandas()

24/04/28 23:29:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:29:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:29:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:29:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:29:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [45]:
pandas_df.to_csv('usdgbp.csv', index=False)

## Ticker USDJPY

In [46]:
USDJPY_df = pipeline.fit(USDJPY_df).transform(USDJPY_df)

                                                                                

In [47]:
windowSpec = Window.orderBy("time")

In [48]:
n_steps_in = 24
for i in range(1, n_steps_in + 1):
    USDJPY_df = USDJPY_df.withColumn(f"scaledFeatures_t-{i}", lag(col("scaledFeatures"), i).over(windowSpec))

In [49]:
n_steps_out = 3
for i in range(1, n_steps_out + 1):
    USDJPY_df = USDJPY_df.withColumn(f"price_t+{i}", lead(col("closing_price"), i).over(windowSpec))

In [50]:
for i in range(1, n_steps_out + 1):
    USDJPY_df = USDJPY_df.filter(col(f"price_t+{i}").isNotNull())

for i in range(1, n_steps_in + 1):
    USDJPY_df = USDJPY_df.filter(col(f"scaledFeatures_t-{i}").isNotNull())

In [51]:
pandas_df = USDJPY_df.toPandas()

24/04/28 23:35:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:35:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:35:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:35:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:35:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [52]:
pandas_df.to_csv('usdjpy.csv', index=False)

## Ticker USDEUR

In [53]:
USDEUR_df = pipeline.fit(USDEUR_df).transform(USDEUR_df)

In [54]:
windowSpec = Window.orderBy("time")

In [55]:
n_steps_in = 24
for i in range(1, n_steps_in + 1):
    USDEUR_df = USDEUR_df.withColumn(f"scaledFeatures_t-{i}", lag(col("scaledFeatures"), i).over(windowSpec))

In [56]:
n_steps_out = 3
for i in range(1, n_steps_out + 1):
    USDEUR_df = USDEUR_df.withColumn(f"price_t+{i}", lead(col("closing_price"), i).over(windowSpec))

In [57]:
for i in range(1, n_steps_out + 1):
    USDEUR_df = USDEUR_df.filter(col(f"price_t+{i}").isNotNull())

for i in range(1, n_steps_in + 1):
    USDEUR_df = USDEUR_df.filter(col(f"scaledFeatures_t-{i}").isNotNull())

In [58]:
pandas_df = USDEUR_df.toPandas()

24/04/28 23:40:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:40:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:40:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:40:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/28 23:40:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [59]:
pandas_df.to_csv('usdeur.csv', index=False)