In [None]:
from pyspark.sql import SparkSession
import os
# One name for the service-account key file path; it is used both for the
# GOOGLE_APPLICATION_CREDENTIALS environment variable and for the Hadoop
# keyfile setting below.
service_key_path = 'secrets/serviceKey.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_key_path

# JAR file names for the BigQuery and GCS connectors.
bigquery_connector_jar = "spark-bigquery-connector.jar"
gcs_connector_jar = "gcs-connector.jar"

# Settings applied to the builder before the session is created.
spark_settings = {
    "spark.jars": ",".join([bigquery_connector_jar, gcs_connector_jar]),
    "spark.sql.catalog.spark_bigquery": "com.google.cloud.spark.bigquery.BigQueryCatalog",
    "spark.hadoop.google.cloud.auth.service.account.json.keyfile": service_key_path,
    "spark.bigquery.projectId": "idmpproject-441123",
}

# Build (or reuse) the session with every setting above applied.
session_builder = SparkSession.builder.appName("PySpark with BigQuery and GCS")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session object.
spark

In [None]:
# Fully-qualified BigQuery table id, assembled as <project>.<dataset>.<table>.
project_id = "idmpproject-441123"
dataset_id = "uberFareEstimation"
table_name = "uber_data"
bigquery_table = ".".join((project_id, dataset_id, table_name))

# Load the table through the "bigquery" data source.
df = (
    spark.read
    .format("bigquery")
    .option("table", bigquery_table)
    .load()
)

# Preview rows using show()'s default settings.
df.show()


In [None]:
# Print the schema of the DataFrame loaded from BigQuery.
df.printSchema()

In [None]:
# Shape of the DataFrame: count() gives the row total, len(df.columns) the
# number of columns.
num_rows, num_columns = df.count(), len(df.columns)

print(f"Number of rows: {num_rows}, Number of columns: {num_columns}")


In [None]:
# Read only the rows whose destination equals 'north end' after lower-casing
# and trimming, by handing a SQL query to the BigQuery data source.
df_filtered = (
    spark.read
    .format("bigquery")
    # The connector documents viewsEnabled=true as mandatory when the
    # "query" option is used; without it the read is rejected.
    .option("viewsEnabled", "true")
    # Dataset in which the connector materializes the query result (required
    # by older connector versions).
    # NOTE(review): assumes the service account may create tables in this
    # dataset — confirm, or point this at a dedicated scratch dataset.
    .option("materializationDataset", "uberFareEstimation")
    .option("query", """
    SELECT * 
    FROM `idmpproject-441123.uberFareEstimation.uber_data`
    WHERE TRIM(LOWER(destination)) = 'north end'
    """)
    .load()
)

In [None]:
# Keep rows whose destination is exactly "North End" and whose price is above 50.
is_north_end = df["destination"] == "North End"
is_over_50 = df["price"] > 50
df_filtered1 = df.where(is_north_end & is_over_50)

In [None]:
# Display rows of the filtered DataFrame.
df_filtered1.show()