In [21]:
from pyspark.sql import SparkSession
import os

spark = SparkSession \
        .builder \
        .appName('Access GCS') \
        .getOrCreate()

spark._jsc.hadoopConfiguration() \
    .set("google.cloud.auth.service.account.json.keyfile","/.google/credentials/google_credentials_project.json")


BUCKET = os.getenv('GCP_GCS_BUCKET')




Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/10 19:47:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
def load_to_bigquery(table, date):
    path=f"gs://{BUCKET}/BigQuery/{table}-{date}/*.parquet"
    df = spark.read.parquet(path, header = True)
    df.write.format('bigquery') \
      .option('table', 'wordcount_dataset.wordcount_output') \
      .save()

In [13]:
from google.cloud import storage
import pandas as pd
import os
import subprocess

# Initialise a client
storage_client = storage.Client("DE-stack-overflow")
# Create a bucket object for our bucket
bucket = storage_client.get_bucket('dtc_data_lake_de-stack-overflow')
# all files in the bucket 
files = list(bucket.list_blobs())
files = [blob.name for blob in files if 'BigQuery/' in blob.name]
files = [file for file in files if '.parquet' in file]

In [39]:
tables = ['badges', 'posts_questions', 'posts_answers', 'users']

In [45]:
total = 0
for table in tables:
    uris = [f'gs://{BUCKET}/'+ file for file in files if table in file]
    total += len(uris)/4
    print(table, len(uris)/4)
total

badges 164.0
posts_questions 165.0
posts_answers 164.5
users 165.0


658.5

In [50]:
path=rf"gs://{BUCKET}/BigQuery/posts_questions-*/*.parquet"
df = spark.read.parquet(path, header = True)

                                                                                

In [52]:
df.createOrReplaceTempView('posts_questions')

In [56]:
# What is the percentage of questions that have been answered over the years?
spark.sql('''
SELECT
  EXTRACT(YEAR FROM creation_date) AS Year,
  COUNT(*) AS Number_of_Questions,
  ROUND(100 * SUM(IF(answer_count > 0, 1, 0)) / COUNT(*), 1) AS Percent_Questions_with_Answers
FROM
  posts_questions
GROUP BY
  Year
ORDER BY
  Year
    ''').show()



+----+-------------------+------------------------------+
|Year|Number_of_Questions|Percent_Questions_with_Answers|
+----+-------------------+------------------------------+
|2008|              57755|                          99.9|
|2009|             342048|                          99.6|
|2010|             463005|                          99.1|
|2011|             573539|                          97.3|
|2012|            1441630|                          94.9|
|2013|            1409097|                          92.4|
|2014|            1566938|                          88.9|
|2015|            1982395|                          87.0|
|2016|            2056943|                          85.3|
|2017|            2118252|                          83.8|
|2018|            1891231|                          82.6|
|2019|            1769797|                          82.2|
|2020|            1605271|                          81.0|
|2021|            1557939|                          66.3|
|2022|        

                                                                                

In [90]:
path_users=rf"gs://{BUCKET}/BigQuery/users-*/*.parquet"
df_users = spark.read.parquet(path_users, header = True)
path_badges=rf"gs://{BUCKET}/BigQuery/badges-*/*.parquet"
df_badges = spark.read.parquet(path_badges, header = True)
path_answers=rf"gs://{BUCKET}/BigQuery/posts_answers-*/*.parquet"
df_answers = spark.read.parquet(path_answers, header = True)

                                                                                

In [91]:
df_users.createOrReplaceTempView('users')
df_badges.createOrReplaceTempView('badges')
df_answers.createOrReplaceTempView('posts_answers')

In [80]:
# What is the reputation and badge count of users across different tenures on StackOverflow?
spark.sql('''
SELECT user_Tenure,
       COUNT(1) AS Num_Users,
       ROUND(AVG(reputation)) AS Avg_Reputation,
       ROUND(AVG(num_badges)) AS Avg_Num_Badges
FROM (
  SELECT users.id AS user,
          ROUND(EXTRACT(DAY FROM CURRENT_TIMESTAMP()-MIN(users.creation_date))/365) AS user_tenure,
         MIN(users.reputation) AS reputation,
         SUM(IF(badges.user_id IS NULL, 0, 1)) AS num_badges
  FROM users
  LEFT JOIN badges
  ON users.id = badges.user_id
  GROUP BY user
)
GROUP BY User_Tenure
ORDER BY User_Tenure
    ''').show()

22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:31:51 WARN RowBasedKeyValueBatch: Calling spill() on

22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:15 WARN RowBasedKeyValueBatch: Calling spill() on

22/04/10 21:32:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:32:46 WARN RowBasedKeyValueBatch: Calling spill() on

+-----------+---------+--------------+--------------+
|user_Tenure|Num_Users|Avg_Reputation|Avg_Num_Badges|
+-----------+---------+--------------+--------------+
|        0.0|  1246396|           2.0|           0.0|
|        1.0|  2609749|           4.0|           1.0|
|        2.0|  2120847|           8.0|           1.0|
|        3.0|  1665536|          14.0|           1.0|
|        4.0|  1664136|          21.0|           1.0|
|        5.0|  1704799|          30.0|           2.0|
|        6.0|  1442345|          50.0|           2.0|
|        7.0|  1226532|          76.0|           3.0|
|        8.0|  1169934|         108.0|           4.0|
|        9.0|  1053102|         175.0|           5.0|
|       10.0|   574246|         413.0|           9.0|
|       11.0|   330209|         784.0|          14.0|
|       12.0|   178045|        1395.0|          20.0|
|       13.0|    53740|        4832.0|          49.0|
|       14.0|    13806|       10498.0|          76.0|
+-----------+---------+-----

                                                                                

In [82]:
# What are 10 of the “easier” gold badges to earn?
spark.sql('''
SELECT badge_name AS First_Gold_Badge, 
       COUNT(1) AS Num_Users,
       ROUND(AVG(tenure_in_days)) AS Avg_Num_Days
FROM
(
  SELECT 
    badges.user_id AS user_id,
    badges.name AS badge_name,
    EXTRACT(DAY FROM badges.date - users.creation_date) AS tenure_in_days,
    ROW_NUMBER() OVER (PARTITION BY badges.user_id
                       ORDER BY badges.date) AS row_number
  FROM 
    badges
  JOIN
    users
  ON badges.user_id = users.id
  WHERE badges.class = 1 
) 
WHERE row_number = 1
GROUP BY First_Gold_Badge
ORDER BY Num_Users DESC
LIMIT 10
    ''').show()



+----------------+---------+------------+
|First_Gold_Badge|Num_Users|Avg_Num_Days|
+----------------+---------+------------+
| Famous Question|   392661|      1587.0|
|         Fanatic|    26818|       848.0|
|    Great Answer|    23796|      1921.0|
|     Unsung Hero|    20557|       811.0|
|      Electorate|    11631|      1201.0|
|        Populist|    10509|      1680.0|
|       Publicist|     2515|      2386.0|
|         Steward|     1547|      1199.0|
|  Great Question|      710|       875.0|
|     Copy Editor|      342|       743.0|
+----------------+---------+------------+



                                                                                

In [92]:
# Which day of the week has most questions answered within an hour?
spark.sql('''
SELECT
  question_day,
  COUNT(answer_id) as count_answers,
  COUNT(question_id) as count_questions,
  ROUND(COUNT(answer_id)/COUNT(question_id)*100,2) as percent_questions
FROM
  (SELECT
    EXTRACT(DAYOFWEEK from posts_questions.creation_date) as question_day,
    posts_questions.accepted_answer_id as answer_id,
    posts_questions.id as question_id
   FROM 
    posts_questions
   FULL JOIN
    posts_answers ON posts_questions.id = posts_answers.parent_id
   WHERE
     posts_answers.creation_date < posts_questions.creation_date + INTERVAL '1 hour')
GROUP BY
  question_day
ORDER BY 
  question_day
    ''').show()

22/04/10 21:53:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:09 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/04/10 21:53:12 WARN RowBasedKeyValueBatch: Calling spill() on

+------------+-------------+---------------+-----------------+
|question_day|count_answers|count_questions|percent_questions|
+------------+-------------+---------------+-----------------+
|           1|       800224|        1174080|            68.16|
|           2|      1528772|        2260068|            67.64|
|           3|      1685876|        2491307|            67.67|
|           4|      1725322|        2547977|            67.71|
|           5|      1704026|        2518918|            67.65|
|           6|      1516098|        2241172|            67.65|
|           7|       792544|        1167582|            67.88|
+------------+-------------+---------------+-----------------+



                                                                                