# README
<p>This notebook reads in the raw data (stored in DBFS) and writes it to a data lake.</p>

# Reading In The Data

In [0]:
import pyspark.sql.functions as F

In [0]:
# Load the raw GHarchive JSON event files for 2022-01-14 from DBFS.
# The source glob and the sample size are named here so they can be tuned in
# one place instead of being buried in the reader chain.
RAW_EVENTS_GLOB = 'dbfs:/FileStore/tables/GHarchive/2022-01-14-*.json.gz'
SAMPLE_ROW_LIMIT = 1000  # set to None to keep every row

# The reader previously also set "header" and "inferColumnTypes". "header" is a
# CSV option and "inferColumnTypes" is not among the documented JSON reader
# options; batch JSON reads already infer column types, so both were dropped.
df_bronze = spark.read.format('json').load(RAW_EVENTS_GLOB)
if SAMPLE_ROW_LIMIT is not None:
    df_bronze = df_bronze.limit(SAMPLE_ROW_LIMIT)

In [0]:
# Prefix the event-level `id` and `created_at` with "header_" before the nested
# structs (which carry their own `id` fields) are flattened further down.
df_bronze = (
    df_bronze
    .withColumnRenamed("id", "header_id")
    .withColumnRenamed('created_at', 'header_created_at')
)

# Getting Data Ready For Writing

## Flattening Nested Structs

In [0]:
# Expand the `actor` struct into top-level columns; the remaining columns are
# carried through as they are.
df_bronze = df_bronze.select(
    "actor.*",
    "header_created_at",
    "header_id",
    "repo",
    "org",
    "payload",
    "public",
    "type",
)

In [0]:
# Give the columns that came out of `actor.*` an "actor_" prefix so they do not
# clash with the same field names inside `repo` and `org`.
df_bronze = (
    df_bronze
    .withColumnRenamed("id", "actor_id")
    .withColumnRenamed("url", "actor_url")
    .withColumnRenamed("avatar_url", "actor_avatar_url")
    .withColumnRenamed("gravatar_id", "actor_gravatar_id")
    .withColumnRenamed("login", "actor_login")
)

In [0]:
# Expand the `repo` struct in place, keeping the already-flattened actor and
# header columns ahead of it.
df_bronze = df_bronze.select(
    'actor_avatar_url',
    'display_login',
    'actor_gravatar_id',
    'actor_id',
    'actor_login',
    'actor_url',
    'header_created_at',
    'header_id',
    'repo.*',
    'org',
    'payload',
    'public',
    'type',
)

In [0]:
# Prefix the repo-level `id` and `url`; the `name` column is left as it is.
df_bronze = (
    df_bronze
    .withColumnRenamed('id', 'repo_id')
    .withColumnRenamed("url", "repo_url")
)

In [0]:
# Expand the `org` struct, the last nested struct flattened in this section.
df_bronze = df_bronze.select(
    'actor_avatar_url',
    'display_login',
    'actor_gravatar_id',
    'actor_id',
    'actor_login',
    'actor_url',
    'header_created_at',
    'header_id',
    'repo_id',
    'name',
    'repo_url',
    'org.*',
    'payload',
    'public',
    'type',
)

In [0]:
# Prefix every column that came out of `org.*` with "org_".
df_bronze = df_bronze.withColumnRenamed('id', 'org_id')
df_bronze = df_bronze.withColumnRenamed("url", "org_url")
df_bronze = df_bronze.withColumnRenamed('login', 'org_login')
df_bronze = df_bronze.withColumnRenamed("avatar_url", "org_avatar_url")
df_bronze = df_bronze.withColumnRenamed('gravatar_id', 'org_gravatar_id')

## Solidifying Date for Partitioning

In [0]:
# header_created_at (originally `created_at`) holds an ISO-8601 string,
# for example: 2022-01-01T17:00:17Z
#   T separates the date from the time of day
#   Z means the time is in UTC

In [0]:
# Parse `header_created_at` with an explicit pattern and re-render it into
# `date_created`, then take the day of the month as `day_created`.
# The trailing Z is quoted in the pattern, so it is matched as a literal
# character rather than interpreted as a zone offset.
# NOTE(review): `day_created` is only the day of the month, so the same value
# recurs every month — confirm that is acceptable for the partition column.
df_bronze = (
    df_bronze
    .withColumn(
        'date_created',
        F.from_unixtime(
            F.unix_timestamp('header_created_at', "yyyy-MM-dd'T'HH:mm:ss'Z'")
        ),
    )
    .withColumn('day_created', F.dayofmonth('date_created'))
)

In [0]:
# Sanity check: list the distinct day values that will become partitions.
(
    df_bronze
    .select('day_created')
    .distinct()
    .show()
)

+-----------+
|day_created|
+-----------+
|         14|
+-----------+



# Partitioning And Writing The Data To A Data Lake
Separating days into 80-140 MB partitions

In [0]:
# Destination directory (on DBFS) for the bronze-layer parquet output.
bronze_path: str = "dbfs:/FileStore/JA/Bronze_layer_Test_2"

In [0]:
# Azure storage settings.
# The container name changes per team: 'team2-project2', 'team3-project2', etc.
contname = 'team5-project2'
storage_acct_name = '20230821desa'

# Identifiers of the application used to authenticate.
client_id = 'de4ff859-02b1-4e2f-9d16-b578fa03df4f'  # application (app) id
tenant_id = '33da9f3f-4c1a-4640-8ce1-3f63024aea1d'  # directory id

# The credential is read from a Databricks secret scope instead of being
# written into the notebook.
service_credential = dbutils.secrets.get(
    scope="databricks-app-kv",
    key="databricks-application",
)

In [0]:
# Spread the rows over 25 partitions, then write them as parquet under
# `bronze_path`, one sub-directory per `day_created` value and at most
# 143110 records per file.
# NOTE(review): mode('overwrite') may replace everything already under
# `bronze_path`, not only the day being written, unless dynamic partition
# overwrite is enabled — confirm this is intended before loading more days.
df_bronze = df_bronze.repartition(25)
(
    df_bronze.write
    .format('parquet')
    .partitionBy('day_created')
    .option("maxRecordsPerFile", 143110)
    .mode('overwrite')
    .save(bronze_path)  # bronze_path is already a string; no f-string needed
)