# README
<p>This noteboook aims to pull from the data lake and answer the use case aggregations as outlined in the Section labeled "Questions"</p>


# Imports

In [0]:
from pyspark.sql.functions import *


## Loading in Data

In [0]:
contname = 'team5-project2' #azure storage account container
#THIS WILL CHANGE BASED ON TEAM: 'team2-project2', 'team3-project2', etc.

storage_acct_name = '20230821desa'

client_id = 'de4ff859-02b1-4e2f-9d16-b578fa03df4f' #aka: app id

tenant_id = '33da9f3f-4c1a-4640-8ce1-3f63024aea1d' #aka: directory id

service_credential = dbutils.secrets.get(scope="databricks-app-kv",key="databricks-application")

gold_path = f'abfss://{contname}@{storage_acct_name}.dfs.core.windows.net/GoldLayer'

In [0]:
action = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/factAction/part-*")).limit(100000)
action.createOrReplaceTempView('action')

In [0]:
timestamp = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/dimTimestamp/part-*"))
timestamp.createOrReplaceTempView('timestamp')

In [0]:
commit = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/dimCommits/part-*")).limit(100000)
commit.createOrReplaceTempView('commit')

In [0]:
linker_commits_action = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/linker_commits_action/part-*")).limit(100000)
linker_commits_action.createOrReplaceTempView('linker_commits_action')

In [0]:
pull_request = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/dimPullRequests/part-*")).limit(10000)
pull_request.createOrReplaceTempView('pull_request')

In [0]:
repo = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/dimRepos/part-*")).limit(100000)
repo.createOrReplaceTempView('repo')

In [0]:
repo_names = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/repo_names/part-*")).limit(100000)
repo_names.createOrReplaceTempView('repo_names')

In [0]:
user = (spark.read.format('parquet')
      .option("header","true")
      .option('inferColumnTypes', True)
      .load(f"{gold_path}/dimUser/part-*")).limit(100000)
user.createOrReplaceTempView('user')

# Questions

Once the data has been preprocessed in the silver layer, you have several aggregations that need to be done for the gold layer. Sparrow Analytics will load your gold layer into a Data Warehouse (and, eventually a BI tool) to provide insights into GitHub use patterns. Data loaded into the gold layer should adhere to a flat, star, or snowflake schema. You are tasked with the following aggregations: 

Data aggregated by type of GitHub event per hour 
PushEvent data aggregated by ref type – whether the commit is on the main branch 
Breakdown of events by type and number of commits per event 
User activity should be aggregated so that a filterable chart can be populated with breakdowns of user activity by week or month. 
Breakdown of activity by project – find a unique use case 
Challenge: Based on the commit messages – breakdown the events by language 
<ul>
  <li>Data aggregated by type of GitHub event per hour ?</li>
  <li>PushEvent data aggregated by ref type – whether the commit is on the main branch</li>
  <li>Breakdown of events by type and number of commits per event </li>
  <li>User activity should be aggregated so that a filterable chart can be populated with breakdowns of user activity by week or month. </li>
  <li>Breakdown of activity by project – find a unique use case </li>
  <li>Challenge: Based on the commit messages – breakdown the events by language </li>
</ul>

## Per Hour
<p>Group the number of Events per hour</P>

In [0]:
#answer
action.join(timestamp, action.time_id == timestamp.time_id).groupBy('year_created','day_created','hour_created','action_type').count().sort('hour_created',ascending=False).display()

year_created,day_created,hour_created,action_type,count
2022,1,23,PushEvent,16
2022,1,23,IssueCommentEvent,3
2022,1,23,CreateEvent,9
2022,1,23,WatchEvent,1
2022,1,23,PullRequestReviewEvent,2
2022,1,23,PullRequestEvent,3
2022,2,23,PushEvent,28
2022,2,23,DeleteEvent,1
2022,2,23,CreateEvent,6
2022,2,23,WatchEvent,1


## Main branch? (By Event Type)
<p>Determine whether or not the commit was on the main / master branch</p>

In [0]:
%sql
-- Grabs the pull_request_id and whether or not the base_name is on main from pull_request
-- Then joins action on pull_request_id.
-- Finally groups by the event (action)
SELECT a.action,
 count(t1.Bool) as Count_Per_Event_on_main
FROM (
  SELECT pull_request_id, 
    CASE WHEN pull_request_base_name LIKE '%master'  OR pull_request_base_name like '%main'
      THEN TRUE 
    ELSE FALSE
  END as Bool
  FROM pull_request
) t1
JOIN action a on t1.pull_request_id == a.action_pull_request_id
GROUP BY a.action

action,Count_Per_Event_on_main
closed,56
created,1675
reopened,4


## Breakdown of events by type and number of commits per event
<p>Counts the number of commits per event type</p>

In [0]:
#answer, they are all Push Events
commit_linked = commit.join(linker_commits_action,commit.commit_url == linker_commits_action.commit_url,'inner')
commit_linked.join(action, commit_linked.action_id == action.action_id, 'inner').groupBy('action_type').count().display()

action_type,count
PushEvent,23817936


## User activity should be aggregated so that a filterable chart can be populated with breakdowns of user activity by week or month

In [0]:
%sql
SELECT month_created, count(action.action_actor_id) FROM user JOIN action
ON user.user_id == action.action_actor_id
JOIN timestamp on timestamp.time_id == action.time_id
group by month_created

month_created,count(action_actor_id)
1,1479


In [0]:
%sql
SELECT number_of_week_created, count(action.action_actor_id) FROM user JOIN action
ON user.user_id == action.action_actor_id
JOIN timestamp on timestamp.time_id == action.time_id
group by number_of_week_created

number_of_week_created,count(action_actor_id)
1,340
52,58
3,335
5,48
4,371
2,327


In [0]:
%sql
SELECT month_created, number_of_week_created, count(t1.action_actor_id) 
FROM (user JOIN action
ON user.user_id == action.action_actor_id
JOIN timestamp on timestamp.time_id == action.time_id) t1
group by number_of_week_created, month_created

month_created,number_of_week_created,count(action_actor_id)
1,3,335
1,1,340
1,2,327
1,52,58
1,5,48
1,4,371


Databricks visualization. Run in Databricks to view.

## Breakdown of activity by project – find a unique use case

In [0]:
#How many actions of each type for each repo_url
action.join(repo,action.action_repo_id == repo.repo_id,'inner').groupBy('repo_url','action_type').count().sort('repo_url').withColumnRenamed('count','Total Actions').display()

repo_url,action_type,Total Actions
https://api.github.com/repos/0xStefan214/javascript-files,PushEvent,1
https://api.github.com/repos/0xsequence/erc-1155,WatchEvent,1
https://api.github.com/repos/117010130/hello-world,PushEvent,1
https://api.github.com/repos/1Password/scim-examples,PullRequestReviewEvent,1
https://api.github.com/repos/4vent/ArcadeTools,PushEvent,1
https://api.github.com/repos/8fn/exercicios-php,PublicEvent,1
https://api.github.com/repos/ABHINAV-GOPINADH/java-expt-2,CreateEvent,1
https://api.github.com/repos/AIAcademyBatchH/TensorFlow_ANN,PushEvent,1
https://api.github.com/repos/AIT-LAHCEN/Extranet_RH,CreateEvent,1
https://api.github.com/repos/Abh1shekSingh/Coursera_Solution,CreateEvent,1


In [0]:
#Repos with two or more actions
action.join(repo,action.action_repo_id == repo.repo_id,'inner').groupBy('repo_url').count().sort('repo_url').filter('count > 1').withColumnRenamed('count','Total Actions').display()

repo_url,Total Actions
https://api.github.com/repos/BornToBeRoot/NETworkManager,2
https://api.github.com/repos/CSeanXu/b1llion,6
https://api.github.com/repos/CaeCur/lights-out,2
https://api.github.com/repos/Codecademy/gamut,4
https://api.github.com/repos/DataDog/documentation,4
https://api.github.com/repos/DimensionDev/Maskbook,3
https://api.github.com/repos/DonQuixoteJoker/Wabo,2
https://api.github.com/repos/EthereansOS/ITEMS-swap,2
https://api.github.com/repos/GuruCICDCanary-Prod-Release/CICDCanary,13
https://api.github.com/repos/Jason2866/tmp_copy,2


In [0]:
#Actions per month for each repo_url
resault = action.join(repo,action.action_repo_id == repo.repo_id,'inner')
resault.join(timestamp, resault.time_id == timestamp.time_id,'inner').groupBy('repo_url','day_created').count().sort('repo_url','day_created').display()

repo_url,day_created,count
https://api.github.com/repos/0xStefan214/javascript-files,12,1
https://api.github.com/repos/0xsequence/erc-1155,24,1
https://api.github.com/repos/117010130/hello-world,17,1
https://api.github.com/repos/1Password/scim-examples,19,1
https://api.github.com/repos/4vent/ArcadeTools,31,1
https://api.github.com/repos/8fn/exercicios-php,17,1
https://api.github.com/repos/ABHINAV-GOPINADH/java-expt-2,25,1
https://api.github.com/repos/AIAcademyBatchH/TensorFlow_ANN,28,1
https://api.github.com/repos/AIT-LAHCEN/Extranet_RH,14,1
https://api.github.com/repos/Abh1shekSingh/Coursera_Solution,21,1


## Attempting to set up for NLP

In [0]:
!pip install spark-nlp

Collecting spark-nlp
  Using cached spark_nlp-5.1.2-py2.py3-none-any.whl (536 kB)
Installing collected packages: spark-nlp
Successfully installed spark-nlp-5.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [0]:
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import LanguageDetectorDL
from sparknlp.base import *
# Does not work due to needing to install a JAR cluster wide
# This can be found several stack overflow discussions.
# The following is a possible solution, however it did not work for us
from py4j.java_gateway import java_import
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")

documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")  # Will take your column name and output a diff name