In [0]:
import json
import time
from datetime import date
from io import BytesIO

# Wildcard first, so every explicit import below takes precedence over it.
from pyspark.sql.functions import *

import boto3
import pandas as pd
import pyspark.sql.functions as F
import requests
from botocore.client import Config
from botocore.exceptions import BotoCoreError, ClientError
from pyspark.sql import SparkSession

In [0]:
# Extract traders data from: coinapi.io 
def extract_traders_data():
    """Download the latest trades from CoinAPI and store them as bronze JSON in S3.

    The JSON body of ``GET /v1/trades/latest`` is re-serialised with
    ``json.dumps`` and written to ``crypto/data/bronze/<today>_crypto_.json``
    in the ``crypto-currency-data-prosimplee`` bucket. The API key is read
    from the Databricks secret scope ``crypto``.

    Prints ``Success`` when the object has been written. On failure prints
    ``Extract Traders Data: FAILED!`` and re-raises, so later cells never run
    against a missing or bogus bronze file.

    Raises:
        requests.RequestException: the request failed, timed out, or the API
            answered with a non-2xx status.
        ValueError: the response body is not valid JSON.
        botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError:
            the upload to S3 failed.
    """
    url = "https://rest.coinapi.io/v1/trades/latest"
    headers = {"X-CoinAPI-Key": dbutils.secrets.get(scope="crypto", key="crypto_key")}
    bucket_name = "crypto-currency-data-prosimplee"
    file_name = "crypto/data/bronze/" + str(date.today()) + "_crypto_.json"
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        # Fail on 4xx/5xx so an API error payload is never stored as trade data.
        response.raise_for_status()
        data = response.json()
        s3 = boto3.resource("s3")
        s3.Bucket(bucket_name).put_object(Key=file_name, Body=json.dumps(data), ACL="private")
    except (requests.RequestException, ValueError, BotoCoreError, ClientError):
        print("Extract Traders Data: FAILED!")
        raise
    print("Success")


extract_traders_data()

Success


In [0]:
# Get bronze values from S3 Bucket & Transform & Put silver data into S3 Bucket 
def silver_crypto():
    """Turn today's bronze trades JSON into the silver Parquet file on S3.

    Reads ``crypto/data/bronze/<today>_crypto_.json`` from the
    ``crypto-currency-data-prosimplee`` bucket, keeps five fields of every
    trade (``uuid`` -> ``user_id``, ``symbol_id``, ``taker_side`` ->
    ``action``, ``size``, ``time_exchange``) and writes the result to
    ``crypto/data/silver/<today>_crypto_.parquet``.

    Prints ``Success`` once the Parquet object is written. On failure the
    matching ``FAILED!`` message is printed and the error is re-raised so the
    run stops instead of continuing without a silver file.

    Raises:
        botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError:
            reading from or writing to S3 failed.
        ValueError: the bronze object is not valid JSON, or the frame could
            not be serialised to Parquet.
        KeyError: a trade record lacks one of the expected fields.
    """
    bucket_name = "crypto-currency-data-prosimplee"
    # Evaluate the date once so both keys agree even if the run crosses midnight.
    today = str(date.today())
    bronze_key = "crypto/data/bronze/" + today + "_crypto_.json"
    silver_key = "crypto/data/silver/" + today + "_crypto_.parquet"
    s3 = boto3.resource("s3")

    try:
        file_content = s3.Object(bucket_name, bronze_key).get()["Body"].read().decode("utf-8")
        json_content = json.loads(file_content)
    except (BotoCoreError, ClientError, ValueError):
        print("Connection to S3 (traders data): FAILED!")
        raise

    crypto_silver_data = [
        {
            "user_id": crypto_raw["uuid"],
            "symbol_id": crypto_raw["symbol_id"],
            "action": crypto_raw["taker_side"],
            "size": crypto_raw["size"],
            "time_exchange": crypto_raw["time_exchange"],
        }
        for crypto_raw in json_content
    ]
    crypto_silver_df = pd.DataFrame(crypto_silver_data)

    out_buffer = BytesIO()
    try:
        crypto_silver_df.to_parquet(out_buffer, index=False)
        s3.Bucket(bucket_name).put_object(Key=silver_key, Body=out_buffer.getvalue(), ACL="private")
    except (BotoCoreError, ClientError, ValueError):
        print("Parquet traders data (silver) values into S3: FAILED!")
        raise
    print("Success")


silver_crypto()

Success


In [0]:
# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Read today's silver trades from S3 and expose them as the temp view `crypto_data`.
silver_trades_path = (
    "s3a://crypto-currency-data-prosimplee/crypto/data/silver/"
    + str(date.today())
    + "_crypto_.parquet"
)

# Parquet files carry their own schema, so no `inferSchema` option is passed.
spark.read.parquet(silver_trades_path).createOrReplaceTempView("crypto_data")

spark.conf.set("spark.databricks.io.cache.enabled", False)

golden_crypto_table = spark.table("crypto_data")

# Preview only: limit in Spark before collecting to pandas.
golden_crypto_table.limit(10).toPandas()

Unnamed: 0,user_id,symbol_id,action,size,time_exchange
0,25544cfb-ad09-431c-85f6-810bccb215e5,COINBASE_SPOT_BTC_USD,BUY,0.001077,2022-08-04T06:22:40.7607350Z
1,2ec0d1e1-1466-4d7a-846d-8a6250bf8e0f,MANDALA_SPOT_CTK_USDT,BUY,104.0,2022-08-04T06:22:40.6940000Z
2,39660252-089d-4d0a-8ea0-61c80cdc3b34,HUOBIPRO_SPOT_KOI_USDT,BUY,396.8196,2022-08-04T06:22:40.6960000Z
3,8e1aa4b4-aa07-4d4c-a1cd-e3cf2ee8d2b0,HUOBIPRO_SPOT_KOI_USDT,SELL,699.6202,2022-08-04T06:22:40.6970000Z
4,a867e462-a090-49ea-9ecf-8e43a0ce5b34,HUOBIPRO_SPOT_KOI_USDT,SELL,270.9761,2022-08-04T06:22:40.6970000Z
5,733c2dfc-c8d8-4678-b6bc-b3fb70e6681c,HUOBIPRO_SPOT_KOI_USDT,SELL,290.6597,2022-08-04T06:22:40.6970000Z
6,4f988009-d9e2-4207-8e0b-c4f3d685c149,HUOBIPRO_SPOT_KOI_USDT,BUY,361.3711,2022-08-04T06:22:40.6970000Z
7,0ddc19ad-de73-40ae-9518-567e007d841d,HUOBIPRO_SPOT_KOI_USDT,BUY,15.7649,2022-08-04T06:22:40.6980000Z
8,d39cd046-b9ec-41d3-8ae1-65e9aed710fe,HUOBIPRO_SPOT_KOI_USDT,SELL,954.8314,2022-08-04T06:22:40.6980000Z
9,869fa7b8-f65c-480d-a7fb-4e1455f1bd01,CRYPTOCOM_SPOT_ONE_USDC,BUY,4.8,2022-08-04T06:22:40.6130000Z


In [0]:
# Functions that extract symbol_from and symbol_to from a symbol id
def extract_symbol_from(column):
    """Return the third ``_``-separated field of a symbol id.

    Example: ``"COINBASE_SPOT_BTC_USD"`` -> ``"BTC"``.

    Args:
        column: symbol id string; when used as a UDF this is one cell value.

    Returns:
        The third field, or ``None`` when ``column`` is ``None`` (or not a
        string) or has fewer than three fields. Returning ``None`` yields a
        null cell instead of an exception that would fail the whole Spark job.
    """
    try:
        return column.split("_")[2]
    except (AttributeError, IndexError):
        # AttributeError: null / non-string value; IndexError: too few fields.
        return None

# Spark UDF wrapper around extract_symbol_from; no returnType is given, so
# F.udf falls back to its default (string) return type.
sym_from_udf = F.udf(extract_symbol_from)

def extract_symbol_to(column):
    """Return the fourth ``_``-separated field of a symbol id.

    Example: ``"COINBASE_SPOT_BTC_USD"`` -> ``"USD"``.

    Args:
        column: symbol id string; when used as a UDF this is one cell value.

    Returns:
        The fourth field, or ``None`` when ``column`` is ``None`` (or not a
        string) or has fewer than four fields. Returning ``None`` yields a
        null cell instead of an exception that would fail the whole Spark job.
    """
    try:
        return column.split("_")[3]
    except (AttributeError, IndexError):
        # AttributeError: null / non-string value; IndexError: too few fields.
        return None

# Spark UDF wrapper around extract_symbol_to; no returnType is given, so
# F.udf falls back to its default (string) return type.
sym_to_udf = F.udf(extract_symbol_to)
    

In [0]:
# Apply Functions on our columns
golden_crypto = (
    golden_crypto_table
    .withColumn("symbol_from", sym_from_udf(golden_crypto_table.symbol_id))
    .withColumn("symbol_to", sym_to_udf(golden_crypto_table.symbol_id))
    # One cast is enough: the column is already a timestamp afterwards, so
    # wrapping it in to_timestamp (pulled in by the wildcard import) adds nothing.
    .withColumn("time_exchange", F.col("time_exchange").cast("timestamp"))
)


In [0]:
# Limit in Spark first so only ten rows are collected to the driver.
golden_crypto.limit(10).toPandas()

Unnamed: 0,user_id,symbol_id,action,size,time_exchange,symbol_from,symbol_to
0,25544cfb-ad09-431c-85f6-810bccb215e5,COINBASE_SPOT_BTC_USD,BUY,0.001077,2022-08-04 06:22:40.760735,BTC,USD
1,2ec0d1e1-1466-4d7a-846d-8a6250bf8e0f,MANDALA_SPOT_CTK_USDT,BUY,104.0,2022-08-04 06:22:40.694000,CTK,USDT
2,39660252-089d-4d0a-8ea0-61c80cdc3b34,HUOBIPRO_SPOT_KOI_USDT,BUY,396.8196,2022-08-04 06:22:40.696000,KOI,USDT
3,8e1aa4b4-aa07-4d4c-a1cd-e3cf2ee8d2b0,HUOBIPRO_SPOT_KOI_USDT,SELL,699.6202,2022-08-04 06:22:40.697000,KOI,USDT
4,a867e462-a090-49ea-9ecf-8e43a0ce5b34,HUOBIPRO_SPOT_KOI_USDT,SELL,270.9761,2022-08-04 06:22:40.697000,KOI,USDT
5,733c2dfc-c8d8-4678-b6bc-b3fb70e6681c,HUOBIPRO_SPOT_KOI_USDT,SELL,290.6597,2022-08-04 06:22:40.697000,KOI,USDT
6,4f988009-d9e2-4207-8e0b-c4f3d685c149,HUOBIPRO_SPOT_KOI_USDT,BUY,361.3711,2022-08-04 06:22:40.697000,KOI,USDT
7,0ddc19ad-de73-40ae-9518-567e007d841d,HUOBIPRO_SPOT_KOI_USDT,BUY,15.7649,2022-08-04 06:22:40.698000,KOI,USDT
8,d39cd046-b9ec-41d3-8ae1-65e9aed710fe,HUOBIPRO_SPOT_KOI_USDT,SELL,954.8314,2022-08-04 06:22:40.698000,KOI,USDT
9,869fa7b8-f65c-480d-a7fb-4e1455f1bd01,CRYPTOCOM_SPOT_ONE_USDC,BUY,4.8,2022-08-04 06:22:40.613000,ONE,USDC


In [0]:
# Select the trader columns that will be loaded into the MSSQL database
db_crypto_data = golden_crypto.select(
    F.col("user_id"),
    F.col("action"),
    F.col("size"),
    F.col("symbol_from"),
    F.col("symbol_to"),
    F.col("time_exchange"),
)

# Preview the ten earliest trades; sort and limit in Spark so that only those
# ten rows are collected to the driver.
db_crypto_data.orderBy(F.col("time_exchange"), ascending=True).limit(10).toPandas()

Unnamed: 0,user_id,action,size,symbol_from,symbol_to,time_exchange
0,64900d24-02de-4789-aed8-b51a6c6d7c2b,BUY,4.76,DYDX,USDT,2022-08-04 06:22:40.023
1,9b162209-3cab-47d3-9d18-54d2762c96f5,SELL,0.01065,BTC,USDT,2022-08-04 06:22:40.024
2,2c4d9038-ac20-4612-82b5-4cbcdca98c26,SELL,0.00429,BTC,USDT,2022-08-04 06:22:40.024
3,49f8422e-b66b-4a93-b361-46dc9d6c0336,SELL,0.01069,BTC,USDT,2022-08-04 06:22:40.024
4,88c7a1aa-f198-434e-b91d-0fe105739595,BUY,0.00262,BTC,USDT,2022-08-04 06:22:40.033
5,7e0a7ce5-649e-48eb-9e4e-b1c4e99eef7d,BUY,0.01889,BTC,USDT,2022-08-04 06:22:40.033
6,adef874c-a7ab-4fad-82aa-840479af470a,BUY,0.04551,BTC,USDT,2022-08-04 06:22:40.033
7,a1fc09d3-03b5-4774-b5b2-c0796c2015d6,SELL,1.24,SOL,EUR,2022-08-04 06:22:40.033
8,bafa038a-2f4f-486c-9846-048e52f03f39,SELL,0.04369,BTC,USDT,2022-08-04 06:22:40.046
9,6323c28b-b2dc-46d5-a144-053c52fe3ad3,SELL,0.01897,BTC,USDT,2022-08-04 06:22:40.054


In [0]:
# Connection settings come from Databricks secret scopes; nothing is hard-coded.
database = dbutils.secrets.get(scope="database", key="name")
table = "dbo.crypto_data"
user = dbutils.secrets.get(scope="username", key="usr")
password = dbutils.secrets.get(scope="mssql", key="password")
server_name = dbutils.secrets.get(scope="server", key="name")

jdbc_url = f"jdbc:sqlserver://{server_name};databaseName={database};"

# NOTE(review): mode("append") adds rows on every run; re-running the notebook
# for the same day presumably inserts the same trades again - confirm whether
# the target table guards against duplicates.
try:
    (
        db_crypto_data.write.mode("append")
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        .save()
    )
except Exception:
    # Catch Exception rather than using a bare except (which also traps
    # KeyboardInterrupt/SystemExit), and re-raise so the cause stays visible
    # and a failed load fails the run instead of passing silently.
    print("Write to MSSQL (traders data): FAILED!")
    raise
print("Success")
    

Success


In [0]:
# Extract (bronze) a directory of all cryptocurrency names
def via_api_extract_crypto_names():
    """Download the CoinAPI asset list and store it as staging JSON in S3.

    The JSON body of ``GET /v1/assets`` is re-serialised with ``json.dumps``
    and written to ``crypto/catalog/stag/crypto_catalog.json`` in the
    ``crypto-currency-data-prosimplee`` bucket. The API key is read from the
    Databricks secret scope ``crypto``.

    Prints ``Success`` when the object has been written. On failure prints
    ``Extract Crypto Names via Api: FAILED!`` and re-raises, so later cells
    never run against a missing or bogus catalog file.

    Raises:
        requests.RequestException: the request failed, timed out, or the API
            answered with a non-2xx status.
        ValueError: the response body is not valid JSON.
        botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError:
            the upload to S3 failed.
    """
    url = "https://rest.coinapi.io/v1/assets"
    headers = {"X-CoinAPI-Key": dbutils.secrets.get(scope="crypto", key="crypto_key")}
    bucket_name = "crypto-currency-data-prosimplee"
    file_name = "crypto/catalog/stag/crypto_catalog.json"
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        # Fail on 4xx/5xx so an API error payload is never stored as the catalog.
        response.raise_for_status()
        data = response.json()
        s3 = boto3.resource("s3")
        s3.Bucket(bucket_name).put_object(Key=file_name, Body=json.dumps(data), ACL="private")
    except (requests.RequestException, ValueError, BotoCoreError, ClientError):
        print("Extract Crypto Names via Api: FAILED!")
        raise
    print("Success")


via_api_extract_crypto_names()

Success


In [0]:
# Extract (parquet) a directory of all cryptocurrency names
def from_s3_extract_crypto_names():
    """Convert the staged asset catalog JSON into a clean Parquet lookup on S3.

    Reads ``crypto/catalog/stag/crypto_catalog.json`` from the
    ``crypto-currency-data-prosimplee`` bucket, keeps ``asset_id`` (as
    ``symbol_id``) and ``name`` (as ``symbol_name``) of every asset and writes
    the result to ``crypto/catalog/clean/crypto_catalog.parquet``.

    Assets lacking either field are reported and skipped. Prints ``Success``
    once the Parquet object is written. On an S3 or serialisation failure the
    matching ``FAILED!`` message is printed and the error is re-raised.

    Raises:
        botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError:
            reading from or writing to S3 failed.
        ValueError: the staged object is not valid JSON, or the frame could
            not be serialised to Parquet.
    """
    bucket_name = "crypto-currency-data-prosimplee"
    s3 = boto3.resource("s3")

    try:
        content_object = s3.Object(bucket_name, "crypto/catalog/stag/crypto_catalog.json")
        file_content = content_object.get()["Body"].read().decode("utf-8")
        json_content = json.loads(file_content)
    except (BotoCoreError, ClientError, ValueError):
        print("Connection to S3 (crypto names): FAILED!")
        raise

    crypto_names = []
    for cr_n in json_content:
        try:
            crypto_names.append({"symbol_id": cr_n["asset_id"],
                                 "symbol_name": cr_n["name"]})
        except KeyError:
            # A missing field raises KeyError (not ValueError); skip that asset.
            print("Crypto Name KeyError!")

    crypto_name_dictionary = pd.DataFrame(crypto_names)
    file_name = "crypto/catalog/clean/crypto_catalog.parquet"
    out_buffer = BytesIO()
    try:
        crypto_name_dictionary.to_parquet(out_buffer, index=False)
        s3.Bucket(bucket_name).put_object(Key=file_name, Body=out_buffer.getvalue(), ACL="private")
    except (BotoCoreError, ClientError, ValueError):
        print("Parquet Crypto Names (silver) values into S3: FAILED!")
        raise
    print("Success")


from_s3_extract_crypto_names()

Success


In [0]:
# Create Table (crypto_names)
try:
    # Parquet files carry their own schema, so no `inferSchema` option is passed.
    (
        spark.read
        .parquet("s3a://crypto-currency-data-prosimplee/crypto/catalog/clean/crypto_catalog.parquet")
        .createOrReplaceTempView("crypto_names")
    )
    cr_names_table = spark.table("crypto_names")
except Exception:
    # Spark read failures are not ValueError. Report and re-raise, otherwise
    # the preview below would only fail later with a NameError.
    print("Create catalog with crypto names: FAILED!")
    raise
print("Created Table (crypto_names): Success!")

cr_names_table.limit(10).toPandas()

Created Table (crypto_names): Success!


Unnamed: 0,symbol_id,symbol_name
0,USD,US Dollar
1,BTC,Bitcoin
2,PLN,Zloty
3,EUR,Euro
4,CNY,Yuan Renminbi
5,JPY,Yen
6,AUD,Australian Dollar
7,CHF,Swiss Franc
8,SEK,Swedish Krona
9,GBP,Pound Sterling


In [0]:
# Search for the average sale size of cryptocurrencies
# Mean trade size per (symbol_from, symbol_to, action) group, SELL rows only.
avg_size_sell = (
    golden_crypto
    .groupBy("symbol_from", "symbol_to", "action")
    .agg(F.avg("size").alias("avg_size"))
    .filter(F.col("action") == "SELL")
)

# Show a ten-row sample of the aggregate.
avg_size_sell.limit(10).toPandas()

Unnamed: 0,symbol_from,symbol_to,action,avg_size
0,YFI,USDT,SELL,0.000919
1,USDT,BRL,SELL,11.35
2,SOL,USDT,SELL,2.0
3,GALA,USDT,SELL,8700.0
4,OP,USDT,SELL,121.05
5,EOS,USDT,SELL,1.0
6,AVAX,USDT,SELL,19.671667
7,SPARTA,BNB,SELL,4410.0
8,CTK,USDT,SELL,80.485714
9,YFII,USDT,SELL,0.03


In [0]:
# First lookup: attach the readable name of symbol_from.
result_sell = (
    avg_size_sell
    .join(cr_names_table, avg_size_sell.symbol_from == cr_names_table.symbol_id, how="inner")
    .select(
        F.col("symbol_from"),
        F.col("symbol_name").alias("symbol_from_name"),
        F.col("symbol_to"),
        F.col("avg_size"),
    )
)

# Second lookup: attach the readable name of symbol_to. The join key is taken
# from result_sell, the frame that is actually being joined here.
sell_df = (
    result_sell
    .join(cr_names_table, result_sell.symbol_to == cr_names_table.symbol_id, how="inner")
    .select(
        F.col("symbol_from"),
        F.col("symbol_from_name"),
        F.col("symbol_to"),
        F.col("symbol_name").alias("symbol_to_name"),
        F.col("avg_size"),
    )
)

sell_df.limit(10).toPandas()

Unnamed: 0,symbol_from,symbol_from_name,symbol_to,symbol_to_name,avg_size
0,BTC,Bitcoin,BUSD,Binance USD,0.08946
1,BTC,Bitcoin,USDT,Tether,0.019387
2,BTC,Bitcoin,EUR,Euro,0.01757
3,USDT,Tether,BRL,Brazilian Real,11.35
4,ETC,Ethereum Classic,BTC,Bitcoin,2.62
5,ETC,Ethereum Classic,USDT,Tether,15.3
6,ATOM,Cosmos,USDT,Tether,26.25
7,OP,Operand,USDT,Tether,121.05
8,CTK,CTK,USDT,Tether,80.485714
9,EOS,EOS,USDT,Tether,1.0


In [0]:
# The received data is written to the S3 bucket
# Overwrite today's gold SELL output. No `header` option is passed: it is a
# CSV setting and has no effect on a Parquet write.
sell_df.write \
    .mode("overwrite") \
    .parquet("s3a://crypto-currency-data-prosimplee/crypto/data/gold/sell_" + str(date.today()) + "_crypto_.parquet")

In [0]:
# Search for the average buy size of cryptocurrencies
# Mean trade size per (symbol_from, symbol_to, action) group, BUY rows only.
avg_size_buy = (
    golden_crypto
    .groupBy("symbol_from", "symbol_to", "action")
    .agg(F.avg("size").alias("avg_size"))
    .filter(F.col("action") == "BUY")
)

# Show a ten-row sample of the aggregate.
avg_size_buy.limit(10).toPandas()

Unnamed: 0,symbol_from,symbol_to,action,avg_size
0,ETH,BTC,BUY,0.0115
1,IMX,USDT,BUY,50.625
2,BTC,USD,BUY,0.001077
3,KOI,USDT,BUY,257.9852
4,SOL,BTC,BUY,6.22
5,DYDX,USDT,BUY,4.76
6,INJ,USDT,BUY,151.6
7,MANA,USDT,BUY,800.0
8,SUSHI,USDT,BUY,172.1
9,ETC,BUSD,BUY,29.34


In [0]:
# First lookup: attach the readable name of symbol_from.
# NOTE: this rebinds avg_size_buy to the joined frame.
from_name_match = avg_size_buy.symbol_from == cr_names_table.symbol_id
avg_size_buy = avg_size_buy.join(cr_names_table, from_name_match, how="inner").select(
    "symbol_from",
    F.col("symbol_name").alias("symbol_from_name"),
    "symbol_to",
    "avg_size",
)

# Second lookup: attach the readable name of symbol_to.
to_name_match = avg_size_buy.symbol_to == cr_names_table.symbol_id
buy_df = avg_size_buy.join(cr_names_table, to_name_match, how="inner").select(
    "symbol_from",
    "symbol_from_name",
    "symbol_to",
    F.col("symbol_name").alias("symbol_to_name"),
    "avg_size",
)

buy_df.limit(10).toPandas()

Unnamed: 0,symbol_from,symbol_from_name,symbol_to,symbol_to_name,avg_size
0,BTC,Bitcoin,USDT,Tether,0.019505
1,BTC,Bitcoin,BUSD,Binance USD,0.005875
2,BTC,Bitcoin,USD,US Dollar,0.001077
3,USDT,Tether,TRY,Turkish Lira,3006.0
4,ETH,Ethereum,USDT,Tether,2.0
5,ETH,Ethereum,BTC,Bitcoin,0.0115
6,DASH,Dash,USDT,Tether,2.424
7,ETC,Ethereum Classic,BUSD,Binance USD,29.34
8,STX,Stox,USDT,Tether,590.9
9,CTK,CTK,USDT,Tether,104.0


In [0]:
# The received data is written to the S3 bucket
# Write buy_df, the frame with both symbol names, so the BUY file has the same
# columns as the SELL file. No `header` option is passed: it is a CSV setting
# and has no effect on a Parquet write.
buy_df.write \
    .mode("overwrite") \
    .parquet("s3a://crypto-currency-data-prosimplee/crypto/data/gold/buy_" + str(date.today()) + "_crypto_.parquet")