In [None]:
import requests
import pandas as pd
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime
from io import StringIO
import json

# Crawling

In [None]:
def get_list(url:str, types:str=None, timeout:float=30):
    """Download a JSON listing, save it as an Excel file and return it as a DataFrame.

    Args:
        url: Endpoint whose response body is JSON that ``pd.DataFrame`` accepts.
        types: Label of the listing. It is used as the stem of the output file
            (``f"{types}.xlsx"``), so the default ``None`` produces a file
            literally named ``None.xlsx``. When it equals ``"protocol"`` an
            extra ``protocol`` column is derived from ``name`` (lower-cased,
            spaces replaced by ``-``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The DataFrame built from the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    df = pd.DataFrame(response.json())
    if types == "protocol":
        df["protocol"] = df["name"].apply(lambda x: x.lower().replace(" ", "-"))
    df.to_excel(f"{types}.xlsx", index=False)
    print(df.shape, df.columns)
    return df

In [None]:
def get_data(url:str, keys:pd.Series, save_path:str="html", wait_seconds:float=10):
    """Open ``url + key`` in Chrome for every key and save the page's ``__NEXT_DATA__`` JSON.

    Each page is loaded in a visible (non-headless) Chrome window, given
    ``wait_seconds`` to render, and the text of the ``<script id="__NEXT_DATA__">``
    tag is written to ``{save_path}/{key}.json``. Problems with a single page
    are printed and the crawl moves on to the next key.

    Args:
        url: Prefix that each key is appended to.
        keys: Iterable of string keys (page slugs / names).
        save_path: Directory for the JSON files; created when missing.
        wait_seconds: Pause after each page load before reading the page source.
            Author's timing note: a protocol crawl took about 1847 min at a 20s sleep.
    """
    # Without the directory every write would fail, after the full wait per page.
    os.makedirs(save_path, exist_ok=True)

    options = Options()
    # For pages built with React, headless mode could not fetch the data
    # because JavaScript was not enabled, so headless mode stays off.
    # options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        for v in keys:
            driver.get(url+v)
            sleep(wait_seconds)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            next_data_script = soup.find('script', id="__NEXT_DATA__")
            try:
                # Check for a missing/empty tag before touching its content;
                # otherwise this branch can never be reached.
                if next_data_script is None or not next_data_script.string:
                    print(v, "id='__NEXT_DATA__'를 가진 script 태그를 찾을 수 없습니다.")
                    continue

                raw = next_data_script.string
                data = json.loads(raw)
                if "error" in data["page"]:
                    print(v, data["err"])
                    continue

                # Always write utf8 so every file has one known encoding.
                with open(f"{save_path}/{v}.json", "w", encoding="utf8") as f:
                    f.write(raw)

            except Exception as e:
                print(v, e)
    finally:
        # Close the browser even when a page load raises or the run is interrupted.
        driver.quit()

Protocol Dataset

In [None]:
# Crawl every protocol page; get_data is called without save_path here.
# NOTE(review): df_protocol is not defined in the cells shown here. Presumably
# it is the frame returned by get_list(..., types="protocol"), which adds the
# "protocol" column. Confirm it exists before running this cell.
url = "https://defillama.com/protocol/"
get_data(url, df_protocol["protocol"])
# Author's timing note: 1847 min at 20s sleep

Chain Dataset

In [None]:
# Crawl every chain page; the JSON files are written to "html_chain".
# NOTE(review): df_chain is not defined in the cells shown here. Presumably
# it is a frame returned by get_list for the chain listing. Confirm it exists
# before running this cell.
url = "https://defillama.com/chain/"
get_data(url, df_chain["name"], save_path="html_chain")
# Author's timing note: 107min at 15s sleep

# Extract TVL

In [None]:
# Output folder for the per-protocol CSV files.
os.makedirs("df", exist_ok=True)

# File names that could not be converted, grouped by failure reason.
failed = {
    reason: []
    for reason in ("Empty", "OneTvl", "UnicodeDecodeError", "KeyError", "Other")
}

In [None]:
def generate_df(times, network):
    """Turn one network's TVL records into a two-column frame.

    Args:
        times: Mapping whose ``"tvl"`` entry holds records with ``date``
            (a timestamp accepted by ``datetime.fromtimestamp``) and
            ``totalLiquidityUSD`` fields.
        network: Name given to the value column of the result.

    Returns:
        DataFrame with the columns ``date`` (converted with
        ``datetime.fromtimestamp``, i.e. local time) and ``network``.
    """
    records = pd.DataFrame(times["tvl"])
    records["date"] = records["date"].map(datetime.fromtimestamp)
    records[network] = records["totalLiquidityUSD"]
    wanted = ["date", network]
    return records[wanted]

def is_filled(df_hist, df_not):
    """Tell the caller whether the current file must be skipped for missing TVL data.

    Despite the name, the result is ``True`` when data is MISSING:

    * both frames are ``None``: prints a message and returns ``True``;
    * exactly one frame is ``None``: prints the shape of the one that exists,
      records the file under ``failed["OneTvl"]`` and returns ``True``;
    * both frames exist: returns ``False``.

    Reads the module-level names ``v`` (file currently being processed) and
    ``failed`` (failure registry), so both must exist when this is called.
    """
    if df_hist is None and df_not is None:
        print(f"Empty tvl in {v}")
        return True
    if df_hist is None or df_not is None:
        # Compare with None explicitly: truth-testing a DataFrame raises
        # ValueError, which used to hide this case whenever df_hist was the
        # frame that existed.
        present = df_hist if df_hist is not None else df_not
        print(f"Only one tvl in {v}: {present.shape}")
        failed["OneTvl"].append(v)
        return True
    return False

def is_identical(df_hist, df_not):
    """Check that every column of ``df_hist`` holds the same values in ``df_not``.

    Only the columns of ``df_hist`` are compared; additional columns in
    ``df_not`` are not looked at. When any value differs, the name of the file
    being processed (module-level ``v``) and the per-column mismatch counts
    are printed.

    Returns:
        ``True`` when no compared value differs, otherwise ``False``.
    """
    mismatches = []
    for column in df_hist.columns:
        differing = (df_hist[column] != df_not[column]).sum()
        mismatches.append((column, differing))

    has_difference = sum([count for _, count in mismatches]) > 0
    if has_difference:
        print(f"Different in {v}")
        print(mismatches)
    return not has_difference

Protocol dataset to CSV

In [None]:
# Convert every crawled protocol JSON in html/ into a per-network TVL CSV in df/.
# The loop variable has to stay named `v`: is_filled / is_identical read it as
# a module-level name for their messages.
for v in os.listdir("html"):
    try:
        data = None
        # Try utf8 first and fall back to cp949; an error from the second
        # attempt propagates to the handlers at the bottom of the loop.
        try:
            with open(f"html/{v}", "r", encoding='utf8') as f:
                data = f.read()
        except UnicodeDecodeError:
            with open(f"html/{v}", "r", encoding='cp949') as f:
                data = f.read()

        if not data:
            print(f"Empty in {v}")
            failed["Empty"].append(v)
            continue

        protocol = json.loads(data)["props"]["pageProps"]["protocolData"]

        # Two views of the same TVL data; each gets its own merged frame.
        tvl = {
            label: {"data": protocol[source_key], "df": None}
            for label, source_key in (
                ("history", "historicalChainTvls"),
                ("nothist", "chainTvls"),
            )
        }

        for entry in tvl.values():
            frames = [
                generate_df(times, network)
                for network, times in entry["data"].items()
                if times and times["tvl"]
            ]
            if not frames:
                # Leave "df" as None so is_filled can report the gap.
                continue

            # Outer-join all networks on the date; a single frame passes through.
            merged = frames[0]
            for frame in frames[1:]:
                merged = pd.merge(merged, frame, on="date", how="outer")
            entry["df"] = merged.fillna(0)

        hist_df = tvl["history"]["df"]
        nothist_df = tvl["nothist"]["df"]

        if is_filled(hist_df, nothist_df):
            continue

        # Only write the CSV when both views agree.
        if is_identical(hist_df, nothist_df):
            nothist_df.to_csv(f"df/{v.replace('.json', '.csv')}", index=False)

    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError in {v}: {e}")
        failed["UnicodeDecodeError"].append(v)
    except KeyError as e:
        print(f"KeyError in {v}: {e}")
        failed["KeyError"].append(v)
    except Exception as e:
        print(f"Error in {v}: {e}")
        failed["Other"].append(v)

In [None]:
# Display the collected failures, grouped by failure reason.
failed

In [None]:
# Delete every CSV in df/ whose value columns (all but the first) add up to zero.
for v in os.listdir("df"):
    csv_path = f"df/{v}"
    df = pd.read_csv(csv_path).iloc[:, 1:].fillna(0)
    try:
        grand_total = df.sum().sum()
        if grand_total == 0:
            os.remove(csv_path)
    except Exception as e:
        print(v, e)

In [None]:
# Add summary columns to every protocol CSV in df/ and export a compact
# total-TVL series per protocol to result/.
os.makedirs("result", exist_ok=True)
files = os.listdir("df")


for d in files:
    df = pd.read_csv(f"df/{d}")
    name = d[:-4]  # file name without its 4-character ".csv" suffix

    # The frame is written back to df/ with "TotalTvl" and `name` appended.
    # When those columns are already present (the cell was run before), leave
    # them out of the sum so that a re-run does not inflate the total.
    values = df.iloc[:, 1:]
    if "TotalTvl" in values.columns:
        values = values.drop(columns=["TotalTvl", name], errors="ignore")

    df["TotalTvl"] = values.sum(axis=1)
    if df["TotalTvl"].sum() == 0:
        continue
    df[name] = df["TotalTvl"]
    df.to_csv(f"df/{d}", index=False)

    df = df[["date", name]]
    # For runs of consecutive duplicate values, keep only the first and the
    # last value of each run.
    df_filtered = df[(df[name] != df[name].shift(1)) | (df[name] != df[name].shift(-1))]
    df_filtered.to_csv(f"result/{d}", index=False)

In [None]:
# Columns whose name contains one of these fragments are left out of the sum
# stored under the protocol's own name.
notc = ["staking", "borrowed", "vesting", "offers", "pool2", "treasury"]


def isin(x):
    """Return True when the column name ``x`` contains any fragment listed in ``notc``."""
    return any(n in x for n in notc)


for v in os.listdir("df"):
    df = pd.read_csv(f"df/{v}")
    total_col = v[:-4]  # file name without its 4-character ".csv" suffix

    # Drop rows whose values are all 0 in every column except the first.
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the frame that was read.
    df = df.loc[(df.iloc[:, 1:] != 0).any(axis=1)].copy()

    # Pick the value columns by name rather than by position: slicing off the
    # last two columns is only right when "TotalTvl" and the protocol column
    # are really there, and would silently drop real data otherwise.
    cols = [
        c for c in df.columns[1:]
        if c not in ("TotalTvl", total_col) and not isin(c)
    ]
    df[total_col] = df[cols].sum(axis=1)
    df.to_csv(f"df/{v}", index=False)

Chain dataset to CSV

In [None]:
# Build one CSV per chain in df_chain_web/ from the crawled chain pages.
# NOTE(review): per the original comment, `failed` is meant to be the failed
# list from the chain CSV download. The `failed` dict defined earlier in this
# notebook would only yield its category keys here, so confirm which object
# is in scope before running this cell.
os.makedirs("df_chain_web", exist_ok=True)

for chain in failed:
    chain = chain.replace("_", " ")
    # Chain pages are crawled with save_path="html_chain"; the "html" folder
    # holds the protocol pages. NOTE(review): confirm the folder if the crawl
    # was run with a different save_path.
    json_path = f"html_chain/{chain}.json"
    if not os.path.exists(json_path):
        continue

    # Try utf8 first and fall back to cp949.
    try:
        with open(json_path, "r", encoding='utf8') as f:
            raw = f.read()
    except UnicodeDecodeError:
        with open(json_path, "r", encoding='cp949') as f:
            raw = f.read()

    if not raw:
        continue
    page_props = json.loads(raw)["props"]["pageProps"]
    extra_data = page_props["extraTvlCharts"]
    chart = page_props["chart"]

    # Each chart point is indexed as [timestamp, value].
    chain_df = pd.DataFrame({
        "date": [int(x[0]) for x in chart],
        chain: [x[1] for x in chart],
    })

    # Left-join every non-empty extra chart onto the main chart by date. The
    # loop uses its own names so the chain name is still intact for the
    # output file name below.
    for k, points in extra_data.items():
        extra = pd.DataFrame({
            "date": [int(x[0]) for x in points],
            k: [x[1] for x in points],
        })
        # A DataFrame cannot be truth-tested; ask explicitly whether it has rows.
        if extra.empty:
            continue
        chain_df = pd.merge(chain_df, extra, on="date", how="left")

    # Convert and save the frame built above, not a leftover `df` from an
    # earlier cell.
    chain_df["date"] = chain_df["date"].apply(lambda x: datetime.fromtimestamp(x))
    chain_df.to_csv(f'df_chain_web/{chain.replace(" ", "_")}.csv', index=False)