In [6]:
import pandas as pd
import os
from pathlib import Path

# Trade actions and assets that are kept in the cleaned output.
ACTIONS = ["BUY", "SELL"]
ASSETS = ["BTC", "ETH", "SOL", "TIA"]

# The data directory is resolved three levels above the current working
# directory, so the result depends on where the kernel was started.
DATA_DIR = Path.cwd().parent.parent.parent.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("trades", "raw")
OUTPUT_DIR = DATA_DIR.joinpath("trades", "clean")

# Raw Coinbase export (input) and the cleaned CSV written by this notebook.
INPUT_FILE = RAW_DIR.joinpath("coinbase_raw.csv")
OUTPUT_FILE = OUTPUT_DIR.joinpath("coinbase_clean.csv")

In [7]:
# Per-asset dates on which short-term trades were made. Trades of that asset
# on one of these dates are flagged in the `excluded` column of the output.
EXEMPTIONS = {
    "BTC": [
        "2024-04-14",
        "2024-04-15",
        "2024-04-16",
        "2024-04-17",
    ],
    "ETH": [
        "2022-06-08",
        "2023-10-31",
        "2023-11-08",
        "2023-12-02",
        "2024-03-13",
        "2024-03-14",
        "2024-04-14",
        "2024-04-23",
        "2024-04-24",
    ],
    "SOL": [
        "2024-03-04",
        "2024-05-22",
        "2024-06-27",
        "2024-07-06",
        "2024-08-13",
        "2024-11-14",
        "2024-11-25",
        "2025-07-21",
    ],
}

# Trade ids (already prefixed with "coinbase-") that are dropped entirely.
EXCLUDE_IDS = [
    "coinbase-6732391d24e2943aaf18722a",
]

In [8]:
# Load the raw Coinbase export; it is kept in its own cell so the
# transformation cell below can be re-run without re-reading the file.
raw_df = pd.read_csv(filepath_or_buffer=INPUT_FILE)

In [9]:

# Work on a copy: every column assignment below (including the later overwrite
# of "Quantity Transacted") would otherwise mutate raw_df through the alias.
coinbase_df = raw_df.copy()

coinbase_df["action"] = coinbase_df["Transaction Type"].str.upper()
coinbase_df["asset"] = coinbase_df["Asset"]

# Swapping in and out of USDC is recorded as a "Convert" action, e.g.
#   "Converted 0.0033 BTC to 270.989873 USDC"   (a sell)
#   "Converted 270.989873 USDC to 0.0033 BTC"   (a buy)
# We need to (a) turn the action into BUY/SELL and (b) fix the asset and
# quantity in the buy case: "Converted 270.989873 USDC to 0.0033 BTC" lists
# USDC as the asset and 270.989873 as the quantity, but we want BTC and 0.0033.
pattern = r'^Converted\s+([\d.]+)\s+(\w+)\s+to\s+([\d.]+)\s+(\w+)'

# Run the regex once; the four capture groups come back as columns 0..3.
# Rows whose Notes do not match the pattern get NaN in all four columns.
converted_parts = coinbase_df["Notes"].str.extract(pattern)
coinbase_df["converted_from_quantity"] = converted_parts[0]
coinbase_df["converted_from_asset"] = converted_parts[1]
coinbase_df["converted_to_quantity"] = converted_parts[2]
coinbase_df["converted_to_asset"] = converted_parts[3]

def normalize_convert_action(row: pd.Series) -> str:
    """Translate a CONVERT row into BUY or SELL based on its USDC side.

    A conversion out of USDC is a BUY and a conversion into USDC is a SELL.
    Rows whose action is not CONVERT, and CONVERT rows where neither side is
    USDC, keep their current action.
    """
    current_action = row["action"]
    if current_action != "CONVERT":
        return current_action
    if row["converted_from_asset"] == "USDC":
        return "BUY"
    if row["converted_to_asset"] == "USDC":
        return "SELL"
    return current_action

def fix_converted_asset(row: pd.Series) -> str:
    """Return the asset a row should be booked under.

    For a conversion out of USDC the row lists USDC as its asset, so the
    asset that was converted *to* is returned instead. Every other row keeps
    its existing ``asset`` value.
    """
    converted_from_usdc = row["converted_from_asset"] == "USDC"
    return row["converted_to_asset"] if converted_from_usdc else row["asset"]

def fix_converted_quantity(row: pd.Series) -> str:
    """Return the quantity a row should be booked with.

    For a conversion out of USDC the quantity parsed from the Notes text
    (``converted_to_quantity``) is returned; every other row keeps its
    ``Quantity Transacted`` value unchanged.

    NOTE(review): the return annotation says ``str``, but the fallback returns
    whatever type the "Quantity Transacted" column holds -- confirm.
    """
    quantity_column = (
        "converted_to_quantity"
        if row["converted_from_asset"] == "USDC"
        else "Quantity Transacted"
    )
    return row[quantity_column]

# Rewrite CONVERT rows as BUY/SELL and, for conversions out of USDC, swap in
# the asset and quantity parsed from the Notes column.
coinbase_df["action"] = coinbase_df.apply(normalize_convert_action, axis=1)
coinbase_df["asset"] = coinbase_df.apply(fix_converted_asset, axis=1)
coinbase_df["Quantity Transacted"] = coinbase_df.apply(fix_converted_quantity, axis=1)

# Keep only the tracked actions and assets. The explicit .copy() yields an
# independent frame, so the column assignments below do not write to a
# boolean-filtered result (which raises pandas' SettingWithCopyWarning).
coinbase_df = coinbase_df[coinbase_df.action.isin(ACTIONS)]
coinbase_df = coinbase_df[coinbase_df.asset.isin(ASSETS)].copy()

coinbase_df["id"] = coinbase_df["ID"].map(lambda raw_id: f"coinbase-{raw_id}")
coinbase_df["platform"] = "coinbase"

# Reduce each timestamp to its calendar date (YYYY-MM-DD).
coinbase_df["date"] = coinbase_df["Timestamp"].map(lambda ts: pd.to_datetime(ts).strftime("%Y-%m-%d"))
coinbase_df["quantity"] = coinbase_df["Quantity Transacted"].astype(float).abs()

# regex=False strips "$" as a literal character. Relying on the default is
# version-dependent: pandas < 2.0 treated the pattern as a regex, where "$"
# is the end-of-string anchor and nothing would be removed.
coinbase_df["price"] = coinbase_df["Price at Transaction"].str.replace("$", "", regex=False).astype(float).abs()
# fees and cost only have the "$" stripped; they are not cast to float here.
coinbase_df["fees"] = coinbase_df["Fees and/or Spread"].str.replace("$", "", regex=False)
coinbase_df["cost"] = coinbase_df["Total (inclusive of fees and/or spread)"].str.replace("$", "", regex=False)
# quantity and price are already float columns at this point.
coinbase_df["value"] = coinbase_df["quantity"] * coinbase_df["price"]

# Drop rows without a price and rows with a zero quantity.
coinbase_df = coinbase_df[coinbase_df.price.notna()]
coinbase_df = coinbase_df[coinbase_df.quantity != 0].copy()


def exclude_trade(row):
    """Return True when the row's date is listed in EXEMPTIONS for its asset.

    Only the first 10 characters of ``row["date"]`` are compared. Assets
    without an entry in EXEMPTIONS always yield False.
    """
    trade_day = row["date"][:10]
    return trade_day in EXEMPTIONS.get(row["asset"], ())

# Take an independent copy before adding a column: coinbase_df was produced
# by boolean filtering earlier in this cell, and assigning to such a frame
# raises pandas' SettingWithCopyWarning.
coinbase_df = coinbase_df.copy()

# Flag (but keep) trades that fall on a per-asset date listed in EXEMPTIONS.
coinbase_df["excluded"] = coinbase_df.apply(exclude_trade, axis=1)

# Trades listed in EXCLUDE_IDS are removed from the output altogether.
coinbase_df = coinbase_df[~coinbase_df.id.isin(EXCLUDE_IDS)]

# Keep only the output columns, ordered by date and then asset, with a fresh
# 0..n-1 index.
coinbase_df = coinbase_df[["id", "platform", "date", "action", "asset", "price", "quantity", "fees", "cost", "value", "excluded"]]
coinbase_df = coinbase_df.sort_values(["date", "asset"]).reset_index(drop=True)

coinbase_df.head()

Unnamed: 0,id,platform,date,action,asset,price,quantity,fees,cost,value,excluded
0,coinbase-60b1a4765a49f30001783fa2,coinbase,2021-05-29,BUY,BTC,36348.265,0.013489,9.698253415,500.0,490.301747,False
1,coinbase-60b1a4c7d4ca9f0001c317d9,coinbase,2021-05-29,BUY,ETH,2510.495,0.195198,9.9559953108,500.0,490.044005,False
2,coinbase-60d2b69a60ad5c00010f6a62,coinbase,2021-06-23,BUY,SOL,29.801344,3.233423,3.63966398607658,100.0,96.360336,False
3,coinbase-6170124955f9cb000193eed6,coinbase,2021-10-20,SELL,BTC,64298.42,0.007451,7.11,470.14,479.084955,False
4,coinbase-617012a791333c0001475764,coinbase,2021-10-20,SELL,ETH,3938.955,0.094166,5.5,363.65,370.916976,False


In [12]:
# Sanity check on the cleaned frame: 88 trades and the 11 output columns.
# The message reports the actual shape so a failure is diagnosable at a glance.
assert coinbase_df.shape == (88, 11), f"unexpected cleaned shape: {coinbase_df.shape}"

In [5]:
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
coinbase_df.to_csv(OUTPUT_FILE, index=False)