## 01_data_preprocessing.ipynb
**Project:** Stablecoin Liquidity as a Proxy for Capital Allocation  
**Purpose:** Load raw datasets (CoinGecko stablecoins, FRED macro variables, optional DeFi data), clean and merge them, and save preprocessed CSVs for analysis

In [331]:
import pandas as pd 
import numpy as np
import os

In [332]:
os.chdir('/Users/erikasohn/Library/Mobile Documents/com~apple~CloudDocs/Documents/classes/khoury')
print("\nFiles in current directory:")
print(os.listdir('.'))


Files in current directory:
['.DS_Store', 'ds3500_fa25', 'ds3500_source', 'ds4200_fa25', 'cs3200_fa25']


#### Data Processing 

In [333]:
# -- Load df -- # 

# FRED/USDC - Fix these paths 
m2 = pd.read_csv('ds4200_fa25/coin-project/data/raw/M2SL.csv')
treasury = pd.read_csv('ds4200_fa25/coin-project/data/raw/DGS10.csv') 
usdc = pd.read_csv('ds4200_fa25/coin-project/data/raw/USDC.csv')

In [334]:
print("M2 Money Supply:")
print(m2.head())
print("\n" + "="*80 + "\n")

print("Treasury (10-Year):")
print(treasury.head())
print("\n" + "="*80 + "\n")

print("USDC:")
print(usdc.head())
print("\n" + "="*80 + "\n")

M2 Money Supply:
  observation_date     M2SL
0       2020-01-01  15416.3
1       2020-02-01  15466.6
2       2020-03-01  15993.4
3       2020-04-01  17005.4
4       2020-05-01  17852.9


Treasury (10-Year):
  observation_date  DGS10
0       2020-01-13   1.85
1       2020-01-14   1.82
2       2020-01-15   1.79
3       2020-01-16   1.81
4       2020-01-17   1.84


USDC:
                snapped_at     price  market_cap  total_volume
0  2018-10-05 00:00:00 UTC  1.006242         0.0  31264.420430
1  2018-10-06 00:00:00 UTC  1.001530         0.0  20254.712255
2  2018-10-07 00:00:00 UTC  1.001177         0.0  49324.690669
3  2018-10-08 00:00:00 UTC  1.001906         0.0  47076.728142
4  2018-10-09 00:00:00 UTC  1.001983         0.0  55542.215509




In [335]:
print("M2 Money Supply:")
print(m2.describe())
print("\n" + "="*80 + "\n")

print("Treasury (10-Year):")
print(treasury.describe())
print("\n" + "="*80 + "\n")

print("USDC:")
print(usdc.describe())
print("\n" + "="*80 + "\n")

M2 Money Supply:
               M2SL
count     69.000000
mean   20531.646377
std     1527.829100
min    15416.300000
25%    20472.900000
50%    20979.100000
75%    21492.400000
max    22212.500000


Treasury (10-Year):
             DGS10
count  1461.000000
mean      2.934305
std       1.413002
min       0.520000
25%       1.530000
50%       3.530000
75%       4.220000
max       4.980000


USDC:
             price    market_cap  total_volume
count  2598.000000  2.598000e+03  2.598000e+03
mean      1.000807  2.713143e+10  4.347705e+09
std       0.003814  2.230361e+10  4.859625e+09
min       0.965578  0.000000e+00  2.025471e+04
25%       0.999663  1.088678e+09  1.017035e+09
50%       1.000045  2.798853e+10  3.172612e+09
75%       1.001077  4.382889e+10  5.986644e+09
max       1.043465  7.678075e+10  1.102918e+11




In [336]:
print(f"{(m2.isnull().sum().sum() / (len(m2) * len(m2.columns)) * 100):.2f}% values null in M2 Money Supply")

print(f"{(treasury.isnull().sum().sum() / (len(treasury) * len(treasury.columns)) * 100):.2f}% values null in Treasury (10-Year)")

print(f"{(usdc.isnull().sum().sum() / (len(usdc) * len(usdc.columns)) * 100):.2f}% values null in USDC")

0.00% values null in M2 Money Supply
2.07% values null in Treasury (10-Year)
0.00% values null in USDC


Warnings
- **Treasury:** 2.07% values are null. This is insignificant, can dropna()

#### Data Cleaning

In [337]:
# Clean treasury df 
print(f"Old null values treasury: \n{treasury.isnull().sum()}") 

Old null values treasury: 
observation_date     0
DGS10               63
dtype: int64


In [338]:
treasury = treasury.dropna()

In [339]:
print(f"New null values treasury: \n{treasury.isnull().sum()}") 

New null values treasury: 
observation_date    0
DGS10               0
dtype: int64


In [340]:
# -- Convert to datetime -- #

# FRED
m2['observation_date'] = pd.to_datetime(m2['observation_date'])
treasury['observation_date'] = pd.to_datetime(treasury['observation_date'])

# USDC
usdc['snapped_at'] = pd.to_datetime(usdc['snapped_at'], utc=True)

# -- Set index -- # 
m2 = m2.set_index('observation_date')
treasury = treasury.set_index('observation_date')
usdc = usdc.set_index('snapped_at')

# -- Drop duplicate columns if they exist -- #
if 'observation_date' in usdc.columns:
    usdc = usdc.drop('observation_date', axis=1)
if 'Date' in usdc.columns:
    usdc = usdc.drop('Date', axis=1)

In [341]:
# -- Ensure numeric columns are correct dtype -- # 

# FRED  
m2['M2SL'] = pd.to_numeric(m2['M2SL'], errors='coerce')
treasury['DGS10'] = pd.to_numeric(treasury['DGS10'], errors='coerce')

# USDC
usdc['price'] = pd.to_numeric(usdc['price'], errors='coerce')
usdc['market_cap'] = pd.to_numeric(usdc['market_cap'], errors='coerce')
usdc['total_volume'] = pd.to_numeric(usdc['total_volume'], errors='coerce')


In [342]:
# Verify dtypes
print("M2 dtypes:")
print(m2.dtypes)
print("\n" + "=" * 80)

print("Treasury dtypes:")
print(treasury.dtypes)
print("\n" + "=" * 80)

print("USDC dtypes:")
print(usdc.dtypes)
print("\n" + "=" * 80)

M2 dtypes:
M2SL    float64
dtype: object

Treasury dtypes:
DGS10    float64
dtype: object

USDC dtypes:
price           float64
market_cap      float64
total_volume    float64
dtype: object



#### Data Normalization

In [343]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler() 

# M2
m2_normalized = m2.copy()
m2_normalized[['M2SL']] = scaler.fit_transform(m2[['M2SL']])

# Treasury
treasury_normalized = treasury.copy()
treasury_normalized[['DGS10']] = scaler.fit_transform(treasury[['DGS10']])

# USDC
usdc_normalized = usdc.copy()
usdc_normalized[['price', 'market_cap', 'total_volume']] = scaler.fit_transform(
    usdc[['price', 'market_cap', 'total_volume']]
)

# -- Sanity check normalized data -- #

print("M2 normalized stats:")
print(m2_normalized[['M2SL']].describe())
print("\n" + "="*80 + "\n")

print("Treasury normalized stats:")
print(treasury_normalized[['DGS10']].describe())
print("\n" + "="*80 + "\n")

print("USDC normalized stats:")
print(usdc_normalized[['price', 'market_cap', 'total_volume']].describe())
print("\n" + "="*80 + "\n")

M2 normalized stats:
               M2SL
count  6.900000e+01
mean   4.022547e-16
std    1.007326e+00
min   -3.372643e+00
25%   -3.873258e-02
50%    2.950145e-01
75%    6.334427e-01
max    1.108218e+00


Treasury normalized stats:
              DGS10
count  1.461000e+03
mean   2.334432e-16
std    1.000342e+00
min   -1.709220e+00
25%   -9.941855e-01
50%    4.217253e-01
75%    9.102145e-01
max    1.448261e+00


USDC normalized stats:
              price    market_cap  total_volume
count  2.598000e+03  2.598000e+03  2.598000e+03
mean  -4.111193e-14 -1.312781e-16 -2.187968e-16
std    1.000193e+00  1.000193e+00  1.000193e+00
min   -9.238612e+00 -1.216693e+00 -8.948267e-01
25%   -3.001325e-01 -1.167872e+00 -6.855079e-01
50%   -1.999444e-01  3.843620e-02 -2.418540e-01
75%    7.077865e-02  7.487881e-01  3.373213e-01
max    1.118667e+01  2.226495e+00  2.180507e+01




In [344]:
print("M2 date range:", m2.index.min(), "to", m2.index.max())
print("Treasury date range:", treasury.index.min(), "to", treasury.index.max())
print("USDC date range:", usdc.index.min(), "to", usdc.index.max())


M2 date range: 2020-01-01 00:00:00 to 2025-09-01 00:00:00
Treasury date range: 2020-01-13 00:00:00 to 2025-11-13 00:00:00
USDC date range: 2018-10-05 00:00:00+00:00 to 2025-11-14 00:00:00+00:00


In [345]:
# -- Aggregate data by time periods -- #

# M2 (already monthly data, but in case)
monthly_m2 = m2.resample('M').agg({
    'M2SL': 'mean'
})

# Treasury (aggregate daily to weekly/monthly)
daily_treasury = treasury.resample('D').agg({
    'DGS10': 'mean'
})

monthly_treasury = treasury.resample('M').agg({
    'DGS10': 'mean'
})

# USDC (aggregate daily)
daily_usdc = usdc.resample('D').agg({
    'price': 'mean',
    'market_cap': 'mean',
    'total_volume': 'sum'
})

monthly_usdc = usdc.resample('M').agg({
    'price': 'mean',
    'market_cap': 'mean',
    'total_volume': 'sum'
})

# -- Sanity check aggregated data -- #

print("Monthly M2:")
print(monthly_m2.head())
print(f"Shape: {monthly_m2.shape}\n")
print("="*80 + "\n")

print("Daily Treasury:")
print(daily_treasury.head())
print(f"Shape: {daily_treasury.shape}\n")
print("="*80 + "\n")

print("Daily USDC:")
print(daily_usdc.head())
print(f"Shape: {daily_usdc.shape}\n")
print("="*80 + "\n")

Monthly M2:
                     M2SL
observation_date         
2020-01-31        15416.3
2020-02-29        15466.6
2020-03-31        15993.4
2020-04-30        17005.4
2020-05-31        17852.9
Shape: (69, 1)


Daily Treasury:
                  DGS10
observation_date       
2020-01-13         1.85
2020-01-14         1.82
2020-01-15         1.79
2020-01-16         1.81
2020-01-17         1.84
Shape: (2132, 1)


Daily USDC:
                              price  market_cap  total_volume
snapped_at                                                   
2018-10-05 00:00:00+00:00  1.006242         0.0  31264.420430
2018-10-06 00:00:00+00:00  1.001530         0.0  20254.712255
2018-10-07 00:00:00+00:00  1.001177         0.0  49324.690669
2018-10-08 00:00:00+00:00  1.001906         0.0  47076.728142
2018-10-09 00:00:00+00:00  1.001983         0.0  55542.215509
Shape: (2598, 3)




  monthly_m2 = m2.resample('M').agg({
  monthly_treasury = treasury.resample('M').agg({
  monthly_usdc = usdc.resample('M').agg({


In [348]:
# Convert daily treausry -> monthly treausry 

# Treasury 
monthly_treasury = treasury.resample('M').agg({
    'DGS10': 'mean'
})

# USDC  
monthly_usdc = usdc.resample('M').agg({
    'price': 'mean',
    'market_cap': 'mean',
    'total_volume': 'sum'
})

# -- Sanity check new aggregated data -- #

print("Monthly M2:")
print(monthly_m2.head())
print(f"Shape: {monthly_m2.shape}\n")
print("="*80 + "\n")

print("Daily Treasury:")
print(monthly_treasury.head())
print(f"Shape: {daily_treasury.shape}\n")
print("="*80 + "\n")

print("Daily USDC:")
print(monthly_usdc.head())
print(f"Shape: {daily_usdc.shape}\n")
print("="*80 + "\n")

Monthly M2:
                     M2SL
observation_date         
2020-01-31        15416.3
2020-02-29        15466.6
2020-03-31        15993.4
2020-04-30        17005.4
2020-05-31        17852.9
Shape: (69, 1)


Daily Treasury:
                     DGS10
observation_date          
2020-01-31        1.717143
2020-02-29        1.504211
2020-03-31        0.870000
2020-04-30        0.657619
2020-05-31        0.674000
Shape: (2132, 1)


Daily USDC:
                              price    market_cap  total_volume
snapped_at                                                     
2018-10-31 00:00:00+00:00  1.013544  3.520434e+07  7.764469e+06
2018-11-30 00:00:00+00:00  1.015745  1.491097e+08  8.204609e+07
2018-12-31 00:00:00+00:00  1.003368  2.076971e+08  1.963056e+09
2019-01-31 00:00:00+00:00  0.998053  3.314265e+08  1.453943e+10
2019-02-28 00:00:00+00:00  1.009007  2.672576e+08  1.032981e+10
Shape: (2598, 3)




  monthly_treasury = treasury.resample('M').agg({
  monthly_usdc = usdc.resample('M').agg({
