In [None]:
import polars as pl
import pandas as pd
from pathlib import Path
import zipfile

# Data directory inside the user's OneDrive folder (resolved relative to the home directory).
onedrive_path: Path = (
    Path.home()
    / "Library/CloudStorage/OneDrive-Personal/Documents/Asset Pricing/data"
)

# Fail fast with a clear error when the data directory is not available.
if not onedrive_path.exists():
    raise FileNotFoundError("OneDrive path not found")

# Zip archive that the later cells read their .dta members from.
zip_file: Path = onedrive_path / "Stata Main Redux.zip"
# Raise explicitly rather than `assert`: assertions are removed under
# `python -O`, and a bare AssertionError would not name the missing file.
if not zip_file.is_file():
    raise FileNotFoundError(f"Zip archive not found: {zip_file}")

In [None]:
from io import BytesIO, StringIO

# List the archive's members before reading any of them
with zipfile.ZipFile(zip_file) as archive:
    archive.printdir()


def open_inside_zip(sub_path: str, archive: "Path | None" = None) -> BytesIO:
    """Read one member of a zip archive into an in-memory buffer.

    Args:
        sub_path: Name of the member inside the archive.
        archive: Path of the zip archive to read from. When omitted, the
            module-level ``zip_file`` is used.

    Returns:
        A ``BytesIO`` holding the member's full contents, positioned at
        offset 0 so it can be handed straight to a reader.

    Raises:
        KeyError: If ``sub_path`` is not a member of the archive.
    """
    source = zip_file if archive is None else archive

    with zipfile.ZipFile(source, "r") as z:
        # Build the buffer directly from the member's bytes; a freshly
        # constructed BytesIO already starts at position 0.
        buffer = BytesIO(z.read(sub_path))

    # Print how many GB
    print(f"Read {buffer.getbuffer().nbytes / 1024**3:.2f} GB from {sub_path}")
    return buffer


# Read the `worldscope_yearly_data.dta` member from the archive into a pandas
# DataFrame and display it.
df = pd.read_stata(open_inside_zip("worldscope_yearly_data.dta"))
df

In [None]:
# Build a polars DataFrame from the pandas frame and display it.
pl_df = pl.DataFrame(df)
pl_df

In [None]:
# Column names of the polars frame.
pl_df.columns

In [None]:
# Number of rows per distinct `currency_WS` value, computed on the pandas frame.
df.value_counts("currency_WS")

In [None]:
# Count the rows for each `currency_WS` value and list the largest groups first
# (ascending sort on the "len" column, then reversed).
pl_df.group_by("currency_WS").len().sort("len").reverse()

In [None]:
def read_file(name: str) -> pl.DataFrame:
    """Load a Stata member of the zip archive as a polars DataFrame.

    The member is pulled into memory with ``open_inside_zip``, parsed by
    ``pandas.read_stata`` and the resulting pandas frame is passed to the
    ``pl.DataFrame`` constructor.

    Args:
        name: Name of the member inside the archive.

    Returns:
        The member's contents as a polars DataFrame.
    """
    stata_frame = pd.read_stata(open_inside_zip(name))
    return pl.DataFrame(stata_frame)


# Load the `static_data_ISD_UScomplete.dta` member as a polars frame and display it.
static_data = read_file("static_data_ISD_UScomplete.dta")
static_data

In [None]:
# Number of rows per distinct `TYPE` value, largest groups first.
static_data.group_by("TYPE").len().sort("len").reverse()

In [None]:
# Column names of the static-data frame.
static_data.columns

In [None]:
# Load the `datastream_monthly_stock_data.dta` member as a polars frame and display it.
monthly_returns = read_file("datastream_monthly_stock_data.dta")
monthly_returns

In [None]:
# Number of rows per `dscd`, largest groups first. `GroupBy.count()` is
# deprecated in polars in favour of `GroupBy.len()`, which the other group-by
# cells in this notebook already use; the count column is named "len".
monthly_returns.group_by("dscd").len().sort("len").reverse()

In [None]:
# Keep only the RI_USD column for the rows whose dscd equals "916122".
# NOTE(review): the id is hard-coded — presumably read off the group counts in
# the previous cell; confirm it is the intended security.
first_dscd = monthly_returns.filter(pl.col("dscd") == "916122").select(
    pl.col("RI_USD")
)

# Plot the selected values.
# NOTE(review): no date column is selected above, so the plot has no explicit
# time axis — confirm the rows are already in chronological order.
first_dscd.plot()

In [None]:
# Load the `ds_ws_ibes_merge_timely_6months.dta` member as a polars frame and display it.
all_merged = read_file("ds_ws_ibes_merge_timely_6months.dta")
all_merged

In [None]:
# Column names of the merged frame.
all_merged.columns

In [None]:
# Load the `datastream_monthly_stock_data_unwinsorized_returns.dta` member and
# display it; the result is not bound to a name, so it is only shown here.
read_file("datastream_monthly_stock_data_unwinsorized_returns.dta")