In [1]:
import pandas as pd
import glob
import re
import os

In [2]:
# Folder containing JPM Excel files
DATA_DIR = "./raw_data/"

# Grab all JPM_YYYYQQ.xlsx files
files = sorted(glob.glob(os.path.join(DATA_DIR, "JPM_*.xls")))

print(f"Found {len(files)} files")

Found 35 files


In [11]:
def extract_year_quarter(filename):
    match = re.search(r"(\d{4})Q([1-4])", filename)
    if not match:
        raise ValueError(f"Cannot parse year/quarter from {filename}")

    year = int(match.group(1))
    quarter = int(match.group(2))

    return year, quarter

In [54]:
all_data = []

In [50]:
def get_sheet_name(year, quarter):
    # 2010Q3 is the cutoff
    # if year in (2011, 2012, 2013):
    #     return "consolidated financial hig"
    if (year < 2010) or (year == 2010 and quarter < 3):
        return "jpmorgan chase  co consoli-1"
    else:
        return "consolidated financial hig"

def get_skip_rows(year, quarter):
    # if year in (2011, 2012, 2013):
    #     return 6
    if (year < 2010) or (year == 2010 and quarter < 3):
        return 5
    else:
        return 6

In [51]:
metrics_to_keep = [
    "Net charge-off rate",
    "Consumer net charge-off rate",
    "Net charge-offs",
    "Nonperforming assets"
]

In [47]:
df = pd.read_excel(
            "./raw_data/JPM_2013Q1.xls",
            sheet_name="consolidated financial hig",
            skiprows=6  # adjust if needed after checking
        )
df.rename(columns={df.columns[0]: "Metric"}, inplace=True)
# Add time identifiers
df["Year"] = 2013
df["Quarter"] = 1
df["Source"] = "JPM 10-Q"

df_filtered = df[df["Metric"].isin(metrics_to_keep)]
df_long = df_filtered.melt(id_vars="Metric",var_name="Quarter",value_name="Value")
df_long = df_long[df_long['Quarter'].str[:7] != "Unnamed"]
df_long

Unnamed: 0,Metric,Quarter,Value
0,Nonperforming assets,1Q13,11584
1,Net charge-offs,1Q13,1725
2,Net charge-off rate,1Q13,0.97%
9,Nonperforming assets,4Q12,11734
10,Net charge-offs,4Q12,1628
11,Net charge-off rate,4Q12,0.90%
18,Nonperforming assets,3Q12,12481
19,Net charge-offs,3Q12,2770
20,Net charge-off rate,3Q12,1.53%
27,Nonperforming assets,2Q12,11397


In [55]:
for file in files:
    year, quarter = extract_year_quarter(file)

    try:
        df = pd.read_excel(
            file,
            sheet_name=get_sheet_name(year, quarter),
            skiprows=get_skip_rows(year, quarter)  # adjust if needed after checking
        )

        # # Keep only relevant columns
        # df = df[COLUMNS_TO_KEEP].copy()

        df.rename(columns={df.columns[0]: "Metric"}, inplace=True)

        # Add time identifiers
        df["Year"] = year
        df["Quarter"] = quarter
        df["Source"] = "JPM 10-Q"

        df_filtered = df[df["Metric"].isin(metrics_to_keep)]
        df_long = df_filtered.melt(id_vars="Metric",var_name="Quarter",value_name="Value")
        df_long = df_long[df_long['Quarter'].str[:7] != "Unnamed"]

        all_data.append(df_long)

    except Exception as e:
        print(f"Error processing {file}: {e}")

In [56]:
jpm_df = pd.concat(all_data, ignore_index=True)

jpm_df

Unnamed: 0,Metric,Quarter,Value
0,Nonperforming assets,1Q10,19019.0
1,Net charge-offs,1Q10,
2,Net charge-off rate,1Q10,
3,Consumer net charge-off rate,1Q10,
4,Nonperforming assets,4Q09,19741.0
...,...,...,...
684,Net charge-offs,Quarter,2
685,Net charge-off rate,Quarter,2
686,Nonperforming assets,Source,JPM 10-Q
687,Net charge-offs,Source,JPM 10-Q


In [57]:
jpm_df["Quarter_Num"] = jpm_df["Quarter"].str.replace("Q", "").astype(int)

# Create a proper date column
jpm_df["Date"] = pd.PeriodIndex(
    year=jpm_df["Year"],
    quarter=jpm_df["Quarter_Num"],
    freq="Q"
).to_timestamp()

ValueError: invalid literal for int() with base 10: 'Year'

In [None]:
OUTPUT_PATH = "/output/jpm_retail_portfolio_raw.csv"

jpm_df.to_csv(OUTPUT_PATH, index=False)

print(f"Saved cleaned JPM data to {OUTPUT_PATH}")