In [1]:
import pandas as pd
import glob
import re
import os

In [26]:
# Directory holding the downloaded JPM workbooks.
DATA_DIR = "./raw_data/"

# Collect every workbook matching JPM_*.xls (the pattern ends in ".xls", so
# ".xlsx" files are not picked up) and order the paths alphabetically.
files = glob.glob(os.path.join(DATA_DIR, "JPM_*.xls"))
files.sort()

print(f"Found {len(files)} files")

Found 16 files


In [27]:
def extract_year_quarter(filename):
    """Parse the reporting year and quarter from a path such as ``JPM_2010Q1.xls``.

    The ``YYYYQn`` tag is looked up in the base name first, so that a
    directory component which happens to contain a similar tag (for example
    ``archive_2019Q4/JPM_2021Q2.xls``) cannot shadow the file's own period.
    The full path is only searched as a fallback, which keeps paths whose tag
    lives solely in a directory name working as before.

    Parameters
    ----------
    filename : str
        File name or path containing a four-digit year followed by ``Q`` and
        a quarter digit from 1 to 4.

    Returns
    -------
    tuple of (int, int)
        ``(year, quarter)``.

    Raises
    ------
    ValueError
        If no ``YYYYQn`` tag is found anywhere in ``filename``.
    """
    pattern = r"(\d{4})Q([1-4])"

    # Prefer the file's own name; fall back to the whole path if needed.
    match = re.search(pattern, os.path.basename(filename)) or re.search(pattern, filename)
    if not match:
        raise ValueError(f"Cannot parse year/quarter from {filename}")

    return int(match.group(1)), int(match.group(2))

In [28]:
# Accumulator for the per-file long-format frames appended by the processing loop.
all_data = list()

In [29]:
# def get_sheet_name(year, quarter):
#     # 2010Q3 is the cutoff
#     # if year in (2011, 2012, 2013):
#     #     return "consolidated financial hig"
#     if (year < 2010) or (year == 2010 and quarter < 3):
#         return "jpmorgan chase  co consoli-1"
#     else:
#         return "consolidated financial hig"

def get_skip_rows(year, quarter):
    """Return how many leading rows to skip when reading a workbook.

    The count depends only on the report year: 6 for years before 2020,
    4 for years after 2023, and 5 for 2020 through 2023. ``quarter`` is
    accepted for call-site symmetry but is not used.
    """
    if year > 2023:
        return 4
    if year < 2020:
        return 6
    return 5

In [32]:
# Row labels to retain from each workbook. The processing loop compares them
# against the first column with an exact `isin` match, so spelling, spacing and
# the typographic quotes (“ ” ’) must match the spreadsheet text character for
# character.
metrics_to_keep = [
    # Revenue, expense, provision and income lines
    "Total net revenue",
    "Total noninterest expense",
    "Pre-provision profit",
    "Provision for credit losses",
    "Income before income tax expense",
    "Income tax expense",
    "Net income",
    # Capital and leverage ratios
    # NOTE(review): both the "CET1" and the "Tier 1 common" labels are listed;
    # presumably the label changed between report vintages — confirm.
    "Common equity Tier 1 (“CET1”) capital ratio",
    "Tier 1 common capital ratio",
    "Tier 1 capital ratio",
    "Total capital ratio",
    "Tier 1 leverage ratio",
    # Loan, asset, deposit, debt and equity levels
    "Loans",
    "Total assets",
    "Deposits",
    "Long-term debt",
    "Common stockholders’ equity",
    # Allowance, charge-off and nonperforming-asset metrics
    "Allowance for credit losses",
    "Allowance for loan losses to total retained loans",
    "Net charge-off rate",
    "Consumer net charge-off rate",
    "Net charge-offs",
    "Nonperforming assets"
]

In [33]:
# df = pd.read_excel(
#             "./raw_data/JPM_2010Q1.xls",
#             sheet_name="consolidated financial hig",
#             skiprows=6  # adjust if needed after checking
#         )
# df.rename(columns={df.columns[0]: "Metric"}, inplace=True)
# # Add time identifiers
# df["Year"] = 2010
# df["Quarter"] = 1
# df["Source"] = "JPM 10-Q"

# df_filtered = df[df["Metric"].isin(metrics_to_keep)]
# # df_long = df_filtered.melt(id_vars="Metric",var_name="Quarter",value_name="Value")
# # df_filtered = df_filtered[df_filtered['Quarter'].str[:7] != "Unnamed"]
# df_filtered = df_filtered[df_filtered.columns.drop(list(df.filter(regex='Unnamed:')))]
# df_filtered

In [34]:
# Start from an empty accumulator every time this cell runs. Without the reset,
# re-executing the cell appends a second copy of every file's rows to the list
# created in an earlier cell.
all_data = []

for file in files:
    year, quarter = extract_year_quarter(file)

    try:
        # The number of header rows above the table varies by report year.
        df = pd.read_excel(
            file,
            sheet_name="consolidated financial hig",
            skiprows=get_skip_rows(year, quarter),
        )

        # The first column carries the row labels; give it a stable name.
        df = df.rename(columns={df.columns[0]: "Metric"})

        # Keep only the rows whose label is an exact match for a wanted metric.
        df_filtered = df[df["Metric"].isin(metrics_to_keep)]

        # Wide -> long: one row per (Metric, column header) pair.
        df_long = df_filtered.melt(id_vars="Metric", var_name="Quarter", value_name="Value")

        # Drop pandas' placeholder "Unnamed: N" columns. The mask is built on a
        # string view of the headers so that non-string headers (numbers,
        # timestamps) do not make the `.str` accessor raise; the stored
        # "Quarter" values themselves are left untouched.
        is_placeholder = df_long["Quarter"].astype(str).str.startswith("Unnamed")
        df_long = df_long[~is_placeholder]

        all_data.append(df_long)

    except Exception as e:
        # Best effort: report the failing workbook and continue with the rest.
        print(f"Error processing {file}: {e}")

In [35]:
# Stack the per-file frames, keep only columns whose header has "Q" as its
# second character (e.g. "1Q10"), and retain the first occurrence of each
# (Metric, Quarter) pair.
jpm_df = (
    pd.concat(all_data, ignore_index=True)
    .loc[lambda frame: frame["Quarter"].str[1] == "Q"]
    .drop_duplicates(subset=["Metric", "Quarter"], keep="first")
)
jpm_df

Unnamed: 0,Metric,Quarter,Value
0,Total net revenue,1Q10,27671
1,Total noninterest expense,1Q10,16124
2,Pre-provision profit,1Q10,11547
3,Provision for credit losses,1Q10,7010
4,Income tax expense,1Q10,1211
...,...,...,...
1814,Common stockholders’ equity,3Q24,324186
1815,Allowance for loan losses to total retained loans,3Q24,1.86%
1816,Nonperforming assets,3Q24,8628
1817,Net charge-offs,3Q24,2087


In [36]:
OUTPUT_PATH = "./output/jpm_retail_portfolio_raw.csv"

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)

jpm_df.to_csv(OUTPUT_PATH, index=False)

print(f"Saved cleaned JPM data to {OUTPUT_PATH}")

Saved cleaned JPM data to ./output/jpm_retail_portfolio_raw.csv
