In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load data
df = pd.read_csv("./output/jpm_retail_portfolio_raw.csv")

# Convert Quarter/Year to datetime
df['Quarter_dt'] = pd.PeriodIndex(df['Quarter'], freq='Q').to_timestamp()

# Check duplicates
duplicates = df[df.duplicated(subset=['Quarter_dt', 'Metric'], keep=False)]

print("Number of duplicate rows:", len(duplicates))
print(duplicates.sort_values(['Quarter_dt', 'Metric']))

Number of duplicate rows: 0
Empty DataFrame
Columns: [Metric, Quarter, Value, Quarter_dt]
Index: []


In [None]:


# Pivot to wide format
df_wide = df.pivot(index='Quarter_dt', columns='Metric', values='Value')

# Sort
df_wide = df_wide.sort_index()

# Basic check
print(df_wide.head())

                                                 Metric Quarter   Value  \
0                                     Total net revenue    1Q10   27671   
1                             Total noninterest expense    1Q10   16124   
2                           Provision for credit losses    1Q10    7010   
3                                    Income tax expense    1Q10    1211   
4                                            Net income    1Q10    3326   
...                                                 ...     ...     ...   
1046                        Common stockholdersâ€™ equity    3Q24  324186   
1047  Allowance for loan losses to total retained loans    3Q24   1.86%   
1048                               Nonperforming assets    3Q24    8628   
1049                                    Net charge-offs    3Q24    2087   
1050                                Net charge-off rate    3Q24   0.65%   

     Quarter_dt  
0    2010-01-01  
1    2010-01-01  
2    2010-01-01  
3    2010-01-01  
4    20

ValueError: Index contains duplicate entries, cannot reshape

In [None]:
## Summary statistics

summary_stats = df_wide.describe().T
summary_stats['skew'] = df_wide.skew()
summary_stats['kurtosis'] = df_wide.kurtosis()

print(summary_stats)

In [None]:
## Time series plots

plt.figure(figsize=(14,8))

for col in df_wide.columns:
    plt.plot(df_wide.index, df_wide[col], label=col)

plt.legend()
plt.title("JPM Portfolio Metrics Over Time")
plt.xlabel("Quarter")
plt.ylabel("Value")
plt.show()

In [None]:
## Individual metric plots

for col in df_wide.columns:
    plt.figure(figsize=(10,5))
    plt.plot(df_wide.index, df_wide[col])
    plt.title(f"{col} Over Time")
    plt.xlabel("Quarter")
    plt.ylabel(col)
    plt.show()

In [None]:
## Distribution Plots

for col in df_wide.columns:
    plt.figure(figsize=(8,4))
    sns.histplot(df_wide[col], kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

In [None]:
## Correlation matrix

corr_matrix = df_wide.corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

In [None]:
## Lag creation

lags = [1, 2, 4]

for col in df_wide.columns:
    for lag in lags:
        df_wide[f"{col}_lag{lag}"] = df_wide[col].shift(lag)

df_wide_lagged = df_wide.dropna()

In [None]:
## QoQ changes

df_growth = df_wide.pct_change()

plt.figure(figsize=(12,6))
for col in df_growth.columns:
    plt.plot(df_growth.index, df_growth[col], label=col)

plt.legend()
plt.title("Quarter-over-Quarter % Change")
plt.show()