In [None]:
from matplotlib import pyplot as plt
import statsmodels.api as sm
from statsmodels.multivariate.pca import PCA

# Apply matplotlib's 'seaborn-v0_8' style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')

In [None]:
# Load the fertility dataset that ships with statsmodels as a DataFrame.
data = sm.datasets.fertility.load_pandas().data

# Quick look at what was loaded: the column labels, then the first five rows.
for preview in (data.columns, data.head(5)):
    print(preview)

In [None]:
# Column labels for the yearly observations: "1960" ... "2011".
columns = [str(year) for year in range(1960, 2012)]

# Build the analysis frame in one chain and leave `data` untouched, so this
# cell can be re-run safely. (Rebinding `data` to its re-indexed version makes
# a second run fail with KeyError, because "Country Name" is then the index
# and no longer a column.)
dta = (
    data
    .set_index("Country Name")  # one row per country
    [columns]                   # keep only the yearly fertility columns
    .dropna()                   # drop countries with any missing year
)

print(dta.head())

PCA (removes the main trend)
- Goal: reduce the yearly fertility rate values to a small number of principal components

In [None]:
# Plot the mean fertility rate per year (averaged over the countries in
# `dta`) as a reference for the main trend.
plt.figure(tight_layout=True)
plt.plot(dta.mean())
plt.xticks(rotation=90)  # year labels are long; rotate them so they don't overlap
plt.xlabel("Year")
plt.ylabel("Fertility rate")
# Render the figure. An argument-less plt.plot() draws nothing and only leaves
# an empty list as the cell's output.
plt.show()

In [None]:
# Fit the PCA on the transposed table, so that each year is a row and each
# country is a column. The data are demeaned but not standardized.
years_by_country = dta.transpose()
pca_model = PCA(years_by_country, demean=True, standardize=False)

In [None]:
# Draw every eigenvalue of the fitted model as a single marker (no connecting
# line) to show how much each component contributes.
eigenvalues = pca_model.eigenvals
eig_fig, eig_ax = plt.subplots(tight_layout=True)
eig_ax.plot(eigenvalues, linestyle='', marker='o')
eig_ax.set_title('First Principal Component dominates')
plt.show()

In [None]:
# Plot the first three PCA factors as overlaid, semi-transparent lines.
factor_fig, factor_ax = plt.subplots(tight_layout=True)
leading_factors = pca_model.factors.iloc[:, :3]
factor_lines = factor_ax.plot(leading_factors, linewidth=3, alpha=0.7)
factor_ax.legend(factor_lines, ['PC1', 'PC2', 'PC3'])
plt.xticks(rotation=90)
plt.show()

In [None]:
# For each of the first three principal components, find the countries whose
# loadings on that component are the largest.
def top_loading_labels(loadings, component, n=5):
    """Return the index labels of the ``n`` largest loadings on one component.

    Parameters
    ----------
    loadings : pandas.DataFrame
        Loadings table; rows are labelled items, columns are components.
    component : int
        Zero-based column position of the component to rank by.
    n : int, default 5
        Number of labels to return.

    Returns
    -------
    pandas.Index
        Labels ordered by ascending loading, so the largest one comes last.
    """
    # Rank on the raw values so the ordering is purely positional, then map
    # the positions back through the loadings' own index. This keeps the
    # values and the labels tied to the same source.
    order = loadings.iloc[:, component].to_numpy().argsort()
    # max(..., 0) keeps n == 0 (empty result) and n > len (everything) sane.
    return loadings.index[order[max(len(order) - n, 0):]]


labels_1 = top_loading_labels(pca_model.loadings, 0)
labels_2 = top_loading_labels(pca_model.loadings, 1)
labels_3 = top_loading_labels(pca_model.loadings, 2)

for pc_name, pc_labels in (('PC1', labels_1), ('PC2', labels_2), ('PC3', labels_3)):
    print(f'{pc_name}:', pc_labels.values)

In [None]:
# One panel per principal component: the five selected countries for that
# component, with the overall mean trend drawn on top for comparison.
fig, ax = plt.subplots(
    nrows=3,
    ncols=1,
    figsize=[10, 10],
    sharex=True,
    sharey=True,
    tight_layout=True,
)

mean_trend = dta.mean()
for panel, panel_labels in zip(ax, (labels_1, labels_2, labels_3)):
    panel.plot(dta.loc[panel_labels].T)  # one line per selected country
    panel.plot(mean_trend)               # reference line, drawn last
    panel.set_title(', '.join(panel_labels.values))

plt.xticks(rotation=90)
plt.show()

In [None]:
# Scatter the loadings on the first two components against each other and
# name, in the title, the items whose PC2 loading exceeds the threshold.
pc2_threshold = 0.2  # single source for both the mask and the title text

# Select components by position, as the rest of the notebook does, rather than
# through the generated column names (comp_00, comp_01, ...), which depend on
# how statsmodels pads the component numbers.
pc1_loadings = pca_model.loadings.iloc[:, 0]
pc2_loadings = pca_model.loadings.iloc[:, 1]

plt.figure()
plt.scatter(pc1_loadings.values, pc2_loadings.values)
plt.xlabel('PC1')
plt.ylabel('PC2')

# Take the labels from the same Series the mask is built on, so the values and
# the labels cannot get out of step.
outliers = pc2_loadings[pc2_loadings > pc2_threshold].index.values
plt.title(f'PC2 > {pc2_threshold}: ' + ', '.join(outliers))
plt.show()
