In [None]:
import requests
import pandas
import numpy
import seaborn
import time
import json
import os
import glob
import matplotlib.pyplot
from datetime import datetime
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
def get_FRED_data(series_id, retries=5):
    """Download one FRED series and return it as a daily, linearly interpolated frame.

    Parameters
    ----------
    series_id : str
        FRED series identifier; also used as the name of the single output column.
    retries : int, default 5
        Maximum number of HTTP attempts before giving up.

    Returns
    -------
    pandas.DataFrame or None
        A frame indexed by every calendar day between the first and last
        observation date, with one column named ``series_id``. Values that are
        not numeric in the payload are coerced to NaN and then filled by linear
        interpolation together with the days that had no observation.
        ``None`` is returned when every attempt fails or when the response
        contains no observations.
    """
    # NOTE(review): the fallback key below is committed in the notebook. Prefer
    # setting the FRED_API_KEY environment variable, then rotate and remove it.
    API_KEY = os.environ.get('FRED_API_KEY', 'b4faab7a30a17140d246cce49bbf42ac')
    BASE_URL = 'https://api.stlouisfed.org/fred/series/observations'
    parameters = {
        'series_id': series_id,
        'api_key': API_KEY,
        'file_type': 'json'
    }

    for attempt in range(retries):
        try:
            # A timeout keeps a stalled connection from hanging the notebook forever.
            response = requests.get(BASE_URL, params=parameters, timeout=30)
        except requests.RequestException as error:
            # Transient network failures consume one attempt instead of crashing.
            print(f"Request for series ID {series_id} failed: {error}")
            continue

        if response.status_code == 200:
            observations = response.json().get('observations')
            if not observations:
                print(f"No observations returned for series ID {series_id}.")
                return None

            # Keep only the two fields we need and give them proper dtypes.
            df = pandas.DataFrame(observations)[['date', 'value']]
            df['date'] = pandas.to_datetime(df['date'])
            df['value'] = pandas.to_numeric(df['value'], errors='coerce')

            # Expand to a full daily calendar, then fill the gaps linearly.
            full_dates = pandas.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
            daily = df.set_index('date')['value'].reindex(full_dates)
            return daily.interpolate(method='linear').to_frame(name=series_id)

        elif response.status_code == 429:
            wait_time = 10
            print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            print(f"Failed to fetch data for series ID {series_id}. Status code: {response.status_code}")

    print(f"Failed to fetch data for series ID {series_id} after {retries} retries.")
    return None

In [None]:
def create_FRED_dataframe(input_data, start_criteria='2000-01-01'):
    """Fetch one or more FRED series and outer-join them on their daily date index.

    Parameters
    ----------
    input_data : dict or str
        Either a mapping ``{series_id: series_info}`` where each ``series_info``
        provides an ``'observation_start'`` date, or a single series ID string.
        A bare string carries no metadata, so it is fetched without the
        observation-start screen.
    start_criteria : str, default '2000-01-01'
        Series whose ``observation_start`` is later than this date are skipped,
        and every fetched series is truncated to rows on or after this date.

    Returns
    -------
    pandas.DataFrame or None
        One column per successfully fetched series, or ``None`` if no series
        was fetched.

    Raises
    ------
    ValueError
        If ``input_data`` is neither a dict nor a str.
    """
    if isinstance(input_data, dict):
        series_ids = list(input_data.keys())
        has_metadata = True
    elif isinstance(input_data, str):
        series_ids = [input_data]
        has_metadata = False
    else:
        raise ValueError("Input must be either a dictionary or a string representing a series ID.")

    # Compare real timestamps rather than raw strings so the screen does not
    # depend on both dates sharing an identical text format.
    start_date = pandas.to_datetime(start_criteria)

    df = None
    for series_id in series_ids:
        if has_metadata:
            observation_start = input_data[series_id]['observation_start']
            if pandas.to_datetime(observation_start) > start_date:
                print(f"Skipping series ID {series_id} due to observation start date {observation_start}.")
                continue

        data = get_FRED_data(series_id)
        if data is None:
            print(f"Failed to fetch data for series ID {series_id}.")
            continue

        data = data[data.index >= start_date]
        if df is None:
            df = data
        else:
            df = df.merge(data, left_index=True, right_index=True, how='outer')

    return df

In [None]:
# Read the quarterly series data from a JSON file.
# NOTE(review): the default below is a machine-specific absolute path; set the
# QUARTERLY_SERIES_PATH environment variable to run the notebook elsewhere.
QUARTERLY_SERIES_PATH = os.environ.get(
    "QUARTERLY_SERIES_PATH",
    r"C:\Users\simeo\OneDrive\Attachments\Projects\Independent\Quarterly_Series.json",
)
DF = pandas.read_json(QUARTERLY_SERIES_PATH)

# Convert levels to percent changes and standardize before applying PCA.
# Rows with missing values are dropped first; infinities produced by a zero
# denominator and the undefined first-row change are both set to 0.
DF = (
    DF.dropna()
    .pct_change()
    .mul(100)
    .replace([numpy.inf, -numpy.inf], numpy.nan)
    .fillna(0)
)
FRED_Data_scaled = StandardScaler().fit_transform(DF)

# Plot the correlation matrix
correlation_matrix = pandas.DataFrame(FRED_Data_scaled, columns=DF.columns).corr()
seaborn.heatmap(correlation_matrix, annot=False, cmap='coolwarm', xticklabels=True, yticklabels=True)
matplotlib.pyplot.xticks(fontsize=5)
matplotlib.pyplot.yticks(fontsize=5)
matplotlib.pyplot.title('Correlation Matrix')
matplotlib.pyplot.show()

# Perform PCA. Up to 10 components are kept; the count is capped by the data's
# dimensions because PCA cannot extract more components than
# min(n_samples, n_features).
N_COMPONENTS = min(10, *FRED_Data_scaled.shape)
pca = PCA(n_components=N_COMPONENTS)
principal_components = pca.fit_transform(FRED_Data_scaled)

# Create a DataFrame with the principal components, labelled PC1..PCn
principal_df = pandas.DataFrame(
    data=principal_components,
    columns=[f'PC{i + 1}' for i in range(principal_components.shape[1])],
)

# Plot a bar chart of the variance explained by each principal component
explained_variance = pca.explained_variance_ratio_ * 100
matplotlib.pyplot.bar(range(1, len(explained_variance) + 1), explained_variance)
matplotlib.pyplot.xlabel('Principal Component')
matplotlib.pyplot.ylabel('Variance Explained (%)')
matplotlib.pyplot.title('Variance Explained by Principal Components')
matplotlib.pyplot.show()

In [None]:
# Number of leading principal components whose loadings are inspected
K = 3
components = pca.components_[:K]
components_df = pandas.DataFrame(
    data=components,
    index=['PC' + str(rank) for rank in range(1, K + 1)],
    columns=DF.columns,
)

# Draw the loadings as grouped bars: one group per input variable, one bar per PC
components_df.transpose().plot.bar(figsize=(14, 8))
matplotlib.pyplot.title(f'Principal Component Loadings for First {K} PCs')
matplotlib.pyplot.xlabel('Variables')
matplotlib.pyplot.ylabel('Principal Component Loading')
matplotlib.pyplot.legend(title='Principal Components')
matplotlib.pyplot.show()

In [None]:
def plot_pca_histograms(df, K, bins=1000):
    """Plot side-by-side histograms of the first K principal-component columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the component scores in columns named ``PC1``, ``PC2``, ...
    K : int
        Number of leading components to plot (one subplot each).
    bins : int, default 1000
        Number of histogram bins passed to ``Axes.hist``.
    """
    # squeeze=False always yields a 2-D grid of axes, so K == 1 works too.
    fig, axes = matplotlib.pyplot.subplots(nrows=1, ncols=K, figsize=(14, 6), squeeze=False)
    for i, ax in enumerate(axes[0], start=1):
        # Read from the frame that was passed in, not from a notebook global.
        ax.hist(df[f'PC{i}'], bins=bins, edgecolor='black')
        ax.set_xlabel(f'PC{i} Values')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Distribution of PC{i} Values')

    matplotlib.pyplot.tight_layout()
    matplotlib.pyplot.show()

# Example usage: show the score distributions of the first K principal components
plot_pca_histograms(principal_df, K)

In [None]:
def create_transition_matrix(principal_df, K):
    """Estimate and plot a first-order Markov transition matrix over banded PC scores.

    Each of the first ``K`` principal components is bucketed into one of four
    bands relative to its own mean and sample standard deviation:

        -2 : value <  mean - std
        -1 : mean - std <= value <= mean
        +1 : mean <  value <= mean + std
        +2 : value >  mean + std

    A joint state is the tuple of bands across PC1..PCK (``4 ** K`` states,
    PC1 being the most significant position). Transitions between consecutive
    rows are counted and each row of counts is normalised to probabilities.
    States that never occur as a "from" state keep an all-zero row rather
    than producing NaN from a division by zero.

    Parameters
    ----------
    principal_df : pandas.DataFrame
        Frame with score columns named ``PC1`` .. ``PC{K}``; row order defines
        the time order of the transitions.
    K : int
        Number of leading principal components that make up the joint state.

    Returns
    -------
    pandas.DataFrame
        Square frame of transition probabilities; index ("From") and columns
        ("To") are labelled with the space-separated bands, e.g. ``"-1 +2 +1"``.

    Raises
    ------
    ValueError
        If any of the first ``K`` component columns contains a missing value.
    """
    from itertools import product

    std_ranges = [-2, -1, 1, 2]
    num_levels = len(std_ranges)
    num_states = num_levels ** K

    # Encode every row as one integer state, treating the per-PC band
    # positions as digits of a base-4 number (PC1 first).
    state_index = numpy.zeros(len(principal_df), dtype=int)
    for j in range(K):
        pc = principal_df[f'PC{j+1}']
        values = pc.to_numpy(dtype=float)
        if numpy.isnan(values).any():
            raise ValueError(f"PC{j+1} contains missing values; cannot assign a state to every row.")
        mean, std = pc.mean(), pc.std()
        # Counting how many thresholds a value passes yields its band position
        # 0..3; the mixed >= / > comparisons reproduce the band edges above.
        level = (
            (values >= mean - std).astype(int)
            + (values > mean)
            + (values > mean + std)
        )
        state_index = state_index * num_levels + level

    # Count transitions between consecutive rows; add.at accumulates repeats.
    counts = numpy.zeros((num_states, num_states))
    numpy.add.at(counts, (state_index[:-1], state_index[1:]), 1)

    # Normalize to get probabilities, leaving never-visited rows at zero.
    row_totals = counts.sum(axis=1, keepdims=True)
    transition_matrix = numpy.divide(
        counts, row_totals, out=numpy.zeros_like(counts), where=row_totals > 0
    )

    # Convert to DataFrame for better readability; product() enumerates the
    # band tuples in the same order as the integer state encoding.
    index_labels = [
        " ".join(f"+{band}" if band > 0 else f"{band}" for band in combo)
        for combo in product(std_ranges, repeat=K)
    ]
    transition_df = pandas.DataFrame(transition_matrix, index=index_labels, columns=index_labels)
    transition_df.index.name = 'From'
    transition_df.columns.name = 'To'

    # Plot the transition matrix as a heatmap
    matplotlib.pyplot.figure(figsize=(12, 10))
    seaborn.heatmap(transition_df, annot=False, fmt='0.2f', cmap='Greys', cbar=True, xticklabels=True, yticklabels=True, vmin=0, vmax=1)
    matplotlib.pyplot.title('Markov Transition Matrix')
    matplotlib.pyplot.show()

    return transition_df

# Example usage: build and plot the transition matrix for the first K principal components
transition_df = create_transition_matrix(principal_df, K)
