Required Libraries

In [1]:
import requests
import pandas
import numpy
import seaborn
import time
import json
import glob
import os
import matplotlib.pyplot
from tqdm import tqdm
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
def plot_cumulative_distribution(data_dict):
    """Plot how series availability and average popularity build up over time.

    For every calendar day between the earliest and the latest
    ``observation_start`` found in ``data_dict`` two curves are drawn:

    * the percentage of all series whose ``observation_start`` is on or
      before that day, and
    * the mean ``popularity`` of those series, rescaled so that the largest
      daily mean equals 100.

    Parameters
    ----------
    data_dict : dict
        Mapping whose values are dicts with at least the keys
        ``'observation_start'`` (a ``'%Y-%m-%d'`` string) and ``'popularity'``
        (numeric).

    Raises
    ------
    ValueError
        If ``data_dict`` is empty, or a start date does not match
        ``'%Y-%m-%d'`` (raised by ``datetime.strptime``).
    """
    if not data_dict:
        raise ValueError("data_dict is empty; there are no series to plot")

    start_dates_df = pandas.DataFrame({
        'start_date': [datetime.strptime(info['observation_start'], '%Y-%m-%d') for info in data_dict.values()],
        'popularity': [info['popularity'] for info in data_dict.values()],
    })
    total_series = len(start_dates_df)
    full_date_range = pandas.date_range(start_dates_df['start_date'].min(), start_dates_df['start_date'].max())

    # Aggregate once per distinct start date, spread onto the daily calendar
    # and accumulate.  This replaces a full scan of every series for every
    # calendar day with a single pass.
    #   size  -> number of series starting that day
    #   sum   -> their summed popularity (missing values skipped)
    #   count -> how many of them have a non-missing popularity
    running_totals = (
        start_dates_df.groupby('start_date')['popularity']
        .agg(['size', 'sum', 'count'])
        .reindex(index=full_date_range, fill_value=0)
        .cumsum()
    )

    full_date_df = pandas.DataFrame({'date': full_date_range})
    full_date_df['cumulative_count'] = running_totals['size'].to_numpy()
    full_date_df['cumulative_percentage'] = (full_date_df['cumulative_count'] / total_series) * 100
    # Running mean = running sum / running non-missing count (NaN while the
    # count is still zero, matching the mean of an all-missing selection).
    full_date_df['average_popularity'] = (running_totals['sum'] / running_totals['count']).to_numpy()

    # Rescale the running mean so both curves share a 0-100 axis.
    max_average_popularity = full_date_df['average_popularity'].max()
    full_date_df['average_popularity_percentage'] = (full_date_df['average_popularity'] / max_average_popularity) * 100

    matplotlib.pyplot.figure(figsize=(14, 7))
    matplotlib.pyplot.plot(full_date_df['date'], full_date_df['cumulative_percentage'], label='Cumulative Percentage')
    matplotlib.pyplot.plot(full_date_df['date'], full_date_df['average_popularity_percentage'], label='Average Popularity (%)', color='orange')
    matplotlib.pyplot.xlabel('Date')
    matplotlib.pyplot.ylabel('Cumulative Percentage / Average Popularity (%)')
    matplotlib.pyplot.title('Cumulative Distribution and Average Popularity')
    matplotlib.pyplot.legend()
    matplotlib.pyplot.grid(True)
    matplotlib.pyplot.show()

https://fred.stlouisfed.org/categories

Get FRED Series

In [None]:
def process_json_file(json_file, start_criteria='2000-01-01', end_criteria='2024-01-01'):
    """Turn one series-metadata JSON file into a CSV of FRED data.

    The JSON file is loaded as a dict of series metadata.  Only series whose
    ``observation_start`` is on or before ``start_criteria`` and whose
    ``observation_end`` is on or after ``end_criteria`` are kept (the values
    are compared as strings).  The surviving series are handed to
    ``create_FRED_dataframe`` (defined elsewhere in the notebook) and, when
    that returns something other than ``None``, the result is written next to
    the input file as ``<input name without extension>_Dataframe.csv``.

    Parameters
    ----------
    json_file : str
        Path of the JSON metadata file.
    start_criteria : str
        Latest acceptable ``observation_start``; also passed on as
        ``start_date`` to ``create_FRED_dataframe``.
    end_criteria : str
        Earliest acceptable ``observation_end``.
    """
    with open(json_file, 'r') as handle:
        series_by_id = json.load(handle)

    def covers_window(meta):
        # A series qualifies when its observations span the whole window.
        return meta['observation_start'] <= start_criteria and meta['observation_end'] >= end_criteria

    selected = {series_id: meta for series_id, meta in series_by_id.items() if covers_window(meta)}

    frame = create_FRED_dataframe(selected, start_date=start_criteria)

    # Nothing to save when the dataframe could not be built.
    if frame is None:
        print(f"Failed to create dataframe for {json_file}")
        return

    path_without_ext, _ = os.path.splitext(json_file)
    csv_path = path_without_ext + '_Dataframe.csv'
    frame.to_csv(csv_path, index=True)
    print(f"Processed {json_file} and saved to {csv_path}")

In [None]:
# Build one CSV per sampling frequency from the series-metadata JSON files.
# Each call is its own statement: previously the Daily call and the Quarterly
# path assignment were fused on a single line, which is a syntax error.

# Daily series use an explicit end cutoff of 2023-12-29 instead of the
# function default ('2024-01-01') that the other three files rely on.
file_path = r"C:\Users\User\OneDrive\Desktop\UNC\Fall 2024\BUSI 880\Project\Daily_Series.json"
process_json_file(file_path, start_criteria='2000-01-01', end_criteria='2023-12-29')

file_path = r"C:\Users\User\OneDrive\Desktop\UNC\Fall 2024\BUSI 880\Project\Quarterly_Series.json"
process_json_file(file_path)

file_path = r"C:\Users\User\OneDrive\Desktop\UNC\Fall 2024\BUSI 880\Project\Monthly_Series.json"
process_json_file(file_path)

file_path = r"C:\Users\User\OneDrive\Desktop\UNC\Fall 2024\BUSI 880\Project\Annual_Series.json"
process_json_file(file_path)

Categories of Data

In [None]:
# Period-over-period percentage changes of the FRED series; infinite ratios
# are turned into missing values so that they are dropped with the other
# incomplete rows before scaling.
# NOTE(review): `Daily_Data` and `create_FRED_dataframe` are not defined in the
# cells visible here -- confirm they exist earlier in the notebook.
FRED_Data = (
    create_FRED_dataframe(Daily_Data)
    .pct_change()
    .replace([numpy.inf, -numpy.inf], numpy.nan)
)

# Standardise the complete rows (required before PCA).
FRED_Data_scaled = StandardScaler().fit_transform(FRED_Data.dropna())

# Correlation matrix of the standardised series, shown as a heatmap.
scaled_frame = pandas.DataFrame(FRED_Data_scaled, columns=FRED_Data.columns)
correlation_matrix = scaled_frame.corr()
seaborn.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    xticklabels=True,
    yticklabels=True,
)
matplotlib.pyplot.xticks(fontsize=5)
matplotlib.pyplot.yticks(fontsize=5)
matplotlib.pyplot.title('Correlation Matrix')
matplotlib.pyplot.show()

# Fit a 10-component PCA on the standardised data.
pca = PCA(n_components=10)
principal_components = pca.fit_transform(FRED_Data_scaled)

# One column per component, named PC1 ... PC10.
principal_df = pandas.DataFrame(
    data=principal_components,
    columns=[f'PC{number}' for number in range(1, 11)],
)

# Bar chart of the share of variance (in percent) captured by each component.
explained_variance = pca.explained_variance_ratio_ * 100
component_numbers = range(1, len(explained_variance) + 1)
matplotlib.pyplot.bar(component_numbers, explained_variance)
matplotlib.pyplot.xlabel('Principal Component')
matplotlib.pyplot.ylabel('Variance Explained (%)')
matplotlib.pyplot.title('Variance Explained by Principal Components')
matplotlib.pyplot.show()

In [None]:
# How many leading principal components to inspect.
K = 3

# Loadings of the first K components: one row per component, one column per
# original variable.
components = pca.components_[:K]
components_df = pandas.DataFrame(
    components,
    index=[f'PC{number}' for number in range(1, K + 1)],
    columns=FRED_Data.columns,
)

# Grouped bar chart: variables on the x-axis, one bar per component.
components_df.transpose().plot.bar(figsize=(14, 8))
matplotlib.pyplot.xlabel('Variables')
matplotlib.pyplot.ylabel('Principal Component Loading')
matplotlib.pyplot.title(f'Principal Component Loadings for First {K} PCs')
matplotlib.pyplot.legend(title='Principal Components')
matplotlib.pyplot.show()

In [None]:
def plot_pca_histograms(df, K):
    """Plot a histogram of each of the first K principal components of ``df``.

    Infinite values are treated as missing, rows with any missing value are
    dropped, the remaining data are standardised with ``StandardScaler`` and a
    K-component ``PCA`` is fitted.  One histogram (1000 bins) is drawn per
    component, side by side.

    The input frame is left untouched: cleaning happens on a copy, so calling
    this function no longer rewrites the caller's data as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one column per variable.
    K : int
        Number of principal components to compute and plot.
    """
    # Clean a copy rather than mutating the caller's frame in place.
    cleaned = df.replace([numpy.inf, -numpy.inf], numpy.nan).dropna()

    # Standardize the data before applying PCA
    df_scaled = StandardScaler().fit_transform(cleaned)

    # Perform PCA
    pca = PCA(n_components=K)
    principal_components = pca.fit_transform(df_scaled)

    # Create a DataFrame with the principal components
    pc_names = [f'PC{i+1}' for i in range(K)]
    principal_df = pandas.DataFrame(data=principal_components, columns=pc_names)

    # squeeze=False always returns a 2-D array of axes; without it a single
    # column (K == 1) yields a bare Axes object that cannot be indexed.
    fig, axes = matplotlib.pyplot.subplots(nrows=1, ncols=K, figsize=(14, 6), squeeze=False)
    for ax, pc_name in zip(axes[0], pc_names):
        ax.hist(principal_df[pc_name], bins=1000, edgecolor='black')
        ax.set_xlabel(f'{pc_name} Values')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Distribution of {pc_name} Values')

    matplotlib.pyplot.tight_layout()
    matplotlib.pyplot.show()

# Example usage: histogram the first K principal components computed from FRED_Data.
plot_pca_histograms(FRED_Data, K)

In [None]:
def create_transition_matrix(principal_df, K):
    """Estimate and plot a Markov transition matrix between discretised PC states.

    Each of the first K principal components is placed in one of four bands
    relative to that component's own sample mean and standard deviation:

        -2 : value <  mean - std
        -1 : mean - std <= value <= mean
        +1 : mean <  value <= mean + std
        +2 : value >  mean + std

    A state is the K-tuple of band labels, giving ``4 ** K`` states.
    Consecutive rows of ``principal_df`` are treated as consecutive
    observations; transitions between their states are counted and each row
    of counts is normalised to probabilities.  The matrix is shown as a
    heatmap and returned.

    Parameters
    ----------
    principal_df : pandas.DataFrame
        Must contain the columns ``'PC1'`` ... ``'PC{K}'``.
    K : int
        Number of leading principal components that define a state.

    Returns
    -------
    pandas.DataFrame
        ``4 ** K`` by ``4 ** K`` frame, rows labelled 'From' and columns 'To'.
        Labels are the signed band values joined by spaces (e.g. ``'-2 +1'``).
        Rows for states that never occur as a source contain NaN, because no
        probability can be estimated for them.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 or a used component contains NaN.
    KeyError
        If one of the required ``PC`` columns is missing.
    """
    if K < 1:
        raise ValueError("K must be at least 1")

    # Band labels; a value's position in this list is its band code (0..3).
    std_ranges = [-2, -1, 1, 2]
    num_bands = len(std_ranges)
    num_states = num_bands ** K

    pc_names = [f'PC{i+1}' for i in range(K)]
    values = principal_df[pc_names].to_numpy(dtype=float)
    if numpy.isnan(values).any():
        raise ValueError("principal components contain NaN values and cannot be assigned to a state")

    means = numpy.array([principal_df[name].mean() for name in pc_names], dtype=float)
    stds = numpy.array([principal_df[name].std() for name in pc_names], dtype=float)

    # Each threshold a value clears moves it up one band, so adding the three
    # boolean tests yields the band code directly.
    band_codes = (
        (values >= means - stds).astype(int)
        + (values > means).astype(int)
        + (values > means + stds).astype(int)
    )

    # Read the K band codes as digits of a base-4 number, PC1 most significant.
    place_values = num_bands ** numpy.arange(K - 1, -1, -1)
    state_index = band_codes @ place_values

    # Count transitions between consecutive observations; add.at accumulates
    # correctly when the same (from, to) pair occurs more than once.
    transition_counts = numpy.zeros((num_states, num_states))
    numpy.add.at(transition_counts, (state_index[:-1], state_index[1:]), 1)

    # Normalize to get probabilities.  States never seen as a source have a
    # zero row total; 0/0 is deliberately left as NaN and the floating-point
    # warning for it is silenced.
    row_totals = transition_counts.sum(axis=1, keepdims=True)
    with numpy.errstate(divide='ignore', invalid='ignore'):
        transition_matrix = transition_counts / row_totals

    def state_label(state):
        # Decode the base-4 digits back to signed band labels, e.g. "-2 +1 +2".
        digits = [(state // (num_bands ** (K - j - 1))) % num_bands for j in range(K)]
        return " ".join(f"{std_ranges[digit]:+d}" for digit in digits)

    # Convert to DataFrame for better readability
    index_labels = [state_label(state) for state in range(num_states)]
    transition_df = pandas.DataFrame(transition_matrix, index=index_labels, columns=index_labels)
    transition_df.index.name = 'From'
    transition_df.columns.name = 'To'

    # Plot the transition matrix as a heatmap
    matplotlib.pyplot.figure(figsize=(12, 10))
    seaborn.heatmap(transition_df, annot=False, fmt='0.2f', cmap='Greys', cbar=True, xticklabels=True, yticklabels=True, vmin=0, vmax=1)
    matplotlib.pyplot.title('Markov Transition Matrix')
    matplotlib.pyplot.show()

    return transition_df

# Example usage: build, plot and keep the transition matrix for the first K components of principal_df.
transition_df = create_transition_matrix(principal_df, K)