In [1]:
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fredapi import Fred
from scipy import signal
from scipy.stats import spearmanr
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose


# Set plotting style: apply seaborn's "whitegrid" theme to the figures drawn below
sns.set_theme(style="whitegrid")

In [6]:
# Initialize FRED API.
# The key is read from the environment so the credential is never stored in
# the notebook. A key that was ever committed in this cell should be treated
# as compromised and rotated.
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError(
        "FRED_API_KEY is not set; export your FRED API key before running this notebook."
    )
fred = Fred(api_key=fred_api_key)

# Define economic indicators to fetch (FRED series id -> human-readable name)
indicators = {
    'GDP': 'Gross Domestic Product',
    'UNRATE': 'Unemployment Rate',
    'FEDFUNDS': 'Federal Funds Rate',
    'CPIAUCSL': 'Consumer Price Index',
    'GS10': '10-Year Treasury Rate'
}

# Fetch data from FRED
start_date = '1960-01-01'
end_date = '2024-01-01'

# One column per series id. The first series assigned (GDP) establishes the
# frame's index; every later series is aligned to that index on assignment,
# so only observations falling on GDP's dates are kept.
fred_data = pd.DataFrame()
for series_id in indicators:
    fred_data[series_id] = fred.get_series(series_id, start_date, end_date)

fred_data.head()

Unnamed: 0,GDP,UNRATE,FEDFUNDS,CPIAUCSL,GS10
1960-01-01,542.648,5.2,3.99,29.37,4.72
1960-04-01,541.08,5.2,3.92,29.54,4.28
1960-07-01,545.604,5.5,3.23,29.55,3.9
1960-10-01,540.197,6.1,2.47,29.75,3.89
1961-01-01,545.018,6.6,1.45,29.84,3.84


In [7]:
# Load faculty hiring data (replace this with your actual data).
# NOTE(review): the output recorded for this cell is
# "ValueError: 'Year' is not in list (sheet: 0)", i.e. the first sheet has no
# header literally named 'Year' -- confirm the header row / column name in the
# workbook and adjust `index_col` accordingly.
faculty_data = pd.read_excel('../../data-collection/NCES_data/cleaned_1970-2022.xlsx', index_col='Year', parse_dates=True)
faculty_data['Faculty'] = faculty_data['Full-time'].astype(int)

# Period-over-period change in full-time faculty; the first row has no
# predecessor and is therefore NaN.
faculty_data['new_hires'] = faculty_data['Faculty'].diff()

# Stand-alone series used for the merge. It is named 'faculty_changes' so the
# seasonally adjusted column becomes 'faculty_changes_adjusted', which is the
# target column the later cells look up. Dropping NaN has to happen on this
# separate series: assigning a shortened series back into `faculty_data`
# would re-align it to the index and restore the NaN.
faculty_changes = faculty_data['new_hires'].dropna().rename('faculty_changes')

# Merge datasets on the date index, keep the last observation of each month,
# and retain only months where every series has a value
data = pd.concat([fred_data, faculty_changes], axis=1).resample('M').last()
data = data.dropna()

print("Data shape:", data.shape)
data.head()

ValueError: 'Year' is not in list (sheet: 0)

In [None]:
# Function to apply seasonal adjustment
def apply_seasonal_adjustment(series):
    """Return the trend of a multiplicative seasonal decomposition of ``series``.

    The decomposition uses a fixed period of 12 observations. If it cannot be
    computed, the input is returned unchanged and a ``RuntimeWarning`` reports
    why, so a silent fallback can no longer masquerade as an adjusted series.

    Parameters
    ----------
    series : pandas.Series
        Series to decompose.

    Returns
    -------
    pandas.Series
        ``decomposition.trend`` on success, otherwise ``series`` itself.
    """
    try:
        decomposition = seasonal_decompose(series, period=12, model='multiplicative')
    except Exception as exc:
        # Best-effort fallback is kept, but only for ordinary errors:
        # KeyboardInterrupt / SystemExit now propagate, and the reason for the
        # fallback is surfaced instead of being discarded.
        warnings.warn(
            f"Seasonal decomposition failed for {getattr(series, 'name', None)!r}; "
            f"returning the series unadjusted ({exc})",
            RuntimeWarning,
            stacklevel=2,
        )
        return series
    return decomposition.trend

# Apply seasonal adjustment to all columns, naming each result "<column>_adjusted"
adjusted_data = pd.DataFrame({
    f"{column}_adjusted": apply_seasonal_adjustment(data[column])
    for column in data.columns
})

# A successfully decomposed column carries NaN at both ends of its trend
# (statsmodels' default centred moving average is undefined there). Rows with
# any NaN are dropped so the cross-correlation, PCA and LASSO steps that
# follow receive complete observations only.
adjusted_data = adjusted_data.dropna()

print("Columns after adjustment:", adjusted_data.columns.tolist())
adjusted_data.head()

In [None]:
# Pairwise Spearman (rank) correlations between all adjusted series
corr_matrix = adjusted_data.corr(method='spearman')

# Annotated heatmap of the matrix, with the diverging palette centred on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Spearman Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

In [None]:
def compute_cross_correlations(data, target_col, max_lags=24, normalize=True):
    """Cross-correlate ``target_col`` with every other column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per series; all columns share the same index.
    target_col : str
        Column that every other column is correlated against.
    max_lags : int, default 24
        Results are reported for lags ``-max_lags .. +max_lags`` (in rows).
    normalize : bool, default True
        When True each series is standardised (mean 0, population standard
        deviation 1) first, so the lag-0 value equals the Pearson correlation
        and all values are comparable across series. When False the raw
        cross-products divided by ``len(data)`` are returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by lag, one column per non-target series. At lag ``k`` the
        value pairs ``target[t]`` with ``column[t - k]``, so a peak at a
        positive lag means the column leads the target. Lags that the sample
        is too short to provide, and series with zero or undefined variance
        (when normalising), are NaN.
    """
    lags = np.arange(-max_lags, max_lags + 1)
    other_cols = [column for column in data.columns if column != target_col]
    n_obs = len(data)
    if n_obs == 0:
        return pd.DataFrame(np.nan, index=lags, columns=other_cols)

    def _prepare(series):
        # Returns the array to correlate, or None when it cannot be standardised.
        values = np.asarray(series, dtype=float)
        if not normalize:
            return values
        spread = values.std()
        if not spread > 0:  # zero variance, or NaN in the data
            return None
        return (values - values.mean()) / spread

    # mode='full' yields one value for every lag in -(n_obs-1) .. (n_obs-1)
    available_lags = np.arange(-(n_obs - 1), n_obs)
    target_values = _prepare(data[target_col])

    cross_corrs = {}
    for column in other_cols:
        column_values = _prepare(data[column])
        if target_values is None or column_values is None:
            cross_corrs[column] = pd.Series(np.nan, index=lags)
            continue
        ccf = signal.correlate(target_values, column_values, mode='full') / n_obs
        # Label by lag and reindex rather than slicing by position: a sample
        # shorter than max_lags then yields NaN instead of a negative slice start.
        cross_corrs[column] = pd.Series(ccf, index=available_lags).reindex(lags)
    return pd.DataFrame(cross_corrs)

# Compute and plot cross-correlations
target_col = 'faculty_changes_adjusted'
cross_corrs = compute_cross_correlations(adjusted_data, target_col)

# DataFrame.plot() opens a figure of its own unless it is handed an axes, so
# the axes is created here and passed in explicitly; otherwise the sized
# figure stays empty and the lines land on a second, default-sized one.
ccf_fig, ccf_ax = plt.subplots(figsize=(12, 6))
cross_corrs.plot(ax=ccf_ax)
ccf_ax.set_title('Cross-correlations with Faculty Changes')
ccf_ax.set_xlabel('Lag (months)')
ccf_ax.set_ylabel('Correlation')
ccf_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ccf_fig.tight_layout()
plt.show()

In [None]:
# Dimension Reduction with PCA

# Prepare data for PCA: every adjusted series except the target, standardised
# column-wise so no series dominates through its units
X = adjusted_data.drop(columns=[target_col])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA; a fractional n_components keeps as many components as are needed
# to explain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with PCA results (columns PC1, PC2, ...)
pca_cols = [f'PC{i+1}' for i in range(X_pca.shape[1])]
pca_df = pd.DataFrame(X_pca, columns=pca_cols, index=adjusted_data.index)

# Plot cumulative explained variance. The x values start at 1 so each point
# sits at the number of components it represents (a bare plt.plot(y) would
# start at 0 and contradict the axis label).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 5))
plt.plot(component_counts, cumulative_variance, 'bo-')
plt.xticks(component_counts)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# LASSO feature selection: regress the adjusted target on the standardised predictors
y = adjusted_data[target_col]
lasso = Lasso(alpha=1.0)
lasso.fit(X_scaled, y)

# Rank predictors by the magnitude of their fitted coefficient, largest first
abs_coefficients = pd.Series(np.abs(lasso.coef_), index=X.columns)
feature_importance = abs_coefficients.sort_values(ascending=False)

# Bar chart of the ranking
plt.figure(figsize=(12, 6))
feature_importance.plot(kind='bar')
plt.xlabel('Features')
plt.ylabel('Absolute Coefficient Value')
plt.title('LASSO Feature Importance')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Gaussian-process regression of the target on the PCA scores.
# NOTE(review): X_pca comes from a scaler and PCA fitted on the full sample
# before any splitting, so the held-out folds below have already influenced
# the features -- confirm whether that leakage is acceptable.
X_selected = X_pca
y_selected = y

# RBF kernel with unit initial length scale; fixed seed for reproducibility
kernel = RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernel, random_state=42)

# Expanding-window cross-validation: each fold trains on the past and scores
# on the block that follows
tscv = TimeSeriesSplit(n_splits=5)
cv_scores, predictions, actual_values = [], [], []

for train_idx, test_idx in tscv.split(X_selected):
    X_train = X_selected[train_idx]
    X_test = X_selected[test_idx]
    y_train = y_selected.iloc[train_idx]
    y_test = y_selected.iloc[test_idx]

    gpr.fit(X_train, y_train)
    y_pred, y_std = gpr.predict(X_test, return_std=True)

    # Root-mean-squared error on this fold
    squared_errors = (y_test - y_pred) ** 2
    rmse = np.sqrt(np.mean(squared_errors))
    cv_scores.append(rmse)

    # Accumulate out-of-sample predictions and truths in fold order
    predictions.extend(y_pred)
    actual_values.extend(y_test)

# Overlay the out-of-sample predictions on the observed values
plt.figure(figsize=(12, 6))
for values, series_label in ((actual_values, 'Actual'), (predictions, 'Predicted')):
    plt.plot(values, label=series_label)
plt.title('Gaussian Process Regression Results')
plt.xlabel('Time')
plt.ylabel('Faculty Changes')
plt.legend()
plt.tight_layout()
plt.show()

print("Cross-validation RMSE scores:", cv_scores)
print("Mean RMSE:", np.mean(cv_scores))

In [None]:
# Per-fold performance table: one row per CV fold, with the across-fold mean
# and standard deviation of the RMSE repeated on every row
fold_labels = [f'Fold_{i+1}' for i in range(len(cv_scores))]
summary = pd.DataFrame(
    {
        'CV_RMSE': cv_scores,
        'Mean_RMSE': np.mean(cv_scores),
        'Std_RMSE': np.std(cv_scores),
    },
    index=fold_labels,
)

# Append a final row holding the column-wise averages
summary.loc['Average'] = summary.mean()

for line in ("Model Performance Summary", "------------------------", summary):
    print(line)

# Persist the table next to the notebook
summary.to_csv('model_performance_summary.csv')

SyntaxError: invalid syntax (652143441.py, line 2)