In [7]:
# Basic data manipulation and numerical operations
import numpy as np
import pandas as pd

# Statistical modeling and analysis
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.stattools import acf, pacf, ccf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tools.eval_measures import aic, bic

# Machine Learning tools
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import (train_test_split, 
                                   cross_val_score, 
                                   KFold, 
                                   TimeSeriesSplit,
                                   GridSearchCV,
                                   cross_validate)
from sklearn.metrics import (mean_squared_error, 
                           r2_score, 
                           make_scorer,
                           mean_absolute_error)

# Scientific computing and signal processing
from scipy import signal
from scipy.fft import fft, fftfreq
from scipy.stats import pearsonr
from scipy.interpolate import interp1d

# Visualization (you'll likely need these later)
import matplotlib.pyplot as plt
import seaborn as sns

# Warning control
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Read the data
df = pd.read_csv('../../data-collection/data_interpolated.csv')

# Convert the first column (year) to datetime
# If your years are decimal/float values (e.g., 2020.5), we'll handle that appropriately
df['year'] = pd.to_datetime(df['year'].astype(str).str.split('.').str[0], format='%Y')

# Set year as index for time series analysis
df.set_index('year', inplace=True)

# Sort index to ensure chronological order
df.sort_index(inplace=True)

# Display basic information about the dataset
print("Dataset Overview:")
print(df.info())
print("\nFirst few rows:")
print(df.head())
print("\nBasic statistics:")
print(df.describe())

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 112 entries, 1913-01-01 to 2024-01-01
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   faculty         53 non-null     float64
 1   NSF_awards      13 non-null     float64
 2   inflation_rate  45 non-null     float64
 3   Fed_Budget      42 non-null     float64
 4   PA_Budget_diff  42 non-null     float64
 5   GDP             75 non-null     float64
 6   CPI_inflation   112 non-null    float64
 7   Labor_BS        33 non-null     float64
 8   Labor_cond      33 non-null     float64
 9   Unemploy_BS     33 non-null     float64
 10  Unemploy        77 non-null     float64
dtypes: float64(11)
memory usage: 10.5 KB
None

First few rows:
            faculty  NSF_awards  inflation_rate  Fed_Budget  PA_Budget_diff  \
year                                                                          
1913-01-01      NaN         NaN             NaN    

In [11]:
def find_optimal_lags(df):
    y = df['faculty']
    lags = {}
    x_cols = [col for col in df.columns if col != 'faculty']
    
    for col in x_cols:
        # Get overlapping time periods
        common_dates = df.index[df[col].notna() & y.notna()]
        if len(common_dates) < 2:
            continue
            
        x_series = df.loc[common_dates, col]
        y_series = df.loc[common_dates, 'faculty']
        
        # Calculate correlations for different lags
        max_lag = min(len(common_dates) - 1, 10)
        correlations = []
        
        for lag in range(-max_lag, max_lag + 1):
            if lag > 0:
                corr = pearsonr(y_series[lag:], x_series[:-lag])[0]
            elif lag < 0:
                corr = pearsonr(y_series[:lag], x_series[-lag:])[0]
            else:
                corr = pearsonr(y_series, x_series)[0]
            correlations.append(corr)
            
        optimal_lag = range(-max_lag, max_lag + 1)[np.argmax(np.abs(correlations))]
        lags[col] = optimal_lag
        
    return lags

def create_lagged_df(df, lags):
    new_df = pd.DataFrame({'faculty': df['faculty']})
    
    for col, lag in lags.items():
        if lag > 0:
            new_series = pd.Series(
                df[col].values[:-lag],
                index=df.index[lag:],
                name=col
            )
        else:
            new_series = pd.Series(
                df[col].values[-lag:],
                index=df.index[:lag],
                name=col
            )
        new_df[col] = new_series
        
    return new_df.dropna()

In [13]:
optimal_lags = find_optimal_lags(df)
lagged_df = create_lagged_df(df, optimal_lags)
print(lagged_df)

              faculty  NSF_awards  inflation_rate  Fed_Budget  PA_Budget_diff  \
year                                                                            
2001-01-01  1113183.0     5460.18             4.2     685.174      -29568.623   
2002-01-01  1143388.0     5542.50             3.0   -4165.607      -25885.264   
2003-01-01  1173593.0     5361.64             3.0   -2561.005       -9722.378   
2004-01-01  1232009.5     6159.59             2.6   11326.765        6690.017   
2005-01-01  1290426.0     5678.03             2.8   44151.090       13306.615   
2006-01-01  1331006.5     6826.24             2.9   32034.744        8124.361   
2007-01-01  1371587.0     5905.22             2.3   70656.050       41567.139   
2008-01-01  1405330.5     8129.35             1.5   23441.344          82.744   
2009-01-01  1439074.0     6305.60             2.2   29905.319        6168.066   
2010-01-01  1481771.5     6734.81             3.4   45667.257       13106.644   
2011-01-01  1524469.0     67

In [15]:
lagged_df['faculty'].info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 11 entries, 2001-01-01 to 2011-01-01
Series name: faculty
Non-Null Count  Dtype  
--------------  -----  
11 non-null     float64
dtypes: float64(1)
memory usage: 176.0 bytes
