In [1]:
# Basic data manipulation and numerical operations
import numpy as np
import pandas as pd

# Statistical modeling and analysis
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.stattools import acf, pacf, ccf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tools.eval_measures import aic, bic

# Machine Learning tools
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import (train_test_split, 
                                   cross_val_score, 
                                   KFold, 
                                   TimeSeriesSplit,
                                   GridSearchCV,
                                   cross_validate)
from sklearn.metrics import (mean_squared_error, 
                           r2_score, 
                           make_scorer,
                           mean_absolute_error)

# Scientific computing and signal processing
from scipy import signal
from scipy.fft import fft, fftfreq
from scipy.stats import pearsonr
from scipy.interpolate import interp1d

# Visualization (you'll likely need these later)
import matplotlib.pyplot as plt
import seaborn as sns

# Warning control
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the data
df = pd.read_csv('../../data-collection/data_interpolated.csv')

# Convert the first column (year) to datetime
# If your years are decimal/float values (e.g., 2020.5), we'll handle that appropriately
df['year'] = pd.to_datetime(df['year'].astype(str).str.split('.').str[0], format='%Y')

# Set year as index for time series analysis
df.set_index('year', inplace=True)

# Sort index to ensure chronological order
df.sort_index(inplace=True)

# Display basic information about the dataset
print("Dataset Overview:")
print(df.info())
print("\nFirst few rows:")
print(df.head())
print("\nBasic statistics:")
print(df.describe())

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 112 entries, 1913-01-01 to 2024-01-01
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   faculty         53 non-null     float64
 1   NSF_awards      13 non-null     float64
 2   inflation_rate  45 non-null     float64
 3   Fed_Budget      42 non-null     float64
 4   PA_Budget_diff  42 non-null     float64
 5   GDP             75 non-null     float64
 6   CPI_inflation   112 non-null    float64
 7   Labor_BS        33 non-null     float64
 8   Labor_cond      33 non-null     float64
 9   Unemploy_BS     33 non-null     float64
 10  Unemploy        77 non-null     float64
dtypes: float64(11)
memory usage: 10.5 KB
None

First few rows:
            faculty  NSF_awards  inflation_rate  Fed_Budget  PA_Budget_diff  \
year                                                                          
1913-01-01      NaN         NaN             NaN    

In [6]:
def find_optimal_lags(df):
    y = df['faculty']
    y_start = y.first_valid_index()
    y_end = y.last_valid_index()
    lags = {}
    x_cols = [col for col in df.columns if col != 'faculty']
    
    for col in x_cols:
        x = df[col]
        x_start = x.first_valid_index()
        x_end = x.last_valid_index()
        
        # Calculate the maximum possible lag range
        max_lag = min(15, (y_end - x_start).days // 365)  # Limit to 15 years or available range
        min_lag = max(-15, (y_start - x_end).days // 365)  # Negative lag means X leads Y
        
        correlations = []
        lag_values = range(min_lag, max_lag + 1)
        
        for lag in lag_values:
            # For each lag, align the series
            x_shifted = x.shift(lag)
            # Get overlapping period
            mask = y.notna() & x_shifted.notna()
            if mask.sum() < 2:  # Need at least 2 points for correlation
                correlations.append(-np.inf)
                continue
            
            corr = pearsonr(y[mask], x_shifted[mask])[0]
            correlations.append(corr)
        
        optimal_lag = lag_values[np.argmax(np.abs(correlations))]
        lags[col] = optimal_lag
        
    return lags

def create_lagged_df(df, lags):
    y = df['faculty']
    y_start = y.first_valid_index()
    y_end = y.last_valid_index()
    
    # Initialize new dataframe with Y
    new_df = pd.DataFrame({'faculty': y})
    
    # Add each X variable with its optimal lag
    for col, lag in lags.items():
        x = df[col]
        x_shifted = x.shift(lag)
        
        # Get the valid range for this X variable after shifting
        x_valid_start = x_shifted.first_valid_index()
        x_valid_end = x_shifted.last_valid_index()
        
        # Add the shifted series to the new dataframe
        new_df[col] = x_shifted
        
        # Print information about the shift
        print(f"\nVariable: {col}")
        print(f"Optimal lag: {lag} years")
        print(f"Original X range: {x.first_valid_index()} to {x.last_valid_index()}")
        print(f"Shifted X range: {x_valid_start} to {x_valid_end}")
        
    print(f"\nY range: {y_start} to {y_end}")
    
    return new_df

# Example usage:
optimal_lags = find_optimal_lags(df)
lagged_df = create_lagged_df(df, optimal_lags)

# Print summary of what was done
print("\nOptimal lags found:")
for var, lag in optimal_lags.items():
    print(f"{var}: {lag} years")


Variable: NSF_awards
Optimal lag: 11 years
Original X range: 2011-01-01 00:00:00 to 2023-01-01 00:00:00
Shifted X range: 2022-01-01 00:00:00 to 2024-01-01 00:00:00

Variable: inflation_rate
Optimal lag: 15 years
Original X range: 1980-01-01 00:00:00 to 2024-01-01 00:00:00
Shifted X range: 1995-01-01 00:00:00 to 2024-01-01 00:00:00

Variable: Fed_Budget
Optimal lag: -10 years
Original X range: 1980-01-01 00:00:00 to 2021-01-01 00:00:00
Shifted X range: 1970-01-01 00:00:00 to 2011-01-01 00:00:00

Variable: PA_Budget_diff
Optimal lag: -13 years
Original X range: 1980-01-01 00:00:00 to 2021-01-01 00:00:00
Shifted X range: 1967-01-01 00:00:00 to 2008-01-01 00:00:00

Variable: GDP
Optimal lag: -13 years
Original X range: 1950-01-01 00:00:00 to 2024-01-01 00:00:00
Shifted X range: 1937-01-01 00:00:00 to 2011-01-01 00:00:00

Variable: CPI_inflation
Optimal lag: -15 years
Original X range: 1913-01-01 00:00:00 to 2024-01-01 00:00:00
Shifted X range: 1913-01-01 00:00:00 to 2009-01-01 00:00:00

V

In [7]:
lagged_df

Unnamed: 0_level_0,faculty,NSF_awards,inflation_rate,Fed_Budget,PA_Budget_diff,GDP,CPI_inflation,Labor_BS,Labor_cond,Unemploy_BS,Unemploy
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1913-01-01,,,,,,,17.158333,,,,
1914-01-01,,,,,,,17.158333,,,,
1915-01-01,,,,,,,16.700000,,,,
1916-01-01,,,,,,,15.208333,,,,
1917-01-01,,,,,,,13.641667,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01,1489565.0,,3.4,,,,,,0.339063,2.316667,
2021-01-01,1499236.0,,3.2,,,,,,0.607713,2.033333,
2022-01-01,1507641.0,5460.18,2.9,,,,,,0.462120,2.016667,
2023-01-01,,5542.50,3.8,,,,,,-0.413846,2.566667,


In [5]:
lagged_df['faculty'].info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 11 entries, 2001-01-01 to 2011-01-01
Series name: faculty
Non-Null Count  Dtype  
--------------  -----  
11 non-null     float64
dtypes: float64(1)
memory usage: 176.0 bytes
