In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.arima.model import ARIMA
from scipy import stats

# Forecast the number of full-time faculty for TARGET_YEAR by decomposing the
# historical series with STL (trend + seasonal + residual), extrapolating each
# component separately, and recombining them.

TARGET_YEAR = 2024     # year reported in the printed summary
SEASONAL_PERIOD = 5    # assumed cycle length in years, adjust as needed

# Load the data
# names= relabels the two selected columns (sheet columns 0 and 2).
df = pd.read_excel('faculty_data.xlsx', header=3, usecols=[0, 2], names=['Year', 'Faculty'])
df.set_index('Year', inplace=True)

# Calculate year-over-year change
df['Change'] = df['Faculty'].diff()

# Perform STL decomposition
stl = STL(df['Faculty'], period=SEASONAL_PERIOD)
result = stl.fit()

# Extract components
df['Trend'] = result.trend
df['Seasonal'] = result.seasonal
df['Residual'] = result.resid

# Work out which years have to be forecast instead of assuming the data ends
# exactly two years before the target.
last_year = int(df.index[-1])
horizon = TARGET_YEAR - last_year
if horizon < 1:
    raise ValueError(
        f"Data already extends to {last_year}; nothing to forecast for {TARGET_YEAR}"
    )
forecast_years = list(range(last_year + 1, TARGET_YEAR + 1))
steps_ahead = np.arange(1, horizon + 1)

# Fit ARIMA model to the residuals
model = ARIMA(df['Residual'], order=(1,0,1))  # Adjust order as needed
model_fit = model.fit()

# Forecast residuals for the forecast years.
# The model is fitted on a plain integer year index, so the forecast comes back
# labelled by position rather than by year. Strip that index and relabel it,
# otherwise the addition below aligns on mismatched labels and yields NaN.
forecast_residuals = pd.Series(
    np.asarray(model_fit.forecast(steps=horizon)), index=forecast_years
)

# Extrapolate the trend linearly using its average yearly change
last_trend = df['Trend'].iloc[-1]
trend_change = df['Trend'].diff().mean()
forecast_trend = pd.Series(last_trend + steps_ahead * trend_change, index=forecast_years)

# Estimate seasonality by repeating the last full cycle.
# The first forecast year sits one full period after the FIRST entry of the
# last cycle, so index from the start of the cycle (wrapping if needed).
last_years_seasonality = df['Seasonal'].iloc[-SEASONAL_PERIOD:]
forecast_seasonal = pd.Series(
    last_years_seasonality.to_numpy()[(steps_ahead - 1) % SEASONAL_PERIOD],
    index=forecast_years,
)

# Combine components for final forecast (all three share the same year index)
forecast = forecast_trend + forecast_seasonal + forecast_residuals

# Calculate prediction intervals
# NOTE: this is a rough approximation. It scales the in-sample residual spread
# by sqrt(steps ahead) and ignores uncertainty in the trend and seasonal
# extrapolations.
residual_std = df['Residual'].std()
z_score = stats.norm.ppf(0.975)  # 95% confidence interval
prediction_interval = z_score * residual_std * np.sqrt(steps_ahead)

# The last entry of each forecast array corresponds to TARGET_YEAR
target_forecast = forecast.loc[TARGET_YEAR]
target_interval = prediction_interval[-1]

# Print results
print(f"Predicted number of full-time faculty in {TARGET_YEAR}:", round(target_forecast))
print("95% Prediction Interval:", 
      round(target_forecast - target_interval), "to", 
      round(target_forecast + target_interval))

# Calculate prediction uncertainty (interval half-width relative to the forecast)
uncertainty = (target_interval / target_forecast) * 100
print(f"Prediction uncertainty: {uncertainty:.2f}%")

# Optional: Plot the results
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
plt.plot(df.index, df['Faculty'], label='Actual')
plt.plot(df.index, df['Trend'], label='Trend')
plt.plot(forecast.index, forecast, 'r--', label='Forecast')
plt.fill_between(forecast.index, 
                 forecast - prediction_interval, 
                 forecast + prediction_interval, 
                 color='r', alpha=0.1, label='95% Prediction Interval')
plt.legend()
plt.title('Full-Time Faculty Time Series Analysis and Forecast')
plt.xlabel('Year')
plt.ylabel('Number of Full-Time Faculty')
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'faculty_data.xlsx'