In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('academic.csv')

# Convert 'year' to datetime format
df['year'] = pd.to_datetime(df['year'], format='%Y/%m')

# Set 'year' as the index
df.set_index('year', inplace=True)

# Display the first few rows
df.head()


ValueError: unconverted data remains when parsing with format "%Y/%m": "9", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
# Check the unique values in the 'year' column
print(df['year'].unique())


['1948/49' '1949/50' '1950/51' '1951/52' '1952/53' '1953/54' '1954/55'
 '1955/56' '1956/57' '1957/58' '1958/59' '1959/60' '1960/61' '1961/62'
 '1962/63' '1963/64' '1964/65' '1965/66' '1966/67' '1967/68' '1968/69'
 '1969/70' '1970/71' '1971/72' '1972/73' '1973/74' '1974/75' '1975/76'
 '1976/77' '1977/78' '1978/79' '1979/80' '1980/81' '1981/82' '1982/83'
 '1983/84' '1984/85' '1985/86' '1986/87' '1987/88' '1988/89' '1989/90'
 '1990/91' '1991/92' '1992/93' '1993/94' '1994/95' '1995/96' '1996/97'
 '1997/98' '1998/99' '1999/00' '2000/01' '2001/02' '2002/03' '2003/04'
 '2004/05' '2005/06' '2006/07' '2007/08' '2008/09' '2009/10' '2010/11'
 '2011/12' '2012/13' '2013/14' '2014/15' '2015/16' '2016/17' '2017/18'
 '2018/19' '2019/20' '2020/21' '2021/22' '2022/23']


In [None]:
# Extract the first part of the year (before '/')
df['year'] = df['year'].apply(lambda x: x.split('/')[0])

# Convert the 'year' column to datetime
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Check the conversion
print(df['year'].head())


0   1948-01-01
1   1949-01-01
2   1950-01-01
3   1951-01-01
4   1952-01-01
Name: year, dtype: datetime64[ns]


In [None]:
from sklearn.model_selection import train_test_split

# Select the column to forecast (e.g., 'students')
column_to_forecast = 'students'

# Split the data into training and test sets
train, test = train_test_split(df[[column_to_forecast, 'year']], test_size=0.2, shuffle=False)

# Check the training and testing data
train.head(), test.head()


(   students       year
 0     25464 1948-01-01
 1     26433 1949-01-01
 2     29813 1950-01-01
 3     30462 1951-01-01
 4     33675 1952-01-01,
     students       year
 60    671616 2008-01-01
 61    690923 2009-01-01
 62    723277 2010-01-01
 63    764495 2011-01-01
 64    819644 2012-01-01)

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Fit an ARIMA model (adjust the p, d, q parameters based on your dataset)
model = ARIMA(train[column_to_forecast], order=(5, 1, 0))
model_fit = model.fit()

# Make predictions on the test set
forecast = model_fit.forecast(steps=len(test))

# Create a dataframe with the predictions
test['forecast'] = forecast

# Compare the actual vs predicted values
print(test[['year', column_to_forecast, 'forecast']])


         year  students       forecast
60 2008-01-01    671616  657468.491349
61 2009-01-01    690923  681820.235117
62 2010-01-01    723277  695092.283311
63 2011-01-01    764495  700617.114310
64 2012-01-01    819644  706530.816333
65 2013-01-01    886052  714097.514930
66 2014-01-01    974926  723386.335996
67 2015-01-01   1043839  732759.984377
68 2016-01-01   1078822  740244.513025
69 2017-01-01   1094792  745717.225680
70 2018-01-01   1095299  749613.481339
71 2019-01-01   1075496  752759.187144
72 2020-01-01    914095  755796.499988
73 2021-01-01    948519  758805.829149
74 2022-01-01   1057188  761653.525070


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate evaluation metrics
mae = mean_absolute_error(test[column_to_forecast], test['forecast'])
mse = mean_squared_error(test[column_to_forecast], test['forecast'])

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')


Mean Absolute Error (MAE): 197507.99752548567
Mean Squared Error (MSE): 54406575850.52158


In [None]:
# Forecast future values (next 5 years, for example)
future_steps = 5
future_forecast = model_fit.forecast(steps=future_steps)

print(f'Future Forecast for the next {future_steps} years: {future_forecast}')


Future Forecast for the next 5 years: 60    657468.491349
61    681820.235117
62    695092.283311
63    700617.114310
64    706530.816333
Name: predicted_mean, dtype: float64


In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error

# List of columns to forecast
columns_to_forecast = ['students', 'us_students', 'undergraduate', 'postgraduate', 'total_students']  # Add more columns if needed

# Dictionary to store the forecasts and errors for each column
forecast_results = {}

for column in columns_to_forecast:
    print(f"Forecasting column: {column}")

    # Split the data into training and test sets
    train, test = train_test_split(df[[column, 'year']], test_size=0.2, shuffle=False)

    # Train the ARIMA model
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Make predictions on the test set
    forecast = model_fit.forecast(steps=len(test))

    # Add the forecasted values to the test data
    test['forecast'] = forecast

    # Calculate evaluation metrics
    mae = mean_absolute_error(test[column], test['forecast'])
    mse = mean_squared_error(test[column], test['forecast'])

    # Store the results for each column
    forecast_results[column] = {
        'actual': test[column],
        'forecast': test['forecast'],
        'MAE': mae,
        'MSE': mse
    }

    # Print out the actual vs forecast values for the current column
    print(test[['year', column, 'forecast']])
    print(f"Mean Absolute Error (MAE) for {column}: {mae}")
    print(f"Mean Squared Error (MSE) for {column}: {mse}\n")

# Optionally, store the forecast results as a DataFrame
all_forecasts = {col: pd.DataFrame(result) for col, result in forecast_results.items()}


Forecasting column: students
         year  students       forecast
60 2008-01-01    671616  657468.491349
61 2009-01-01    690923  681820.235117
62 2010-01-01    723277  695092.283311
63 2011-01-01    764495  700617.114310
64 2012-01-01    819644  706530.816333
65 2013-01-01    886052  714097.514930
66 2014-01-01    974926  723386.335996
67 2015-01-01   1043839  732759.984377
68 2016-01-01   1078822  740244.513025
69 2017-01-01   1094792  745717.225680
70 2018-01-01   1095299  749613.481339
71 2019-01-01   1075496  752759.187144
72 2020-01-01    914095  755796.499988
73 2021-01-01    948519  758805.829149
74 2022-01-01   1057188  761653.525070
Mean Absolute Error (MAE) for students: 197507.99752548567
Mean Squared Error (MSE) for students: 54406575850.52158

Forecasting column: us_students
         year  us_students      forecast
60 2008-01-01   19103000.0  1.849243e+07
61 2009-01-01   20428000.0  1.869809e+07
62 2010-01-01   20550000.0  1.880833e+07
63 2011-01-01   20625000.0  1.8904

KeyError: "['postgraduate'] not in index"

In [None]:
# Print the column names to ensure 'postgraduate' exists
print(df.columns)


Index(['year', 'students', 'us_students', 'undergraduate', 'graduate',
       'non_degree', 'opt'],
      dtype='object')


In [None]:
columns_to_forecast = ['students', 'us_students', 'undergraduate', 'graduate', 'non_degree', 'opt']

forecast_results = {}

for column in columns_to_forecast:
    if column not in df.columns:
        print(f"Column {column} is not in the dataset. Skipping forecasting for this column.")
        continue

    print(f"Forecasting column: {column}")

    # Split the data into training and test sets
    train, test = train_test_split(df[[column, 'year']], test_size=0.2, shuffle=False)

    # Train the ARIMA model
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Make predictions on the test set
    forecast = model_fit.forecast(steps=len(test))

    # Add the forecasted values to the test data
    test['forecast'] = forecast

    # Calculate evaluation metrics
    mae = mean_absolute_error(test[column], test['forecast'])
    mse = mean_squared_error(test[column], test['forecast'])

    # Store the results for each column
    forecast_results[column] = {
        'actual': test[column],
        'forecast': test['forecast'],
        'MAE': mae,
        'MSE': mse
    }

    # Print out the actual vs forecast values for the current column
    print(test[['year', column, 'forecast']])
    print(f"Mean Absolute Error (MAE) for {column}: {mae}")
    print(f"Mean Squared Error (MSE) for {column}: {mse}\n")


Forecasting column: students
         year  students       forecast
60 2008-01-01    671616  657468.491349
61 2009-01-01    690923  681820.235117
62 2010-01-01    723277  695092.283311
63 2011-01-01    764495  700617.114310
64 2012-01-01    819644  706530.816333
65 2013-01-01    886052  714097.514930
66 2014-01-01    974926  723386.335996
67 2015-01-01   1043839  732759.984377
68 2016-01-01   1078822  740244.513025
69 2017-01-01   1094792  745717.225680
70 2018-01-01   1095299  749613.481339
71 2019-01-01   1075496  752759.187144
72 2020-01-01    914095  755796.499988
73 2021-01-01    948519  758805.829149
74 2022-01-01   1057188  761653.525070
Mean Absolute Error (MAE) for students: 197507.99752548567
Mean Squared Error (MSE) for students: 54406575850.52158

Forecasting column: us_students
         year  us_students      forecast
60 2008-01-01   19103000.0  1.849243e+07
61 2009-01-01   20428000.0  1.869809e+07
62 2010-01-01   20550000.0  1.880833e+07
63 2011-01-01   20625000.0  1.8904

  warn('Non-stationary starting autoregressive parameters'


         year       opt       forecast
60 2008-01-01   66601.0   58678.893746
61 2009-01-01   67804.0   64715.904826
62 2010-01-01   76031.0   74018.894083
63 2011-01-01   85157.0   78225.333659
64 2012-01-01   94919.0   84718.533117
65 2013-01-01  105997.0   91223.182992
66 2014-01-01  120287.0   96350.152064
67 2015-01-01  147498.0  102295.686062
68 2016-01-01  175695.0  107795.895340
69 2017-01-01  203462.0  112977.533735
70 2018-01-01  223085.0  118310.295761
71 2019-01-01  223539.0  123329.431427
72 2020-01-01  203885.0  128239.602420
73 2021-01-01  184759.0  133082.067744
74 2022-01-01  198793.0  137733.810590
Mean Absolute Error (MAE) for opt: 44387.78549551738
Mean Squared Error (MSE) for opt: 3266753945.354979



In [None]:
# Example of how you would generate forecasted values for each column (ensure these are defined first)

# Assuming 'forecast_students', 'forecast_us_students', etc., are the forecasted values you calculated

# For example:
forecast_students = students_forecasted_values  # Replace with actual forecasted values
forecast_us_students = us_students_forecasted_values  # Replace with actual forecasted values
forecast_undergraduate = undergraduate_forecasted_values  # Replace with actual forecasted values
forecast_graduate = graduate_forecasted_values  # Replace with actual forecasted values
forecast_non_degree = non_degree_forecasted_values  # Replace with actual forecasted values
forecast_opt = opt_forecasted_values  # Replace with actual forecasted values

# Create the DataFrame for forecasted values
forecast_df = pd.DataFrame({
    'year': df['year'],  # Original year column (replace 'df' with your DataFrame name)
    'students_forecast': forecast_students,
    'us_students_forecast': forecast_us_students,
    'undergraduate_forecast': forecast_undergraduate,
    'graduate_forecast': forecast_graduate,
    'non_degree_forecast': forecast_non_degree,
    'opt_forecast': forecast_opt
})

# Save the DataFrame to a CSV file
forecast_df.to_csv('forecasted_values.csv', index=False)

print("Forecasted values saved to 'forecasted_values.csv'")


NameError: name 'students_forecasted_values' is not defined

In [None]:
# Prepare an empty list to store forecasted values for each column
forecasted_values = {'year': df['year'][-len(test):]}  # Get the last 'len(test)' years for forecasting

# Loop over each column to get the forecasted values
for column in columns_to_forecast:
    if column not in df.columns:
        print(f"Column {column} is not in the dataset. Skipping forecasting for this column.")
        continue

    print(f"Forecasting column: {column}")

    # Split the data into training and test sets
    train, test = train_test_split(df[[column, 'year']], test_size=0.2, shuffle=False)

    # Train the ARIMA model
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Make predictions on the test set
    forecast = model_fit.forecast(steps=len(test))

    # Add the forecasted values to the test data
    test['forecast'] = forecast

    # Store the forecasted values for the current column
    forecasted_values[f'{column}_forecast'] = test['forecast'].values

# Convert the forecasted values into a DataFrame
forecast_df = pd.DataFrame(forecasted_values)

# Save the DataFrame to a CSV file
forecast_df.to_csv('forecasted_values.csv', index=False)

print("Forecasted values saved to 'forecasted_values.csv'")


Forecasting column: students
Forecasting column: us_students
Forecasting column: undergraduate
Forecasting column: graduate
Forecasting column: non_degree
Forecasting column: opt
Forecasted values saved to 'forecasted_values.csv'


  warn('Non-stationary starting autoregressive parameters'


In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# List of columns to forecast
columns_to_forecast = ['students', 'us_students', 'undergraduate', 'graduate', 'non_degree', 'opt']

# Prepare an empty dictionary to store the forecasted values
forecasted_values = {}

# Convert the 'year' column to integer if it's a datetime
df['year'] = df['year'].dt.year if pd.api.types.is_datetime64_any_dtype(df['year']) else df['year']

# Loop over each column to get the forecasted values
for column in columns_to_forecast:
    if column not in df.columns:
        print(f"Column {column} is not in the dataset. Skipping forecasting for this column.")
        continue

    print(f"Forecasting column: {column}")

    # Use the historical data for training
    train = df[['year', column]]

    # Train the ARIMA model on the entire historical dataset
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Forecast the next 10 years
    forecast = model_fit.forecast(steps=10)  # Forecast the next 10 years

    # Get the last year in the dataset
    last_year = df['year'].iloc[-1]

    # Generate the next 10 years
    future_years = np.arange(last_year + 1, last_year + 11)  # Future years as integers

    # Combine the original and forecasted data
    forecasted_values[column] = np.concatenate([df[column].values, forecast])

    # Store the forecasted values for the 'year' column
    if column == 'students':  # You can change this check based on the first column you'd like to add
        forecasted_values['year'] = np.concatenate([df['year'].values, future_years])

# Convert the forecasted values into a DataFrame
forecast_df = pd.DataFrame(forecasted_values)

# Save the DataFrame to a CSV file
forecast_df.to_csv('forecasted_values_combined.csv', index=False)

print("Forecasted values (original + next 10 years) saved to 'forecasted_values_combined.csv'")


Forecasting column: students
Forecasting column: us_students
Forecasting column: undergraduate
Forecasting column: graduate
Forecasting column: non_degree
Forecasting column: opt
Forecasted values (original + next 10 years) saved to 'forecasted_values_combined.csv'


  warn('Non-stationary starting autoregressive parameters'


In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# List of columns to forecast
columns_to_forecast = ['students', 'us_students', 'undergraduate', 'graduate', 'non_degree', 'opt']

# Prepare an empty dictionary to store the forecasted values
forecasted_values = {}

# Convert the 'year' column to integer if it's a datetime
df['year'] = df['year'].dt.year if pd.api.types.is_datetime64_any_dtype(df['year']) else df['year']

# Loop over each column to get the forecasted values
for column in columns_to_forecast:
    if column not in df.columns:
        print(f"Column {column} is not in the dataset. Skipping forecasting for this column.")
        continue

    print(f"Forecasting column: {column}")

    # Use the historical data for training
    train = df[['year', column]]

    # Train the ARIMA model on the entire historical dataset
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Forecast the next 10 years
    forecast = model_fit.forecast(steps=10)  # Forecast the next 10 years

    # Get the last year in the dataset
    last_year = df['year'].iloc[-1]

    # Generate the next 10 years
    future_years = np.arange(last_year + 1, last_year + 11)  # Future years as integers

    # Combine the original and forecasted data for each column
    forecasted_values[column] = np.concatenate([df[column].values, forecast])

    # Store the forecasted values for the 'year' column
    if column == 'students':  # You can change this check based on the first column you'd like to add
        forecasted_values['year'] = np.concatenate([df['year'].values, future_years])

# Now, create separate columns for original and forecasted data
for column in columns_to_forecast:
    forecasted_values[f'{column}_forecasted'] = forecasted_values.pop(column)

# Convert the forecasted values into a DataFrame
forecast_df = pd.DataFrame(forecasted_values)

# Save the DataFrame to a CSV file
forecast_df.to_csv('forecasted_values_separated.csv', index=False)

print("Forecasted values (original + next 10 years) saved to 'forecasted_values_separated.csv'")


Forecasting column: students
Forecasting column: us_students
Forecasting column: undergraduate
Forecasting column: graduate
Forecasting column: non_degree
Forecasting column: opt
Forecasted values (original + next 10 years) saved to 'forecasted_values_separated.csv'


  warn('Non-stationary starting autoregressive parameters'


In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

# Load the dataset
df = pd.read_csv('status.csv')

# Clean the 'year' column (remove any unwanted characters or extra data)
df['year'] = df['year'].str.replace(r'[^0-9/]', '', regex=True)

# Convert the 'year' column to datetime format (if it's not already)
df['year'] = pd.to_datetime(df['year'], format='%Y/%m', errors='coerce')

# Check for any invalid rows and drop them
df = df.dropna(subset=['year'])

# List of columns to forecast
columns_to_forecast = ['female', 'male', 'single', 'married', 'full_time', 'part_time', 'visa_f', 'visa_j', 'visa_other']

forecasted_values = []

# Loop over each column for forecasting
for column in columns_to_forecast:
    print(f"Forecasting column: {column}")

    # Split the data into training and test sets
    train = df[[column, 'year']].dropna()

    # Train the ARIMA model
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Make predictions for the next 10 years
    forecast = model_fit.forecast(steps=10)

    # Create a future years array
    future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')

    # Store the forecasted values
    forecasted_values.append(forecast)

    # Print the forecasted values for this column
    print(f"Forecast for the next 10 years ({column}):")
    for year, value in zip(future_years, forecast):
        print(f"{year.strftime('%Y')}: {value}")

# Combine the forecasted values into a new DataFrame
forecast_df = pd.DataFrame({
    'year': future_years,
    'female_forecast': forecasted_values[0],
    'male_forecast': forecasted_values[1],
    'single_forecast': forecasted_values[2],
    'married_forecast': forecasted_values[3],
    'full_time_forecast': forecasted_values[4],
    'part_time_forecast': forecasted_values[5],
    'visa_f_forecast': forecasted_values[6],
    'visa_j_forecast': forecasted_values[7],
    'visa_other_forecast': forecasted_values[8]
})

# Append the forecasted data to the original data
final_df = pd.concat([df, forecast_df], ignore_index=True)

# Save the final DataFrame to a CSV file
final_df.to_csv('forecasted_status.csv', index=False)

print("Forecasted values saved to 'forecasted_status.csv'")


Forecasting column: female
Forecast for the next 10 years (female):
2012: 341685.5185286674
2013: 342853.44905913516
2014: 344289.88407646696
2015: 345361.05373091024
2016: 345665.3869129877
2017: 345816.67425225774
2018: 345948.2140012368
2019: 346027.4035283656
2020: 346056.8203942431
2021: 346072.93399815605
Forecasting column: male
Forecast for the next 10 years (male):
2012: 430200.3525095821
2013: 433200.8450867899
2014: 436096.8679325571
2015: 438311.32489161356
2016: 438988.29514966876
2017: 439450.7893220483
2018: 439809.4175382087
2019: 440031.69421891216
2020: 440126.1784425219
2021: 440188.649064232
Forecasting column: single


  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), pe

Forecast for the next 10 years (single):
2012: 698473.742251755
2013: 703831.7936829912
2014: 708819.5173234596
2015: 711895.011745204
2016: 713062.4681168086
2017: 713844.4453127778
2018: 714403.6314073072
2019: 714712.6181787958
2020: 714865.752532087
2021: 714964.2172382865
Forecasting column: married
Forecast for the next 10 years (married):
2012: 74856.51811258856
2013: 74770.63299200126
2014: 74021.93569192979
2015: 74248.61117099866
2016: 74339.2874769377
2017: 74379.16460713008
2018: 74312.66738961436
2019: 74316.6046352689
2020: 74322.64382930257
2021: 74329.89236005089
Forecasting column: full_time
Forecast for the next 10 years (full_time):
2012: 721502.0359649773
2013: 726633.8442514226
2014: 731491.5552660194
2015: 735518.0885926391
2016: 736684.7242836307
2017: 737477.8806790399
2018: 738096.154284786
2019: 738501.0478716099
2020: 738665.2719946385
2021: 738773.5951628804
Forecasting column: part_time
Forecast for the next 10 years (part_time):
2012: 47186.385294494656
20

  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=df['year'].iloc[-1] + pd.DateOffset(years=1), periods=10, freq='Y')


In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

# Load the dataset
df = pd.read_csv('status.csv')

# Clean the 'year' column (remove any unwanted characters or extra data)
df['year'] = df['year'].str.replace(r'[^0-9/]', '', regex=True)

# Convert the 'year' column to datetime format (if it's not already)
df['year'] = pd.to_datetime(df['year'], format='%Y/%m', errors='coerce')

# Check for any invalid rows and drop them
df = df.dropna(subset=['year'])

# List of columns to forecast
columns_to_forecast = ['female', 'male', 'single', 'married', 'full_time', 'part_time', 'visa_f', 'visa_j', 'visa_other']

forecasted_values = []

# Loop over each column for forecasting
for column in columns_to_forecast:
    print(f"Forecasting column: {column}")

    # Split the data into training and test sets
    train = df[[column, 'year']].dropna()

    # Train the ARIMA model
    model = ARIMA(train[column], order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()

    # Make predictions for the next 10 years (to 2032)
    forecast = model_fit.forecast(steps=12)  # Forecasting for 12 steps (for the next 12 years, including 2022 to 2032)

    # Create a future years array starting from the next year after the last year in the dataset
    last_year = df['year'].iloc[-1].year
    future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032

    # Store the forecasted values
    forecasted_values.append(forecast)

    # Print the forecasted values for this column
    print(f"Forecast for the next 12 years ({column}):")
    for year, value in zip(future_years, forecast):
        print(f"{year}: {value}")

# Create a new DataFrame for the forecasted values
forecast_df = pd.DataFrame({
    'year': future_years,
    'female_forecast': forecasted_values[0],
    'male_forecast': forecasted_values[1],
    'single_forecast': forecasted_values[2],
    'married_forecast': forecasted_values[3],
    'full_time_forecast': forecasted_values[4],
    'part_time_forecast': forecasted_values[5],
    'visa_f_forecast': forecasted_values[6],
    'visa_j_forecast': forecasted_values[7],
    'visa_other_forecast': forecasted_values[8]
})

# Append the forecasted values to the original DataFrame
final_df = pd.concat([df, forecast_df], ignore_index=True)

# Save the final DataFrame to a CSV file
final_df.to_csv('forecasted_status_with_existing_data.csv', index=False)

print("Forecasted values and existing data saved to 'forecasted_status_with_existing_data.csv'")


Forecasting column: female
Forecast for the next 12 years (female):
2012: 341685.5185286674
2013: 342853.44905913516
2014: 344289.88407646696
2015: 345361.05373091024
2016: 345665.3869129877
2017: 345816.67425225774
2018: 345948.2140012368
2019: 346027.4035283656
2020: 346056.8203942431
2021: 346072.93399815605
2022: 346084.4226221949
2023: 346090.74633243354
Forecasting column: male
Forecast for the next 12 years (male):
2012: 430200.3525095821
2013: 433200.8450867899
2014: 436096.8679325571
2015: 438311.32489161356
2016: 438988.29514966876
2017: 439450.7893220483
2018: 439809.4175382087
2019: 440031.69421891216
2020: 440126.1784425219
2021: 440188.649064232
2022: 440231.13970696635
2023: 440255.6807201627
Forecasting column: single
Forecast for the next 12 years (single):
2012: 698473.742251755
2013: 703831.7936829912
2014: 708819.5173234596
2015: 711895.011745204
2016: 713062.4681168086
2017: 713844.4453127778
2018: 714403.6314073072
2019: 714712.6181787958
2020: 714865.752532087
20

  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'


Forecast for the next 12 years (part_time):
2012: 47186.385294494656
2013: 47869.066656708914
2014: 48141.36645944755
2015: 48167.31827737925
2016: 48626.80145861174
2017: 48694.514525009734
2018: 48585.49091035556
2019: 48593.80484300373
2020: 48567.39006011887
2021: 48536.10561772746
2022: 48548.05397632631
2023: 48553.597161090365
Forecasting column: visa_f
Forecast for the next 12 years (visa_f):
2012: 696217.8284755218
2013: 701417.9904145868
2014: 706576.9666467314
2015: 710568.6807443756
2016: 711767.7639743121
2017: 712605.603405805
2018: 713269.4814947724
2019: 713684.1731603119
2020: 713859.0789464889
2021: 713977.0498030154
2022: 714058.6550588236
2023: 714106.0529556664
Forecasting column: visa_j
Forecast for the next 12 years (visa_j):
2012: 42176.77902946069
2013: 42192.19063282401
2014: 42343.90517709572
2015: 42403.58118094237
2016: 42402.49098650359
2017: 42409.90233950435
2018: 42418.79481929746
2019: 42420.43384667474
2020: 42420.479589564675
2021: 42421.21340065697


  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032
  warn('Too few observations to estimate starting parameters%s.'
  future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=12, freq='Y').year  # Generate years until 2032


In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Check the columns to see if 'year' exists
print(df.columns)

# Ensure 'year' is in datetime format and extract the year (if needed)
df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year

# Forecasting for the next 9 years (from 2024 to 2032)
last_year = df['year'].max()  # Dynamically set last year from the dataset
future_years = pd.date_range(start=f'{last_year + 1}-01-01', periods=9, freq='YE-DEC').year  # Use 'YE-DEC' to avoid deprecation warning

# Example: Forecasting column 'married'
y = df['married']  # The column you want to forecast

# Fit the SARIMAX model (adjust parameters as necessary)
model = SARIMAX(y, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # Example model, modify as needed
model_fit = model.fit(disp=False)

# Forecast for the next 9 years (2024 to 2032)
forecast = model_fit.get_forecast(steps=9)  # Forecast for the next 9 years
forecast_values = forecast.predicted_mean

# Combine the existing data with the forecasted values
forecast_df = pd.DataFrame({
    'Year': future_years,
    'Forecasted_married': forecast_values
})

# Optionally, merge it back with your existing dataset if needed
existing_data = pd.DataFrame({
    'Year': df['year'],  # Use 'year' column here
    'Married': df['married']
})
full_data = pd.concat([existing_data, forecast_df])

# Print the forecasted values
print(full_data)

# Save the forecasted data to a CSV file
full_data.to_csv('forecasted_data.csv', index=False)


Index(['year', 'female', 'male', 'single', 'married', 'full_time', 'part_time',
       'visa_f', 'visa_j', 'visa_other'],
      dtype='object')
    Year  Married  Forecasted_married
0   2007  79847.0                 NaN
1   2008  79922.0                 NaN
2   2009  75311.0                 NaN
3   2010  69435.0                 NaN
4   2011  74156.0                 NaN
5   2012      NaN        74156.000000
6   2013      NaN        74156.000000
7   2014      NaN        74156.000000
8   2015      NaN        74156.000000
9   2016      NaN        74156.000000
10  2017      NaN        74156.000000
11  2018      NaN        74156.000000
12  2019      NaN       114079.519962
13  2020      NaN       114154.519962


  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'


In [None]:
print(df.columns)


Index(['year', 'female', 'male', 'single', 'married', 'full_time', 'part_time',
       'visa_f', 'visa_j', 'visa_other'],
      dtype='object')


In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Check the columns to see if 'year' exists
print(df.columns)

# Ensure 'year' is in datetime format and extract the year (if needed)
df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year

# Get the last year from your dataset
last_year = df['year'].max()  # Dynamically set last year from the dataset

# Forecasting for the next years (from the current year until 2032)
future_years = pd.date_range(start=f'{last_year + 1}-01-01', end='2032-12-31', freq='A-DEC').year

# Example: Forecasting column 'married'
y = df['married']  # The column you want to forecast

# Fit the SARIMAX model (adjust parameters as necessary)
model = SARIMAX(y, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # Example model, modify as needed
model_fit = model.fit(disp=False)

# Forecast for the years until 2032
forecast = model_fit.get_forecast(steps=len(future_years))  # Forecast until 2032
forecast_values = forecast.predicted_mean

# Combine the existing data with the forecasted values
forecast_df = pd.DataFrame({
    'Year': future_years,
    'Forecasted_married': forecast_values
})

# Optionally, merge it back with your existing dataset if needed
existing_data = pd.DataFrame({
    'Year': df['year'],  # Use 'year' column here
    'Married': df['married']
})
full_data = pd.concat([existing_data, forecast_df])

# Print the forecasted values
print(full_data)

# Save the forecasted data to a CSV file
full_data.to_csv('forecasted_data_until_2032.csv', index=False)


Index(['year', 'female', 'male', 'single', 'married', 'full_time', 'part_time',
       'visa_f', 'visa_j', 'visa_other'],
      dtype='object')
    Year  Married  Forecasted_married
0   2007  79847.0                 NaN
1   2008  79922.0                 NaN
2   2009  75311.0                 NaN
3   2010  69435.0                 NaN
4   2011  74156.0                 NaN
5   2012      NaN        74156.000000
6   2013      NaN        74156.000000
7   2014      NaN        74156.000000
8   2015      NaN        74156.000000
9   2016      NaN        74156.000000
10  2017      NaN        74156.000000
11  2018      NaN        74156.000000
12  2019      NaN       114079.519962
13  2020      NaN       114154.519962
14  2021      NaN       109543.519962
15  2022      NaN       103667.519962
16  2023      NaN       108388.519962
17  2024      NaN       108388.519962
18  2025      NaN       108388.519962
19  2026      NaN       108388.519962
20  2027      NaN       108388.519962
21  2028      NaN   

  future_years = pd.date_range(start=f'{last_year + 1}-01-01', end='2032-12-31', freq='A-DEC').year
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'


In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Check the columns to see if 'year' exists
print(df.columns)

# Ensure 'year' is in datetime format and extract the year (if needed)
df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year

# Get the last year from your dataset
last_year = df['year'].max()  # Dynamically set last year from the dataset

# Forecasting for the next years (from the current year until 2032)
future_years = pd.date_range(start=f'{last_year + 1}-01-01', end='2032-12-31', freq='A-DEC').year

# Prepare a DataFrame to store the forecasted data
forecasted_data = pd.DataFrame({'Year': future_years})

# Loop through each column and forecast the data
for column in df.columns:
    if column != 'year':  # Skip the 'year' column
        y = df[column]  # Get the column to forecast

        # Fit the SARIMAX model (adjust parameters as necessary)
        model = SARIMAX(y, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # Example model, modify as needed
        model_fit = model.fit(disp=False)

        # Forecast for the years until 2032
        forecast = model_fit.get_forecast(steps=len(future_years))  # Forecast until 2032
        forecast_values = forecast.predicted_mean

        # Add the forecasted values as a new column to the forecasted_data DataFrame
        forecasted_data[column] = forecast_values

# Optionally, merge it back with your existing dataset if needed
existing_data = df[['year']]  # Use 'year' column here
full_data = pd.concat([existing_data, forecasted_data], axis=1)

# Print the forecasted values
print(full_data)

# Save the forecasted data to a CSV file
full_data.to_csv('forecasted_data_all_columns_until_2032.csv', index=False)


Index(['year', 'female', 'male', 'single', 'married', 'full_time', 'part_time',
       'visa_f', 'visa_j', 'visa_other'],
      dtype='object')


  future_years = pd.date_range(start=f'{last_year + 1}-01-01', end='2032-12-31', freq='A-DEC').year
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting param

      year  Year        female           male        single        married  \
0   2007.0  2012           NaN            NaN           NaN            NaN   
1   2008.0  2013           NaN            NaN           NaN            NaN   
2   2009.0  2014           NaN            NaN           NaN            NaN   
3   2010.0  2015           NaN            NaN           NaN            NaN   
4   2011.0  2016           NaN            NaN           NaN            NaN   
5      NaN  2017  338671.00000  425824.000000  6.903390e+05   74156.000000   
6      NaN  2018  338671.00000  425824.000000  6.903390e+05   74156.000000   
7      NaN  2019  338671.00000  425824.000000  6.903390e+05   74156.000000   
8      NaN  2020  338671.00000  425824.000000  6.903390e+05   74156.000000   
9      NaN  2021  338671.00000  425824.000000  6.903390e+05   74156.000000   
10     NaN  2022  338671.00000  425824.000000  6.903390e+05   74156.000000   
11     NaN  2023  338671.00000  425824.000000  6.903390e+05   74

  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'
