# Project 1
## Time Series Forecasting

### Importing libraries and importing dataset

In [1]:
#importing libraries
import pandas as pd
import numpy as np
from prophet import Prophet
#to ignore warnings
import warnings
warnings.filterwarnings("ignore")
#setting a seed
np.random.seed(7)

### Data preprocessing

In [2]:
#loading dataset
data = pd.read_csv("DATASET.csv");
#dropping rows (as this particular section's count is not 10)
data = data.drop(index=[910, 911, 912, 913, 914, 915, 916, 917, 918])

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,year,Para-1,Para-2,Para-3,Para-4,Para-5,Para-6,Para-7,Para-8,Para-9,Para-10,Para-11,Para-12,Para-13
0,Sec_1,1,324.0,354.5,17.0,0.0,3,117,2600,400,0.0,0.0,0.0,0,1.52
1,,2,324.0,161.1,18.2,0.0,4,106,5950,1190,0.0,3.4,0.0,0,1.62
2,,3,324.0,170.7,18.5,0.0,4,110,5950,1190,0.0,21.3,0.0,4,1.68
3,,4,324.0,223.9,18.9,0.0,3,110,6150,1340,0.0,21.3,0.0,5,1.78
4,,5,324.0,228.2,18.4,0.0,2,113,6340,1450,0.0,23.9,0.0,7,1.8


In [4]:
errors = [] #list of rmse of each section
sections = [] #list of sections from the dataset

#loop for obtaining sections of 10 rows each
for i in range(0, len(data), 10):
    sections.append(data[i:i+10])

In [5]:
#choosing indices of 15 random sections from the sections list for training and forecasting
indices = np.random.choice(np.arange(len(sections)), 15, replace = False)

### Model training and predictions

In [6]:
#main loop for each randomly chosen section
for i in indices:
    section = sections[i]
    #dropping the first columne (unnamed)
    section.drop(columns=section.columns[[0]], inplace=True)
    #replacing the values in the "year" column with actual dates in place of numbers
    section['year'] = pd.date_range(start='12/1/2012', periods=10, freq=pd.offsets.MonthEnd(12))
    parameters = [] #parameters stores the list of multiple 2d lists in the form of 'year' as first column and Para-[9-13] as second column

    #the loop to obtain values of parameters
    for i in range(5):
        p = section.iloc[: , [0, i+9]]
        #converting names of the columns due to facebook prophet's documentation
        p.rename(columns = {'year': 'ds' , p.columns.tolist()[1]: 'y'}, inplace = True)
        p = p.reset_index()
        parameters.append(p)
        
    all_forecasts = [] #this list stores the results of all the model predictions
    for parameter in parameters:
        #training
        m = Prophet()
        m.fit(parameter.iloc[:, :8]) #taking the only first 9 elements as training data and the last element will be choosen to find the rmse
        #forecasting on yearly basis as freq = 'Y'
        future = m.make_future_dataframe(periods=1,freq='Y')
        #making prediction
        forecast = m.predict(future)
        all_forecasts.append(forecast)
        
        
    preds, actual = [], [] #pred come from forecast predictions and actual are the last elements specified above

    for forecast in all_forecasts:
        preds.append(forecast['yhat'][9]) #the yhat column of the forecast predictions contains the prediction
    for parameter in parameters:
        actual.append(parameter['y'][9])
        
    #finding the rmse    
    from sklearn.metrics import mean_squared_error
    errors.append(mean_squared_error(preds, actual))

12:51:16 - cmdstanpy - INFO - Chain [1] start processing
12:51:16 - cmdstanpy - INFO - Chain [1] done processing
12:51:16 - cmdstanpy - INFO - Chain [1] start processing
12:51:16 - cmdstanpy - INFO - Chain [1] done processing
12:51:17 - cmdstanpy - INFO - Chain [1] start processing
12:51:17 - cmdstanpy - INFO - Chain [1] done processing
12:51:17 - cmdstanpy - INFO - Chain [1] start processing
12:51:17 - cmdstanpy - INFO - Chain [1] done processing
12:51:17 - cmdstanpy - INFO - Chain [1] start processing
12:51:17 - cmdstanpy - INFO - Chain [1] done processing
12:51:18 - cmdstanpy - INFO - Chain [1] start processing
12:51:18 - cmdstanpy - INFO - Chain [1] done processing
12:51:19 - cmdstanpy - INFO - Chain [1] start processing
12:51:19 - cmdstanpy - INFO - Chain [1] done processing
12:51:19 - cmdstanpy - INFO - Chain [1] start processing
12:51:19 - cmdstanpy - INFO - Chain [1] done processing
12:51:19 - cmdstanpy - INFO - Chain [1] start processing
12:51:24 - cmdstanpy - INFO - Chain [1]

### Outputs

In [7]:
errors #all the rmse for each randomly chosen section

[269.0456399888305,
 534.4103773375945,
 0.7450611906815272,
 13.719842649729362,
 36.05195418380234,
 160.33687551140952,
 4.7031135790337535,
 78.53638675657176,
 325.6168736574325,
 207.5752898515746,
 109.55166065025642,
 352.51630932354846,
 50.46548794986806,
 453.9766273207788,
 2.7261498413448164]

In [8]:
final_rmse = np.mean(errors) #taking mean of all the rmse

In [9]:
final_rmse

173.33184331949712