# Importing data

In [None]:
import pandas as pd

# Load the weather observations; the CSV's DATE column becomes the row index.
weather = pd.read_csv('weather.csv', index_col="DATE")

In [None]:
# Display the freshly loaded frame.
weather

# Cleaning the Data

In [None]:
# Fraction of missing values in each column (missing count / number of rows).
null_pct = weather.apply(pd.isnull).sum()/weather.shape[0]
null_pct

In [None]:
# Keep only the columns in which fewer than 5% of the values are missing.
valid_columns = weather.columns[null_pct < .05]

valid_columns

In [None]:
# Take a copy so later assignments modify an independent frame rather than
# a selection of the original one.
weather = weather[valid_columns].copy()

In [None]:
# Normalise the column names to lower case; all later cells use lower-case names.
weather.columns = weather.columns.str.lower()

In [None]:
weather

# Checking the Data

In [None]:
weather = weather.ffill() #forward-fill: a missing entry takes the most recent earlier value in its column

In [None]:
weather.apply(pd.isnull).sum() #missing values left per column after the forward-fill

In [None]:
weather.dtypes #checking data types

In [None]:
weather.index #inspect the row index (the DATE values read from the CSV)

In [None]:
# Parse the index into datetimes so date parts (.year, .month, ...) can be used below.
weather.index = pd.to_datetime(weather.index)

In [None]:
weather.index

In [None]:
weather.index.year

In [None]:
weather.index.year.value_counts().sort_index()#number of rows per year, in year order; a low count points at a gap in the data

In [None]:
# Plot the snwd column against the date index.
weather['snwd'].plot()

In [None]:
weather

In [None]:
# The prediction target for each row is the tmax of the NEXT row.
# shift(-1) leaves the last row without a target (NaN).
weather['target'] = weather.shift(-1)["tmax"]

In [None]:
weather

In [None]:
# Forward-fill again so the last row's missing target is filled from the
# row before it instead of staying NaN.
weather = weather.ffill()

In [None]:
weather

In [None]:
weather[['prcp', 'snow', 'snwd', 'tmax', 'tmin', 'target']].corr() #pairwise correlation between these columns

# Building the Model

In [None]:
# Ridge regression is similar to linear regression, except that it penalizes
# the coefficients to account for multicollinearity between predictors.
from sklearn.linear_model import Ridge

'''
The parameter alpha is replacement of lambda (which is a reserved keyword in python and cannot be used in  Ridge regression).
The alpha parameter controls how much the coefficients are shrunk to account for collinearity
'''
rr = Ridge(alpha=.1)

In [None]:
# These are the columns used to predict the target: every column except the
# target itself and the 'name' / 'station' columns.
predictors = weather.columns[~weather.columns.isin(['target', 'name', 'station'])]

In [None]:
predictors

In [None]:
def backtest(weather, model, predictors, start=3650, step=90):
    """Walk forward through ``weather``, refitting ``model`` before each fold.

    For every fold the model is fitted on all rows before position ``i`` and
    then asked to predict only the next ``step`` rows, so no row is ever
    scored by a model that was trained on it.

    Parameters
    ----------
    weather : pandas.DataFrame
        Data containing the ``predictors`` columns and a ``'target'`` column.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    predictors : list-like
        Column labels used as model inputs.
    start : int, default 3650
        Number of leading rows reserved for the first training set
        (about ten years if there is one row per day).
    step : int, default 90
        Number of rows predicted per fold.

    Returns
    -------
    pandas.DataFrame
        One row per predicted observation, indexed like ``weather``, with
        columns ``'actual'``, ``'prediction'`` and ``'diff'`` (the absolute
        error).

    Raises
    ------
    ValueError
        If ``weather`` has no rows at or after position ``start``, so no fold
        can be evaluated.
    """
    all_predictions = []  # one frame of predictions per fold

    for i in range(start, weather.shape[0], step):
        train = weather.iloc[:i, :]
        # Evaluate only on the rows that follow the training window. Slicing
        # from 0 here would score the model on its own training data and
        # emit the same rows again in every fold.
        test = weather.iloc[i:(i + step), :]

        model.fit(train[predictors], train['target'])

        # predict() returns a bare array; re-attach the test index so it
        # lines up with the actual values.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ['actual', 'prediction']

        # Absolute error of each prediction.
        combined["diff"] = (combined["prediction"] - combined["actual"]).abs()

        all_predictions.append(combined)

    if not all_predictions:
        raise ValueError(
            f"backtest needs more than start={start} rows; "
            f"got {weather.shape[0]}"
        )

    return pd.concat(all_predictions)

In [None]:
# Run the backtest with the ridge model and the predictor columns chosen above.
predictions = backtest(weather,rr,predictors)

In [None]:
predictions

#### Testing the accuracy

In [None]:
# Root Mean Squared Error
import numpy as np
def rmse(predictions, targets):
    """Return the root mean squared error between predictions and targets.

    Both arguments must support element-wise subtraction and expose
    ``.mean()`` on the result (NumPy arrays or pandas Series).
    """
    errors = predictions - targets
    squared_errors = errors ** 2
    return np.sqrt(squared_errors.mean())
# Only the last expression of a cell is echoed, so print the RMSE explicitly;
# as a bare expression in the middle of the cell its value would be discarded.
# Arguments follow the rmse(predictions, targets) signature.
print(rmse(predictions["prediction"], predictions["actual"]))

# Mean Absolute Error
predictions["diff"].mean()

# Adding more Predictors

In [None]:
#This function calculates the Percentage difference
def pct_diff(old, new):
    """Return the change from ``old`` to ``new`` as a fraction of ``old``."""
    change = new - old
    return change / old

#This function does average of the values in a specified horizon(time period)
def compute_rolling(weather, horizon, col):
    """Add a rolling-mean column and its relative-difference column for ``col``.

    Two columns are written into ``weather`` in place:
    ``rolling_<horizon>_<col>`` holds the mean of ``col`` over a window of
    ``horizon`` rows, and ``rolling_<horizon>_<col>_pct`` holds
    ``pct_diff`` of that mean (as the old value) against ``col`` (as the
    new value). The same frame is returned.
    """
    mean_name = f"rolling_{horizon}_{col}"
    pct_name = f'{mean_name}_pct'

    window = weather[col].rolling(horizon)
    weather[mean_name] = window.mean()
    weather[pct_name] = pct_diff(weather[mean_name], weather[col])

    return weather

# Window lengths (in rows) for the rolling features.
rolling_horizons = [3,14]

# Add the rolling mean and its relative-difference column for every
# (horizon, column) pair.
for horizon in rolling_horizons:
    for col in ['tmax','tmin','prcp']:
        weather = compute_rolling(weather , horizon, col)


In [None]:
weather

In [None]:
weather = weather.iloc[14:,:] #drop the first 14 rows, which covers every row whose 14-row rolling window is incomplete

In [None]:
weather = weather.fillna(0) #replace any remaining missing values with 0 (presumably 0/0 results in the *_pct columns; verify)
weather.index.month
In [None]:
#expanding(1) uses a growing window: each row gets the mean of every row from the first one up to and including itself
def expand_mean(df):
    """Return the running mean of ``df``.

    Each position holds the mean of all values from the first row up to and
    including that row; a single observation is enough to produce a value.
    """
    growing_window = df.expanding(min_periods=1)
    return growing_window.mean()

for col in ["tmax", "tmin", "prcp"]:
    # Running mean within each calendar month: rows are grouped by month
    # number, so a January row is averaged with every January row (of any
    # year) up to and including itself.
    weather[f"month_avg_{col}"] = weather[col].groupby(weather.index.month, group_keys=False).apply(expand_mean)
    # Running mean within each day of the year: rows are grouped by
    # day-of-year, so e.g. 1 Jan 1971 is averaged with 1 Jan 1970 and itself.
    weather[f"day_avg_{col}"] = weather[col].groupby(weather.index.day_of_year, group_keys=False).apply(expand_mean)

In [None]:
weather

### Doing Predictions again

In [None]:
# Rebuild the predictor list so it includes the newly added feature columns.
predictors = weather.columns[~weather.columns.isin(['target', 'name', 'station'])]

In [None]:
predictors

In [None]:
# Backtest again with the extended predictor set.
predictions = backtest(weather, rr, predictors)

In [None]:
# Mean absolute error between the actual and predicted values.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(predictions['actual'], predictions['prediction'])

In [None]:
# Largest errors first.
predictions.sort_values('diff',ascending=False)

In [None]:
# Inspect the raw rows for 7-17 March 1990.
weather.loc["1990-03-07":"1990-03-17"]

In [None]:
# Distribution of the errors: how many predictions fall at each rounded error value.
predictions['diff'].round().value_counts().sort_index().plot()