In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [2]:
df = pd.read_csv('Data/blr_data_final.csv')

In [3]:
df = df.drop('Datetime',axis=1)

In [4]:
df.head()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,Time_Season_Mean
0,30.42,69.58,2.04,12.4,7.58,19.1,0.19,4.05,12.41,0.56,3.43,84.0,104.679061
1,27.56,59.66,2.2,12.11,7.54,17.81,0.21,4.23,12.13,0.56,4.54,83.0,103.78865
2,24.94,53.6,1.66,10.82,6.52,17.42,0.3,4.28,13.13,0.52,4.2,82.0,103.111546
3,22.94,49.92,1.92,10.32,6.46,16.86,0.13,4.14,11.82,0.47,4.63,81.0,101.90411
4,22.57,49.755,1.94,9.82,6.22,16.35,0.15,4.26,10.31,0.48,3.91,78.0,99.74364


In [5]:
X = df.drop('AQI',axis=1)
y = df['AQI']

## Splitting Data for forcasting

In [6]:
training_size = int(len(df)*0.65)
test_size = len(df)-training_size

X_train = X[:training_size]
X_test = X[training_size:]
y_train = y[:training_size]
y_test = y[training_size:]

In [13]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((31324, 12), (16868, 12), (31324,), (16868,))

## Scaling the data

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [17]:
sc_X = MinMaxScaler()
sc_y = MinMaxScaler()

In [18]:
X_train_sc = sc_X.fit_transform(X_train)
y_train_sc = sc_y.fit_transform(np.array(y_train).reshape(-1, 1))

In [19]:
X_test_sc = sc_X.transform(X_test)
y_test_sc = sc_y.transform(np.array(y_test).reshape(-1, 1))

In [28]:
X_train_sc = pd.DataFrame(X_train_sc,columns=X.columns)

X_test_sc = pd.DataFrame(X_test_sc,columns=X.columns)

y_train_sc = pd.Series(y_train_sc.reshape(len(y_train_sc)),name='AQI')

y_test_sc = pd.Series(y_test_sc.reshape(len(y_test_sc)),name='AQI')

In [29]:
X_train_sc

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Time_Season_Mean
0,0.030411,0.069571,0.007003,0.034083,0.023223,0.039275,0.003803,0.020374,0.061693,0.001502,0.006868,0.835678
1,0.027551,0.059651,0.007561,0.033282,0.023100,0.036620,0.004203,0.021282,0.060292,0.001502,0.009091,0.813212
2,0.024930,0.053591,0.005679,0.029719,0.019975,0.035817,0.006005,0.021534,0.065296,0.001395,0.008410,0.796129
3,0.022930,0.049911,0.006585,0.028338,0.019792,0.034664,0.002602,0.020828,0.058741,0.001261,0.009271,0.765664
4,0.022560,0.049746,0.006655,0.026957,0.019056,0.033615,0.003002,0.021433,0.051186,0.001288,0.007829,0.711153
...,...,...,...,...,...,...,...,...,...,...,...,...
31319,0.023710,0.109972,0.019268,0.041319,0.040319,0.042960,0.021217,0.028090,0.059992,0.001368,0.007209,0.395839
31320,0.017200,0.079322,0.013763,0.029829,0.029167,0.043145,0.013411,0.031217,0.070399,0.000644,0.005607,0.370973
31321,0.029491,0.050131,0.014494,0.026211,0.027267,0.044010,0.011809,0.034344,0.073051,0.000885,0.005987,0.323271
31322,0.021560,0.037501,0.012717,0.022731,0.023897,0.043145,0.011609,0.033184,0.077454,0.000617,0.005306,0.306358


## Saving Scaler and the scaled data

In [30]:
import pickle

In [31]:
with open('Data/X_scaler.pkl', 'wb') as handle:
    pickle.dump(sc_X, handle)

In [32]:
with open('Data/y_scaler.pkl', 'wb') as handle:
    pickle.dump(sc_y, handle)

In [33]:
X_train_sc.to_csv('Data/Final/X_train_sc.csv',index=False)
y_train_sc.to_csv('Data/Final/y_train_sc.csv',index=False)

X_test_sc.to_csv('Data/Final/X_test_sc.csv',index=False)
y_test_sc.to_csv('Data/Final/y_test_sc.csv',index=False)

## Creating Time Series Data 

In [34]:
def create_timeseries_dataset(dataset,time_step=1):
    x,y = [],[]
    for i in range(len(dataset)-time_step-1):
        a = dataset[i:(i+time_step)]
        #print(dataset)
        x.append(a)
        y.append(dataset[i+time_step,-1])
    return np.array(x),np.array(y)

In [36]:
train_sc = np.concatenate((X_train_sc,np.array(y_train_sc).reshape(-1,1)),axis=1)
test_sc = np.concatenate((X_test_sc,np.array(y_test_sc).reshape(-1,1)),axis=1)

In [37]:
train_sc.shape,test_sc.shape

((31324, 13), (16868, 13))

In [38]:
time_step = 10
X_train,y_train = create_timeseries_dataset(train_sc,time_step=time_step)
X_test,y_test = create_timeseries_dataset(test_sc,time_step=time_step)

In [39]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((31313, 10, 13), (16857, 10, 13), (31313,), (16857,))

In [40]:
def flatten_timeseries_data(dataset):
    new_data = []
    for i in range(len(dataset)):
        x = dataset[i].flatten()
        new_data.append(x)
    new_data = np.array(new_data)
    return new_data

In [41]:
X_train = flatten_timeseries_data(X_train)
X_test = flatten_timeseries_data(X_test)

In [42]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((31313, 130), (16857, 130), (31313,), (16857,))

In [43]:
## Reshape input to be [samples,time_steps,features] which is required for LSTM

X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],1)
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1)

In [44]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((31313, 130, 1), (16857, 130, 1), (31313,), (16857,))

In [46]:
np.save('Data/Final/Time series/X_train.npy', X_train)
np.save('Data/Final/Time series/X_test.npy', X_test)
np.save('Data/Final/Time series/y_train.npy', y_train)
np.save('Data/Final/Time series/y_test.npy', y_test)

In [50]:
X_train_ld = np.load('Data/Final/Time series/y_test.npy')

In [51]:
X_train_ld.shape

(16857,)