In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, MultiTaskLassoCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
import tensorflow as tf
from sklearn.neural_network import MLPRegressor

In [None]:
# Load the AQI CSV and index it by parsed timestamps.
df_aqi = pd.read_csv('/Users/nikmag/Desktop/USC/Research/SSI/processed_files/processed_los_angeles_aqi_14_months.csv')
# pop() returns the raw 'timestamp' column and removes it from the frame in one step.
df_aqi.index = df_aqi.pop('timestamp').apply(pd.Timestamp)

In [None]:
# Load the weather CSV and index it by parsed timestamps.
df_meo = pd.read_csv('/Users/nikmag/Desktop/USC/Research/SSI/processed_files/processed_los_angeles_weather_14_months.csv')
# pop() returns the raw 'timestamp' column and removes it from the frame in one step.
df_meo.index = df_meo.pop('timestamp').apply(pd.Timestamp)

In [None]:
# Sensor/region names modelled in this notebook; they are used as column-name
# suffixes (e.g. 'aqi_<name>') when selecting features and targets.
sensors = [
    'W San Gabriel Vly',
    'San Gabriel Mts',
    'SW San Bernardino',
    'Southeast LA CO',
    'South Coastal LA',
    'Central LA CO',
    'Santa Clarita Vly',
    'W San Fernando Vly',
    'E San Gabriel V-2',
]

In [None]:
# Collect the df_aqi columns whose names end with one of these four region names.
aqi_cols = list(filter(
    lambda name: name.endswith(('Antelope Vly', 'SW Coastal LA', 'NW Coastal LA', 'E San Fernando Vly')),
    df_aqi.columns,
))

In [None]:
# Collect the df_meo columns whose names end with one of these four region names.
meo_cols = list(filter(
    lambda name: name.endswith(('Antelope Vly', 'SW Coastal LA', 'NW Coastal LA', 'E San Fernando Vly')),
    df_meo.columns,
))

In [None]:
# Remove the selected region columns from both frames.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second run no longer raises KeyError for already-dropped columns.
# `columns=` already implies the column axis, so no `axis` argument is needed.
df_aqi = df_aqi.drop(columns=aqi_cols, errors='ignore')
df_meo = df_meo.drop(columns=meo_cols, errors='ignore')

In [None]:
# Left-join the weather columns onto the AQI rows using the timestamp index of both frames.
df = df_aqi.merge(df_meo, how='left', left_index=True, right_index=True)

In [None]:
# Treat exact zeros as missing values in every column.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE(review): this also blanks weather columns where 0.0 may be a legitimate
# reading (e.g. wind speed, cloud cover) -- confirm that is intended.
df.replace(0.0, np.nan, inplace=True)

In [None]:
def masked_mse_tf(preds, labels, null_val=np.nan):
    """
    Mean squared error between preds and labels with masked labels ignored.

    Entries whose label equals ``null_val`` (or is NaN when ``null_val`` is NaN)
    get weight 0; the remaining weights are divided by the mean of the mask so
    that the final mean over all entries equals the mean over valid entries.
    :param preds: tensor of predictions.
    :param labels: tensor of targets; the mask is derived from this argument.
    :param null_val: label value marking a missing entry (default NaN).
    :return: scalar tensor holding the masked MSE.
    """
    # tf.math.is_nan is used because the top-level tf.is_nan alias does not
    # exist in TensorFlow 2.x.
    if np.isnan(null_val):
        mask = tf.logical_not(tf.math.is_nan(labels))
    else:
        mask = tf.not_equal(labels, null_val)
    mask = tf.cast(mask, tf.float32)
    mask /= tf.reduce_mean(mask)
    # An all-masked batch gives 0/0 above; turn those NaN weights into zeros.
    mask = tf.where(tf.math.is_nan(mask), tf.zeros_like(mask), mask)
    loss = tf.square(tf.subtract(preds, labels))
    loss = loss * mask
    # NaN labels produce NaN errors even with zero weight; zero them out.
    loss = tf.where(tf.math.is_nan(loss), tf.zeros_like(loss), loss)
    return tf.reduce_mean(loss)


def masked_mae_tf(preds, labels, null_val=np.nan):
    """
    Mean absolute error between preds and labels with masked labels ignored.

    Entries whose label equals ``null_val`` (or is NaN when ``null_val`` is NaN)
    get weight 0; the remaining weights are divided by the mean of the mask so
    that the final mean over all entries equals the mean over valid entries.
    :param preds: tensor of predictions.
    :param labels: tensor of targets; the mask is derived from this argument.
    :param null_val: label value marking a missing entry (default NaN).
    :return: scalar tensor holding the masked MAE.
    """
    # tf.math.is_nan is used because the top-level tf.is_nan alias does not
    # exist in TensorFlow 2.x.
    if np.isnan(null_val):
        mask = tf.logical_not(tf.math.is_nan(labels))
    else:
        mask = tf.not_equal(labels, null_val)
    mask = tf.cast(mask, tf.float32)
    mask /= tf.reduce_mean(mask)
    # An all-masked batch gives 0/0 above; turn those NaN weights into zeros.
    mask = tf.where(tf.math.is_nan(mask), tf.zeros_like(mask), mask)
    loss = tf.abs(tf.subtract(preds, labels))
    loss = loss * mask
    # NaN labels produce NaN errors even with zero weight; zero them out.
    loss = tf.where(tf.math.is_nan(loss), tf.zeros_like(loss), loss)
    return tf.reduce_mean(loss)


def masked_rmse_tf(preds, labels, null_val=np.nan):
    """
    Root mean squared error with masking.
    :param preds: tensor of predictions.
    :param labels: tensor of targets.
    :param null_val: label value marking a missing entry (default NaN).
    :return: square root of masked_mse_tf for the same arguments.
    """
    mse = masked_mse_tf(preds=preds, labels=labels, null_val=null_val)
    return tf.sqrt(mse)


def masked_rmse_np(preds, labels, null_val=np.nan):
    """Return the square root of masked_mse_np evaluated on the same arguments."""
    mse = masked_mse_np(preds=preds, labels=labels, null_val=null_val)
    return np.sqrt(mse)


def masked_mse_np(preds, labels, null_val=np.nan):
    """
    NumPy masked mean squared error.

    Labels equal to ``null_val`` (NaN labels when ``null_val`` is NaN) get zero
    weight; the other weights are divided by the mask mean, so the mean over all
    entries equals the mean over the valid ones.
    :param preds: array of predictions.
    :param labels: array of targets; the mask is derived from it.
    :param null_val: label value marking a missing entry (default NaN).
    :return: masked MSE as a scalar.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        valid = ~np.isnan(labels) if np.isnan(null_val) else np.not_equal(labels, null_val)
        weights = valid.astype('float32')
        weights = weights / np.mean(weights)
        squared_err = np.square(np.subtract(preds, labels)).astype('float32')
        # NaN products (missing labels, or 0/0 weights) are counted as zero.
        return np.mean(np.nan_to_num(squared_err * weights))


def masked_mae_np(preds, labels, null_val=np.nan):
    """
    NumPy masked mean absolute error.

    Labels equal to ``null_val`` (NaN labels when ``null_val`` is NaN) get zero
    weight; the other weights are divided by the mask mean, so the mean over all
    entries equals the mean over the valid ones.
    :param preds: array of predictions.
    :param labels: array of targets; the mask is derived from it.
    :param null_val: label value marking a missing entry (default NaN).
    :return: masked MAE as a scalar.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        valid = ~np.isnan(labels) if np.isnan(null_val) else np.not_equal(labels, null_val)
        weights = valid.astype('float32')
        weights = weights / np.mean(weights)
        abs_err = np.abs(np.subtract(preds, labels)).astype('float32')
        # NaN products (missing labels, or 0/0 weights) are counted as zero.
        return np.mean(np.nan_to_num(abs_err * weights))


def masked_mape_np(preds, labels, null_val=np.nan):
    """
    NumPy masked mean absolute percentage error (as a fraction, not x100).

    Labels equal to ``null_val`` (NaN labels when ``null_val`` is NaN) get zero
    weight; the other weights are divided by the mask mean, so the mean over all
    entries equals the mean over the valid ones.
    :param preds: array of predictions.
    :param labels: array of targets; also the denominator of the ratio.
    :param null_val: label value marking a missing entry (default NaN).
    :return: masked MAPE as a scalar.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        valid = ~np.isnan(labels) if np.isnan(null_val) else np.not_equal(labels, null_val)
        weights = valid.astype('float32')
        weights = weights / np.mean(weights)
        diff = np.subtract(preds, labels).astype('float32')
        ratio = np.abs(np.divide(diff, labels))
        # nan_to_num zeroes NaN products; note it maps +/-inf to very large
        # finite numbers, so an unmasked zero label would dominate the mean.
        return np.mean(np.nan_to_num(weights * ratio))


# Builds loss function.
def masked_mse_loss(scaler, null_val):
    """
    Build a masked-MSE loss function.
    :param scaler: object with ``inverse_transform``; when truthy, predictions
        and labels are inverse-transformed before the loss is computed.
    :param null_val: label value treated as missing by masked_mse_tf.
    :return: function ``loss(preds, labels)`` returning the masked MSE.
    """
    def loss(preds, labels):
        if not scaler:
            return masked_mse_tf(preds=preds, labels=labels, null_val=null_val)
        return masked_mse_tf(
            preds=scaler.inverse_transform(preds),
            labels=scaler.inverse_transform(labels),
            null_val=null_val,
        )

    return loss


def masked_rmse_loss(scaler, null_val):
    """
    Build a masked-RMSE loss function.
    :param scaler: object with ``inverse_transform``; when truthy, predictions
        and labels are inverse-transformed before the loss is computed.
    :param null_val: label value treated as missing by masked_rmse_tf.
    :return: function ``loss(preds, labels)`` returning the masked RMSE.
    """
    def loss(preds, labels):
        if not scaler:
            return masked_rmse_tf(preds=preds, labels=labels, null_val=null_val)
        return masked_rmse_tf(
            preds=scaler.inverse_transform(preds),
            labels=scaler.inverse_transform(labels),
            null_val=null_val,
        )

    return loss


def masked_mae_loss(scaler, null_val):
    """
    Build a masked-MAE loss function.
    :param scaler: object with ``inverse_transform``; when truthy, predictions
        and labels are inverse-transformed before the loss is computed.
    :param null_val: label value treated as missing by masked_mae_tf.
    :return: function ``loss(preds, labels)`` returning the masked MAE.
    """
    def loss(preds, labels):
        if not scaler:
            return masked_mae_tf(preds=preds, labels=labels, null_val=null_val)
        return masked_mae_tf(
            preds=scaler.inverse_transform(preds),
            labels=scaler.inverse_transform(labels),
            null_val=null_val,
        )

    return loss


def calculate_metrics(df_pred, df_test, null_val):
    """
    Calculate the masked MAE, MAPE and RMSE between two DataFrames.
    :param df_pred: DataFrame of predictions.
    :param df_test: DataFrame of ground-truth values; compared element-wise
        with df_pred by position (row/column labels are not aligned).
    :param null_val: ground-truth value treated as missing (np.nan masks NaNs).
    :return: list ``[mae, mape, rmse]``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` works on both
    # old and new pandas. Convert each frame once and reuse the arrays.
    preds = df_pred.values
    labels = df_test.values
    mape = masked_mape_np(preds=preds, labels=labels, null_val=null_val)
    mae = masked_mae_np(preds=preds, labels=labels, null_val=null_val)
    rmse = masked_rmse_np(preds=preds, labels=labels, null_val=null_val)
    return [mae, mape, rmse]

### Spatial Model

In [None]:
# Spatial model: one MLP per target sensor, fed with the columns of all the
# *other* sensors plus those sensors' AQI over the previous 12 rows.
# The 12 targets are the sensor's AQI 1..6 steps ahead followed by the min/max
# over the 7-12, 13-24 and 25-48 step windows.
# pred_ann[k] / gt[k] hold, for target column k (1..12), one list per sensor in
# `sensors` order.
pred_ann = {k: [] for k in range(1, 13)}
gt = {k: [] for k in range(1, 13)}

# (first, last) future offsets that are summarised by a min and a max target
horizon_windows = ((7, 12), (13, 24), (25, 48))

# NOTE(review): training stops on 2017-10-30 and testing starts on 2017-11-01,
# so 2017-10-31 is in neither split -- confirm the gap is intentional.
train_end = pd.Timestamp('2017-10-30 23:00:00')
test_start = pd.Timestamp('2017-11-01 00:00:00')
test_end = pd.Timestamp('2017-12-31 23:00:00')

for sensor in sensors:

    start = time.time()

    # current AQI and Weather data (explicit copy so `df` is never modified)
    df_temp = df.copy()

    # next 48 hour prediction: hz<j> is this sensor's AQI shifted j rows back
    target_aqi = df_temp['aqi_%s' % sensor]
    for j in range(1, 49):
        df_temp['hz{}'.format(j)] = target_aqi.shift(-j)

    # summarise the far horizons by their min / max
    for first, last in horizon_windows:
        window = df_temp[['hz{}'.format(i) for i in range(first, last + 1)]]
        df_temp['hz{}_{}_min'.format(first, last)] = window.min(axis=1)
        df_temp['hz{}_{}_max'.format(first, last)] = window.max(axis=1)

    # only hz1..hz6 and the six window summaries remain as targets
    df_temp = df_temp.drop(columns=['hz{}'.format(i) for i in range(7, 49)])

    # the target sensor's own readings are not used as inputs
    df_temp = df_temp.drop(columns=[col for col in df_temp if col.endswith(sensor)])

    # past 12 hours AQI data of every other sensor
    sensors1 = [s for s in sensors if s != sensor]
    for k in sensors1:
        neighbour_aqi = df_temp['aqi_%s' % k]
        for j in range(1, 13):
            df_temp['p{}_{}_aqi'.format(j, k)] = neighbour_aqi.shift(j)

    y_cols = [i for i in df_temp.columns.tolist() if i.startswith('hz')]
    x_cols = [i for i in df_temp.columns.tolist() if not i.startswith('hz')]

    X = df_temp.loc[:, x_cols]
    y = df_temp.loc[:, y_cols]

    X_train = X.loc[:train_end, :]
    y_train = y.loc[:train_end, :]
    X_test = X.loc[test_start:test_end, :]
    y_test = y.loc[test_start:test_end, :]

    # train only on rows where every input and every target is present
    train = pd.concat([X_train, y_train], axis=1).dropna(axis=0, how='any')
    X_train = train.loc[:, x_cols]
    y_train = train.loc[:, y_cols]

    # Impute missing test inputs with the test-period column means. Rebinding
    # (not inplace=True) avoids SettingWithCopyWarning on the .loc slice.
    X_test = X_test.fillna(X_test.mean())

    # hidden_layer_sizes is written as a real 1-tuple ((100) is just the int 100);
    # random_state makes the fit reproducible across runs.
    # NOTE(review): per the scikit-learn docs learning_rate='adaptive' is only
    # used with solver='sgd'; with the default solver it has no effect.
    model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=250,
                         learning_rate='adaptive', random_state=0)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    actual = np.array(y_test)

    # store each of the 12 target columns under keys 1..12
    for column_pos, key in enumerate(range(1, 13)):
        pred_ann[key].append(predictions[:, column_pos].tolist())
        gt[key].append(actual[:, column_pos].tolist())

    end = time.time()

    print(sensor+'='+str(np.round(end-start,2)))

### Temporal Model

In [None]:
# Temporal model: one multi-task Lasso per sensor, fed only with that sensor's
# own columns: current values, the previous 12 rows, future weather values
# (1..6 steps ahead plus min/max over the 7-12, 13-24 and 25-48 step windows)
# and hour-of-day / day-of-week.
# Targets match the spatial model; ground truth is not collected again here
# because `gt` is already filled by the spatial cell.
pred_lr = {k: [] for k in range(1, 13)}

# (first, last) future offsets that are summarised by a min and a max
horizon_windows = ((7, 12), (13, 24), (25, 48))

weather_cols = ['pressure', 'wind_speed', 'cloud_cover', 'visibility',
                'wind_bearing', 'humidity', 'temperature']

# NOTE(review): training stops on 2017-10-30 and testing starts on 2017-11-01,
# so 2017-10-31 is in neither split -- confirm the gap is intentional.
train_end = pd.Timestamp('2017-10-30 23:00:00')
test_start = pd.Timestamp('2017-11-01 00:00:00')
test_end = pd.Timestamp('2017-12-31 23:00:00')

for sensor in sensors:

    start = time.time()

    # current AQI and weather of this sensor only (explicit copy of the selection)
    df_temp = df.loc[:, [i for i in df.columns.tolist() if i.endswith(sensor)]].copy()

    # past 12 hours data
    for col in ['aqi'] + weather_cols:
        history = df_temp[col + '_%s' % sensor]
        for j in range(1, 13):
            df_temp['p{}_{}'.format(j, col)] = history.shift(j)

    # weather forecast for 48 hours
    # NOTE(review): the "forecast" is the observed future value (negative shift).
    for col in weather_cols:
        observed = df_temp[col + '_%s' % sensor]
        for j in range(1, 49):
            df_temp['f{}_{}'.format(j, col)] = observed.shift(-j)

        for first, last in horizon_windows:
            window = df_temp[['f{}_{}'.format(k, col) for k in range(first, last + 1)]]
            df_temp['f{}_{}_min_{}'.format(first, last, col)] = window.min(axis=1)
            df_temp['f{}_{}_max_{}'.format(first, last, col)] = window.max(axis=1)

        # only f1..f6 and the six window summaries remain per weather variable
        df_temp = df_temp.drop(columns=['f{}_{}'.format(k, col) for k in range(7, 49)])

    # HourOfDay and DayOfWeek are kept as integers: casting them to str made
    # X_test.mean() below raise TypeError on pandas >= 2.0, and the estimator
    # converted the digit strings back to numbers anyway.
    # NOTE(review): if these were meant to be categorical they still need to be
    # one-hot encoded; as written they enter the model as ordinal numbers.
    df_temp['hourofday'] = df_temp.index.hour
    df_temp['dayofweek'] = df_temp.index.dayofweek

    # next 48 hour prediction: hz<j> is this sensor's AQI shifted j rows back
    target_aqi = df_temp['aqi_%s' % sensor]
    for j in range(1, 49):
        df_temp['hz{}'.format(j)] = target_aqi.shift(-j)

    for first, last in horizon_windows:
        window = df_temp[['hz{}'.format(i) for i in range(first, last + 1)]]
        df_temp['hz{}_{}_min'.format(first, last)] = window.min(axis=1)
        df_temp['hz{}_{}_max'.format(first, last)] = window.max(axis=1)

    df_temp = df_temp.drop(columns=['hz{}'.format(i) for i in range(7, 49)])

    y_cols = [i for i in df_temp.columns.tolist() if i.startswith('hz')]
    x_cols = [i for i in df_temp.columns.tolist() if not i.startswith('hz')]

    X = df_temp.loc[:, x_cols]
    y = df_temp.loc[:, y_cols]

    X_train = X.loc[:train_end, :]
    y_train = y.loc[:train_end, :]
    X_test = X.loc[test_start:test_end, :]
    y_test = y.loc[test_start:test_end, :]

    # train only on rows where every input and every target is present
    train = pd.concat([X_train, y_train], axis=1).dropna(axis=0, how='any')
    X_train = train.loc[:, x_cols]
    y_train = train.loc[:, y_cols]

    # Impute missing test inputs with the test-period column means. Rebinding
    # (not inplace=True) avoids SettingWithCopyWarning on the .loc slice.
    X_test = X_test.fillna(X_test.mean())

    model = MultiTaskLassoCV(alphas=[0.1, 1, 10], cv=3)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    actual = np.array(y_test)

    # store each of the 12 target columns under keys 1..12
    for column_pos, key in enumerate(range(1, 13)):
        pred_lr[key].append(predictions[:, column_pos].tolist())

    end = time.time()

    print(sensor+'='+str(np.round(end-start,2)))

### Prediction Aggregator

In [None]:
# Ground truth as one wide frame: a "<target>_<sensor>" column for each of the
# 12 target columns and each sensor. The per-target frames are built first and
# concatenated once, instead of growing a DataFrame inside the loop.
df_gt = pd.concat(
    [
        pd.DataFrame(np.array(gt[h]).T).rename(
            columns={i: "{}_".format(h) + sensor for i, sensor in enumerate(sensors)}
        )
        for h in range(1, 13)
    ],
    axis=1,
)

In [None]:
# Spatial-model predictions as one wide frame: a "<target>_<sensor>" column for
# each of the 12 target columns and each sensor. The per-target frames are built
# first and concatenated once, instead of growing a DataFrame inside the loop.
df_ann = pd.concat(
    [
        pd.DataFrame(np.array(pred_ann[h]).T).rename(
            columns={i: "{}_".format(h) + sensor for i, sensor in enumerate(sensors)}
        )
        for h in range(1, 13)
    ],
    axis=1,
)

In [None]:
# Temporal-model predictions as one wide frame: a "<target>_<sensor>" column for
# each of the 12 target columns and each sensor. The per-target frames are built
# first and concatenated once, instead of growing a DataFrame inside the loop.
df_lr = pd.concat(
    [
        pd.DataFrame(np.array(pred_lr[h]).T).rename(
            columns={i: "{}_".format(h) + sensor for i, sensor in enumerate(sensors)}
        )
        for h in range(1, 13)
    ],
    axis=1,
)

In [None]:
# Per output column, fit a decision tree that maps the two base-model
# predictions (spatial, temporal) to the ground truth, then apply it to the
# held-out base predictions.
# NOTE(review): df_test_ann, df_test_lr and df_test_pred are not created in any
# cell visible in this notebook; they must already exist in the kernel.
for col in df_gt.columns.tolist():
    y_train = df_gt[col]
    X_train = pd.concat([df_ann[col], df_lr[col]], axis=1)
    X_test = pd.concat([df_test_ann[col], df_test_lr[col]], axis=1)

    # impute missing test inputs with the column means
    X_test = X_test.fillna(X_test.mean())

    # keep only rows where both base predictions and the target are present
    train = pd.concat([X_train, y_train], axis=1).dropna(axis=0, how='any')
    X_train, y_train = train.iloc[:, [0, 1]], train.iloc[:, 2]

    model = DecisionTreeRegressor().fit(X_train, y_train)

    df_test_pred[col] = model.predict(X_test)

In [None]:
# For each target column 1..12, score all sensors together and print the
# metrics in the order MAE, RMSE, MAPE (calculate_metrics returns [mae, mape, rmse]).
for i in range(1, 13):
    prefix = str(i) + "_"
    cols = [name for name in df_test_pred.columns.tolist() if name.startswith(prefix)]
    res = calculate_metrics(df_test_pred[cols], df_test_gt[cols], np.nan)
    for metric_pos in (0, 2, 1):
        print(res[metric_pos])