In [None]:
!pip install catboost geopy mlxtend

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import tensorflow as tf

import random
import torch

from catboost import CatBoostRegressor, Pool

import sklearn
from sklearn.linear_model import LinearRegression
from xgboost import XGBRFRegressor
from lightgbm import LGBMRegressor


from mlxtend.regressor import StackingCVRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LassoCV,LinearRegression
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold,GroupKFold

RANDOM_STATE = 42

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, TensorFlow, NumPy's global generator and
    PyTorch (plus the CUDA generators when CUDA is available), and exports
    the seed through the ``PYTHONHASHSEED`` and ``PYTHONSEED`` environment
    variables.

    Args:
        seed: Seed value handed to each generator; stored as ``str(seed)``
            in the environment variables.
    """
    seed_text = str(seed)
    for env_name in ('PYTHONHASHSEED', 'PYTHONSEED'):
        os.environ[env_name] = seed_text

    for seeder in (random.seed, tf.random.set_seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to seed on a CPU-only machine.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Seed all generators once, before any data or model code runs.
seed_everything(RANDOM_STATE)
# cuDNN flags for PyTorch: request deterministic behaviour and turn off benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [None]:
root_dir = '../input/airquality/'
TRAIN_FILE = 'Train.csv'
TEST_FILE = 'Test.csv'
SAMPLE_SUB = 'SampleSubmission.csv'

In [None]:
def read_df(csv):
    """Load one CSV file located under ``root_dir``.

    Args:
        csv: File name; it is appended verbatim to ``root_dir``.

    Returns:
        The DataFrame returned by ``pd.read_csv`` for that path.
    """
    csv_path = root_dir + csv
    frame = pd.read_csv(csv_path)
    return frame

In [None]:
# Read the three input tables in one pass: train, test and the sample submission.
train, test, sample_submission = (
    read_df(file_name) for file_name in (TRAIN_FILE, TEST_FILE, SAMPLE_SUB)
)

In [None]:
train

In [None]:
sample_submission

In [None]:
test.isna().sum()

In [None]:
# Stack train on top of test so the feature engineering below is applied to
# both at once; ignore_index gives the combined frame a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

In [None]:
df.info()

In [None]:
# Apply np.radians to each coordinate column with a single vectorised call
# instead of a Python-level lambda per row (same values, less overhead).
# NOTE: this cell is not idempotent; re-running it converts the values again.
df['lat'] = np.radians(df['lat'])
df['long'] = np.radians(df['long'])

In [None]:
# Columns handled as categorical, the regression target, and the remaining
# "numerical" columns (every train column that is not categorical, ID or
# timestamp; target_col is not excluded here).
cat_col = ['site', 'altitude', 'greenness', 'landform_90m', 'landform_270m']
target_col = 'ref_pm2_5'
num_cols = [
    name for name in train.columns
    if name not in (*cat_col, 'ID', 'created_at')
]

In [None]:
print(num_cols)

In [None]:
from scipy import stats
from scipy.stats import norm, skew #for some statistics


pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points


from subprocess import check_output
print(check_output(["ls", "."]).decode("utf8")) #check the files available in the directory

In [None]:
# Measure the skew of every numerical column (NaNs dropped per column).
skewed_feats = df[num_cols].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
# A bare expression in the middle of a cell is not displayed, so print it.
print(skewness.head(10))

# Filter on the 'Skew' column. Indexing the frame with a boolean DataFrame
# (skewness[abs(skewness) > 0.75]) only blanks the failing cells to NaN and
# keeps every row, so no feature was actually filtered out.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
# The target is log1p-transformed regardless of its skew because the
# prediction cells invert the model output with np.expm1.
skewed_features = skewness.index.union([target_col])

# log1p is used here; boxcox1p (imported above) is the alternative transform.
df[skewed_features] = np.log1p(df[skewed_features])

In [None]:
df['created_at'] = pd.to_datetime(df['created_at'])

In [None]:
# Expand the timestamp into calendar parts, each stored as a float column
# named after the pandas `.dt` attribute it comes from.
timestamp_parts = df['created_at'].dt
for date_feature in ['year', 'month', 'dayofyear', 'day', 'is_month_start',
                     'is_month_end', 'hour', 'quarter', 'weekday']:
    part_values = getattr(timestamp_parts, date_feature)
    df[date_feature] = part_values.astype('float')

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# df['date'] = df['created_at'].dt.date


# cal = calendar()
# holidays = cal.holidays(start =df['date'].min(), end=df['date'].max())

# pandas numbers weekdays Monday=0 ... Sunday=6, so the weekend is 5 and 6.
# The previous [6, 7] list never matched 7 and left Saturdays flagged as weekdays.
df['is_weekend'] = df['weekday'].isin([5.0, 6.0]).astype(int)

In [None]:
df['is_weekend'].value_counts()

In [None]:
# Fill missing humidity / temp readings with the mean of the same
# (month, hour) group.
for col in ['humidity', 'temp']:
    group_mean = df.groupby(['month', 'hour'])[col].transform('mean')
    df[col] = df[col].fillna(group_mean)

In [None]:
# Replace every categorical column with integer codes from a LabelEncoder.
le = sklearn.preprocessing.LabelEncoder()

for col in cat_col:
    as_category = df[col].astype('category')
    # fit_transform refits the encoder on each call, so one call per column is enough.
    df[col] = le.fit_transform(list(as_category.values))

# df = df.drop(columns=cat_col, axis=1)

In [None]:
# Encode cyclical calendar features as sine / cosine pairs.
for date_feature in ['day', 'dayofyear']:

    # Scale by this feature's own maximum. The previous code read df[col],
    # where `col` was a loop variable left over from an earlier cell, so the
    # period was taken from an unrelated column.
    max_val = df[date_feature].max()
    angle = 2 * np.pi * df[date_feature] / max_val

    df[date_feature+'_sin'] = np.sin(angle)
    df[date_feature+'_cos'] = np.cos(angle)

In [None]:
df = pd.get_dummies(df, columns=cat_col)

In [None]:
# Split the engineered frame back into train and test rows by ID membership.
in_train = df['ID'].isin(train['ID'].values)
train_df = df[in_train]
test_df = df[~in_train]

In [None]:
print(train_df.shape, test_df.shape)

In [None]:
def add_group_stats(frame, by, column, tag):
    """Append per-group mean / min / max of ``column`` to ``frame``.

    Args:
        frame: DataFrame containing both ``by`` and ``column``.
        by: Name of the column to group on.
        column: Name of the column to aggregate.
        tag: Infix used to build the new column names, e.g. ``'_By_Month_'``
            yields ``'<column>_By_Month_mean'``, ``'..._min'`` and ``'..._max'``.

    Returns:
        A new DataFrame, as produced by ``pd.merge``, with the three
        statistic columns appended after the existing ones.
    """
    stats_frame = frame.groupby(by=by, as_index=False).agg({column: ['mean', 'min', 'max']})
    # Flatten the two-level header by iterating the MultiIndex directly
    # (no deprecated .ravel()); the group key ('<by>', '') becomes '<by><tag>'.
    stats_frame.columns = [tag.join(levels) for levels in stats_frame.columns]
    key_copy = by + tag
    merged = pd.merge(frame, stats_frame, how="left", left_on=by, right_on=key_copy)
    # The right-hand key duplicates `by`, so drop it again.
    return merged.drop(columns=key_copy)


# Statistics are computed separately inside train and inside test, in the
# same column order as the original copy-pasted stanzas.
for stat_column in ['temp', 'humidity', 'pm2_5', 's2_pm2_5', 'pm10']:
    train_df = add_group_stats(train_df, 'created_at', stat_column, '_By_Created_At_')
    test_df = add_group_stats(test_df, 'created_at', stat_column, '_By_Created_At_')

for stat_column in ['humidity', 'temp']:
    train_df = add_group_stats(train_df, 'month', stat_column, '_By_Month_')
    test_df = add_group_stats(test_df, 'month', stat_column, '_By_Month_')

In [None]:
feature_cols = train_df.columns.difference(cat_col + ['ID', 'created_at','fold', target_col])
feature_cols

In [None]:
train_df.isna().sum()

In [None]:
print(train_df.shape,test_df.shape)

In [None]:
# def make_folds(df):

#   df['fold'] = -1

#   df = df.sample(frac=1).reset_index(drop=True)

#   num_bins = np.floor(1 + np.log2(len(df)))

#   df.loc[:, 'bins'] = pd.cut(df[target_col], bins= int(num_bins), labels = False)

#   kf = sklearn.model_selection.StratifiedKFold(n_splits = 5,shuffle =True, random_state = RANDOM_STATE )

#   for f,(trn_id, val_id) in enumerate(kf.split(X = df, y = df['bins'].values )):

#     df.loc[val_id, 'fold'] = int(f)

#   df = df.drop('bins',axis = 1)

#   return df

In [None]:
# `make_folds` is commented out in the previous cell, so an unconditional call
# raises NameError on Restart & Run All. The `fold` column is never read by
# later cells (it only appears in exclusion lists), so build it only when the
# helper has been re-enabled.
if 'make_folds' in globals():
    train_df = make_folds(train_df)

In [None]:
feature_cols = train_df.columns.difference(['ID', 'created_at','fold', target_col])

In [None]:
from sklearn.cluster import KMeans

def fe_cluster(train, test, n_clusters=15, SEED=42):
    """Add one-hot KMeans cluster-membership features to train and test.

    KMeans is fitted on the stacked train + test rows restricted to the
    notebook-level ``feature_cols`` (after NaNs are replaced by 0). Each row's
    cluster id is then expanded into ``clusters_c_<k>`` indicator columns.

    Args:
        train: Training DataFrame containing every column in ``feature_cols``.
        test: Test DataFrame containing every column in ``feature_cols``.
        n_clusters: Number of KMeans clusters.
        SEED: ``random_state`` passed to KMeans.

    Returns:
        ``(train, test)``: NaN-filled copies of the inputs, each with exactly
        ``n_clusters`` indicator columns appended.
    """
    features_c = feature_cols

    def create_cluster(train, test, features, kind='c', n_clusters=n_clusters):
        """Fit KMeans on ``features`` and append the one-hot cluster columns."""
        train = train.fillna(0)
        test = test.fillna(0)

        train_ = train[features].copy()
        test_ = test[features].copy()
        kmeans = KMeans(random_state = SEED, n_clusters = n_clusters)

        kmeans.fit(pd.concat((train_, test_), axis=0).reset_index(drop=True))

        cluster_col = f'clusters_{kind}'
        all_ids = list(range(n_clusters))
        # Predict on the same DataFrame layout used for fitting, and store the
        # ids as a categorical covering every cluster. get_dummies then emits
        # the same indicator columns for both frames, even when one frame has
        # no rows in some cluster.
        train[cluster_col] = pd.Categorical(kmeans.predict(train_), categories=all_ids)
        test[cluster_col] = pd.Categorical(kmeans.predict(test_), categories=all_ids)

        train = pd.get_dummies(train, columns=[cluster_col])
        test = pd.get_dummies(test, columns=[cluster_col])

        return train, test

    train, test = create_cluster(
        train, test, features_c, kind='c', n_clusters=n_clusters)
    return train, test

train_df, test_df= fe_cluster(train_df, test_df)

In [None]:
train_df[target_col]

In [None]:
final_feature_cols = train_df.columns.difference(cat_col+['ID', 'created_at','fold', target_col])

In [None]:
len(final_feature_cols)

In [None]:
# train_df[target_col] = np.log2(train_df[target_col])

In [None]:
# train_df['fold'].value_counts()

## Modelling

In [None]:
# train_df['day'] =  train_df['day'].astype(int)
# train_df['hour'] =  train_df['hour'].astype(int)
# train_df['month'] =  train_df['month'].astype(int)

In [None]:
# fold_group = train_df['month'].astype(str) + '_' + train_df['hour'].astype(str)

In [None]:
# fold_group

In [None]:
import keras
from keras.layers import *
from keras.optimizers import Adam
from keras.models import Model
from keras.initializers import glorot_normal
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from keras.regularizers import l2
import tensorflow as tf
import random
from sklearn.neural_network import MLPRegressor

In [None]:
def get_catboost():
    """Build an unfitted CatBoost regressor with the notebook's settings.

    Returns:
        A ``CatBoostRegressor`` configured with RMSE loss, learning rate 0.05,
        ``n_estimators=2000`` and ``random_state=RANDOM_STATE``.
    """
    catboost_settings = dict(
        loss_function='RMSE',
        learning_rate=0.05,
        random_state=RANDOM_STATE,
        n_estimators=2000,
    )
    return CatBoostRegressor(**catboost_settings)

# --- Linear / kernel base learners -------------------------------------------
# `normalize` was removed from scikit-learn's linear models in 1.2 and was
# documented as ignored when fit_intercept=False, so dropping it leaves the
# fitted model unchanged while letting the cell run on current releases.
lin_reg = LinearRegression(fit_intercept = False)
svr = SVR(C = 1,kernel='poly', degree = 5)

lasso = LassoCV(
  alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,0.3, 0.6, 1]
  ,max_iter=1000
  ,tol = 5
  ,random_state=RANDOM_STATE,
  fit_intercept = False
  ,cv= 5
  ,verbose=True,
  n_jobs = -1
)

# --- XGBoost settings (GPU histogram tree method) ------------------------------
params = {
            'gpu_id': 0,
            #'n_gpus': 2,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',

            'booster': 'gbtree',

            'n_estimators': 10000,
            'tree_method': 'gpu_hist',
            'grow_policy': 'lossguide',
            'max_depth': 8,


            'learning_rate': 0.01,
            'max_bin': 200,
            'max_leaves': 200,

            'reg_alpha': 10,
            'reg_lambda': 50,
            'subsample': 0.9 ,
          }

dt_meta = DecisionTreeRegressor(random_state = RANDOM_STATE)

# The tree ensembles below used to receive `verbose=RANDOM_STATE`, i.e. the
# seed (42) as a verbosity level. The argument is dropped so each library
# falls back to its default logging.
rf = RandomForestRegressor(random_state=RANDOM_STATE, n_estimators = 1000)

xgb_regressor = XGBRegressor(**params,
                             random_state= RANDOM_STATE)

# `num_leaves` is LightGBM's parameter name; the previous `n_leaves` keyword
# is not one of its documented names.
lgbm_regressor = LGBMRegressor(objective ='regression',
                               #importance_type='weight',
                               boosting_type='rf',bagging_fraction=0.8,bagging_freq = 1,
                               num_leaves =31, n_estimators= 3000, learning_rate =0.05,
                               random_state=RANDOM_STATE, metric='rmse')

cat_boost = get_catboost()

ada_boost = AdaBoostRegressor(dt_meta,random_state = RANDOM_STATE,n_estimators = 500)

# --- Stacked ensemble --------------------------------------------------------
# shuffle=True without a seed makes the CV folds differ between runs, so pin
# them with random_state.
forecaster = StackingCVRegressor(regressors=(lin_reg,lasso,svr,lgbm_regressor,
                                             xgb_regressor,cat_boost, ada_boost, rf),
                            meta_regressor= lin_reg,
                            shuffle = True,
                            cv = 10,
                            random_state = RANDOM_STATE,
                            use_features_in_secondary=True
                            )

In [None]:
from sklearn.feature_selection import VarianceThreshold

var_threshold = VarianceThreshold()


# Design matrices as plain NumPy arrays; the target stays a pandas Series.
y = train_df[target_col]
X, xtest = (
    frame[final_feature_cols].values for frame in (train_df, test_df)
)

# X = var_threshold.fit_transform(X)
# xtest = var_threshold.transform(xtest)

In [None]:
forecaster.fit(X, y)

In [None]:
preds = forecaster.predict(xtest)

In [None]:
test_df[target_col] = np.expm1(preds)
sub_df = test_df[['ID',target_col]]

sub_df.to_csv('airqo_stackingcv_10kfolds_scaled_target.csv', index=False)

In [None]:
len(final_feature_cols)

In [None]:
def build_simple_model(n_features=None):
    """Build and compile the dense regression network.

    The network is Dense(64) -> Dense(128) -> Dense(256), all ReLU, followed
    by Dropout(0.5) and a single linear output unit. It is compiled with MSE
    loss, Adam (learning rate 0.001) and an RMSE metric, and its summary is
    printed.

    Args:
        n_features: Width of the input layer. Defaults to
            ``len(final_feature_cols)`` so the input always matches the
            feature matrix instead of a hard-coded 74.

    Returns:
        The compiled ``keras.Model``.
    """
    if n_features is None:
        n_features = len(final_feature_cols)

    # shape must be a tuple; the previous `(74)` was just the integer 74.
    payments_inp = keras.Input(shape=(n_features,), name="payments_inputs")

    x = keras.layers.Dense(64, activation="relu", name="x2")(payments_inp)
    x_out = keras.layers.Dense(128, activation="relu", name="payment_out")(x)
    x_out = keras.layers.Dense(256, activation="relu", name="payment_out2")(x_out)

    x_out = keras.layers.Dropout(0.5)(x_out)

    out = keras.layers.Dense(1, name="out")(x_out)

    # Model
    model = keras.Model(inputs=[payments_inp], outputs=[out])

    opt = keras.optimizers.Adam(learning_rate=0.001)

    model.compile(
        loss="mse",
        optimizer=opt,
        metrics = [tf.keras.metrics.RootMeanSquaredError(name='rmse')]
    )

    model.summary()

    return model

In [None]:
# Re-seed so the neural-network run does not depend on how much randomness
# the earlier cells consumed.
seed_everything(RANDOM_STATE)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

model_preds = list()

# Cast once to float32 arrays: with boolean one-hot columns `.values` may come
# back object-typed, which Keras cannot consume.
nn_features = train_df[final_feature_cols].values.astype(np.float32)
nn_target = np.asarray(y, dtype=np.float32)
nn_test = np.asarray(xtest, dtype=np.float32)

# 10-fold CV: train one network per fold and collect its test predictions.
for train_index, test_index in KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE).split(nn_features):

    # KFold yields row positions, so index the arrays positionally rather
    # than by label with `.loc`.
    X_train, X_val = nn_features[train_index], nn_features[test_index]
    y_train, y_val = nn_target[train_index], nn_target[test_index]

    model = build_simple_model()

    model.fit(X_train,
            y_train,
            epochs = 200,
            batch_size = 64,
            validation_data = (X_val, y_val),
            # callbacks is documented as a list of callback instances.
            callbacks = [EarlyStopping(monitor='val_loss', patience=30,
                                       mode='min',restore_best_weights=True)]
            )

    prediction = model.predict(nn_test)

    model_preds.append(prediction)

In [None]:
nn_preds = np.mean(model_preds, axis=0)
# blend = 0.8*stacked_preds.values + 0.2*nn_preds

In [None]:
test_df[target_col] = np.expm1(nn_preds)
sub_df = test_df[['ID',target_col]]

sub_df.to_csv('airqo_nn_10kfolds_scaled_target.csv', index=False)

In [None]:
sub_df