# Layer.ai Air Quality Prediction Challenge


## Install Layer

In [1]:
pip install -U layer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting layer
  Downloading layer-0.10.2597586054-py3-none-any.whl (150 kB)
[K     |████████████████████████████████| 150 kB 5.0 MB/s 
[?25hCollecting yarl>=1.6.3
  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)
[K     |████████████████████████████████| 271 kB 58.2 MB/s 
[?25hCollecting polling>=0.3.1
  Downloading polling-0.3.2.tar.gz (5.2 kB)
Collecting humanize>=3.11.0
  Downloading humanize-4.2.3-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 38.9 MB/s 
Collecting layer-api==0.9.368780
  Downloading layer_api-0.9.368780-py3-none-any.whl (312 kB)
[K     |████████████████████████████████| 312 kB 53.7 MB/s 
[?25hCollecting nvsmi<0.5.0,>=0.4.2
  Downloading nvsmi-0.4.2-py3-none-any.whl (5.5 kB)
Collecting pyjwt<3.0.0,>=2.0.0
  Downloading PyJWT-2.4.0-py3-none-any.whl (18

In [2]:
import layer

In [3]:
from layer.decorators import dataset,model, pip_requirements

In [4]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from fastai.tabular import *
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold, StratifiedKFold
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import datetime
from tqdm.notebook import tqdm as tqdm_notebook

In [5]:
from functools import partial
import re
import calendar
import matplotlib.pyplot as plt
def add_cyclic_datepart(df, field_name:str, prefix:str=None, drop:bool=True, time:bool=False, add_linear:bool=False):
    "Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`."
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    series = field.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
    columns = [prefix + c for c in cyclic_dt_feat_names(time, add_linear)]
    df_feats = pd.DataFrame([item for item in series], columns=columns, index=series.index)
    for column in columns: df[column] = df_feats[column]
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df
def make_date(df, date_field:str):
    "Make sure `df[date_field]` is of the right date type."
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
def ifnone(a,b):
    "`a` if `a` is not None, otherwise `b`."
    return b if a is None else a
def cyclic_dt_features(d, time:bool=True, add_linear:bool=False):
    "Calculate the cos and sin of date/time cycles."
    tt,fs = d.timetuple(), [np.cos, np.sin]
    day_year,days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
    days_year = 366 if calendar.isleap(d.year) else 365
    rs = d.weekday()/7, (d.day-1)/days_month, (d.month-1)/12, (day_year-1)/days_year
    feats = [f(r * 2 * np.pi) for r in rs for f in fs]
    """if time and isinstance(d, datetime) and type(d) != date:
        rs = tt.tm_hour/24, tt.tm_hour%12/12, tt.tm_min/60, tt.tm_sec/60
        feats += [f(r * 2 * np.pi) for r in rs for f in fs]"""
    """if add_linear:
        if type(d) == date: feats.append(d.year + rs[-1])
        else:
            secs_in_year = (datetime(d.year+1, 1, 1) - datetime(d.year, 1, 1)).total_seconds()
            feats.append(d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year))"""
    return feats
def cyclic_dt_feat_names(time:bool=True, add_linear:bool=False):
    "Return feature names of date/time cycles as produced by `cyclic_dt_features`."
    fs = ['cos','sin']
    attr = [f'{r}_{f}' for r in 'weekday day_month month_year day_year'.split() for f in fs]
    if time: attr += [f'{r}_{f}' for r in 'hour clock min sec'.split() for f in fs]
    if add_linear: attr.append('year_lin')
    return attr

In [31]:
# this function will help to reduce momory 
# data will be smaller with the same value

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        
        col_type = df[col].dtype
        if col_type == "category" : pass
        elif col_type != object :
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            elif str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
            else: pass
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

## Log in to your Layer account

In [16]:
layer.login()

## Create a Layer project

In [17]:
layer.init("sample-air-quality")

Your Layer project is here: https://app.layer.ai/siwar/sample-air-quality

In [18]:
#sample_submission = layer.get_dataset("zindi/air-quality/datasets/sample_submission").to_pandas()
test_df = layer.get_dataset("zindi/air-quality/datasets/test").to_pandas()
train_df = layer.get_dataset("zindi/air-quality/datasets/train").to_pandas()

Output()

Output()

In [19]:
train_df['date'] = pd.to_datetime(train_df['date'], format='%Y-%m-%d')
test_df['date'] = pd.to_datetime(test_df['date'], format='%Y-%m-%d')

ID_COL, TARGET_COL = 'ID', 'pm2_5'

In [20]:
features = [c for c in train_df.columns if c not in ['date', 'ID', 'device', 'pm2_5']]
simple_feats = [c for c in features if ('angle' not in c) & ('height' not in c) & ('altittude' not in c)]
len(simple_feats)

36

In [21]:
nan_cols = [c for c in train_df.columns if c not in ["ID","pm2_5", "device"]]
nan_cols = train_df[nan_cols].columns[train_df[nan_cols].isnull().any()].tolist()

for col in nan_cols:
    
    while train_df[col].isnull().sum()>0:
     
        train_df[col].fillna(train_df[["device", col]].groupby(["device"]).shift(periods=0).fillna(method='ffill', limit=1).fillna(method='bfill', limit=1)[col], inplace=True) 

    while test_df[col].isnull().sum()>0:
     
        test_df[col].fillna(test_df[["device", col]].groupby(["device"]).shift(periods=0).fillna(method='ffill', limit=1).fillna(method='bfill', limit=1)[col], inplace=True) 

test_df.isnull().sum()[test_df.isnull().sum()>0]

Series([], dtype: int64)

In [29]:
train_df['placeID_freq'] = train_df['device'].map(train_df['device'].value_counts())
"""for i in range(1, 20):
    train_df[f'prev_target_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()
    train_df[f'next_target_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()

for i in tqdm_notebook(range(1, 15)):
    train_df[f'magic_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(i).expanding().mean().fillna(method='ffill').fillna(method='bfill').sort_index()
    train_df[f'magic2_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(-i).expanding().mean().fillna(method='bfill').fillna(method='ffill').sort_index()
    """
for i in tqdm_notebook(range(1, 45)):
    c = 'date' + str(i+1)
    train_df[c] = train_df['date']  + datetime.timedelta(days=i)
    _ = add_cyclic_datepart(train_df, c, prefix = c)
    
for i in tqdm_notebook(range(1, 11)):
    tmp = train_df.sort_values(by='date').groupby('device')[simple_feats].shift(i).sort_index()
    tmp_diff_prev = train_df[simple_feats] - tmp
    tmp.columns = [c + f'_prev_{i}' for c in tmp.columns]
    tmp_diff_prev.columns = [c + f'_prev_diff_{i}' for c in tmp_diff_prev.columns]
    train_df = pd.concat([train_df, tmp, tmp_diff_prev], axis=1)
    
    tmp = train_df.sort_values(by='date').groupby('device')[simple_feats].shift(-i).sort_index()
    tmp_diff_next = train_df[simple_feats] - tmp
    tmp.columns = [c + f'_next_{i}' for c in tmp.columns]
    tmp_diff_next.columns = [c + f'_next_diff_{i}' for c in tmp_diff_next.columns]
    train_df = pd.concat([train_df, tmp, tmp_diff_next], axis=1)
    
for attr in ['day', 'month', 'week', 'dayofweek', 'weekofyear', 'days_in_month', 'is_month_start', 'is_month_end', 'dayofyear']:
    train_df[attr] = getattr(train_df['date'].dt, attr)
train_df['is_weekend'] = (train_df['dayofweek'] >= 5)*1
train_df['fortnight'] = train_df['day']%15
train_df['which_fortnight'] = train_df['day']//15
#add_cyclic_datepart(train_df, "date", prefix = "Current_Date_")

"for i in range(1, 20):\n    train_df[f'prev_target_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()\n    train_df[f'next_target_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()\n\nfor i in tqdm_notebook(range(1, 15)):\n    train_df[f'magic_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(i).expanding().mean().fillna(method='ffill').fillna(method='bfill').sort_index()\n    train_df[f'magic2_{i}'] = train_df.sort_values(by='date')[TARGET_COL].shift(-i).expanding().mean().fillna(method='bfill').fillna(method='ffill').sort_index()\n    "

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



In [27]:
test_df['placeID_freq'] = test_df['device'].map(test_df['device'].value_counts())
"""for i in range(1, 20):
    test_df[f'prev_target_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()
    test_df[f'next_target_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()"""

"""for i in tqdm_notebook(range(1, 15)):
    test_df[f'magic_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(i).expanding().mean().fillna(method='ffill').fillna(method='bfill').sort_index()
    test_df[f'magic2_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(-i).expanding().mean().fillna(method='bfill').fillna(method='ffill').sort_index()
    """
for i in tqdm_notebook(range(1, 45)):
    c = 'date' + str(i+1)
    test_df[c] = test_df['date']  + datetime.timedelta(days=i)
    _ = add_cyclic_datepart(test_df, c, prefix = c)
    
for i in tqdm_notebook(range(1, 11)):
    tmp = test_df.sort_values(by='date').groupby('device')[simple_feats].shift(i).sort_index()
    tmp_diff_prev = test_df[simple_feats] - tmp
    tmp.columns = [c + f'_prev_{i}' for c in tmp.columns]
    tmp_diff_prev.columns = [c + f'_prev_diff_{i}' for c in tmp_diff_prev.columns]
    test_df = pd.concat([test_df, tmp, tmp_diff_prev], axis=1)
    
    tmp = test_df.sort_values(by='date').groupby('device')[simple_feats].shift(-i).sort_index()
    tmp_diff_next = test_df[simple_feats] - tmp
    tmp.columns = [c + f'_next_{i}' for c in tmp.columns]
    tmp_diff_next.columns = [c + f'_next_diff_{i}' for c in tmp_diff_next.columns]
    test_df = pd.concat([test_df, tmp, tmp_diff_next], axis=1)
    
for attr in ['day', 'month', 'week', 'dayofweek', 'weekofyear', 'days_in_month', 'is_month_start', 'is_month_end', 'dayofyear']:
    test_df[attr] = getattr(test_df['date'].dt, attr)
test_df['is_weekend'] = (test_df['dayofweek'] >= 5)*1
test_df['fortnight'] = test_df['day']%15
test_df['which_fortnight'] = test_df['day']//15
#add_cyclic_datepart(test_df, "date", prefix = "Current_Date_")

"for i in range(1, 20):\n    test_df[f'prev_target_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()\n    test_df[f'next_target_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()"

"for i in tqdm_notebook(range(1, 15)):\n    test_df[f'magic_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(i).expanding().mean().fillna(method='ffill').fillna(method='bfill').sort_index()\n    test_df[f'magic2_{i}'] = test_df.sort_values(by='date')[TARGET_COL].shift(-i).expanding().mean().fillna(method='bfill').fillna(method='ffill').sort_index()\n    "

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



In [32]:
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 60.35 MB
Memory usage after optimization is: 38.52 MB
Decreased by 36.2%
Memory usage of dataframe is 25.88 MB
Memory usage after optimization is: 16.25 MB
Decreased by 37.2%


In [33]:
nan_cols = ['date', 'device','ID', 'pm2_5', 'month_year_cos','month_year_sin','day_year_cos','day_year_sin']

features = [c for c in train_df.columns if c not in nan_cols]

target = train_df[TARGET_COL]

len(features)

1872

In [34]:
import gc
gc.collect()

282

## Fetch the datasets as Pandas DataFrames

## Model training with Layer

In [35]:
@model("air_model3")
@pip_requirements(packages=["lightgbm","seaborn"])
def train():
    from sklearn.model_selection import train_test_split
    import lightgbm as lgb
    from sklearn import metrics
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    import matplotlib.pyplot as plt
    import seaborn as sns

    test_size = 0.30
    random_state = 0
    layer.log({"test_size":test_size})
    layer.log({"random_state":random_state})
    X = train_df.drop(["pm2_5", "ID","date"], axis=1)
    labelencoder = LabelEncoder()
    X = X.assign(device = labelencoder.fit_transform(X["device"]))

    y = train_df["pm2_5"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    params = {'boosting_type': 'gbdt',
              'objective': 'regression',
              'num_leaves': 40,
              'learning_rate': 0.1,
              'feature_fraction': 0.9
              }
    layer.log(params)
    categories = X.select_dtypes(include=['object']).columns.tolist() 
    feature_name = list(X.columns)
    evals={}
    gbm = lgb.train(
                params,
                lgb_train,
                num_boost_round=200,
                callbacks = [lgb.record_evaluation(evals)],
                valid_sets=[lgb_train, lgb_eval],
                valid_names=['train','valid']
    )
    predictions = gbm.predict(X_test)
    predctions_df = pd.DataFrame(predictions,columns=['Predicted pm2_5'])
    layer.log({"Sample predictions":predctions_df.head(100)})
    layer.log({'Mean Absolute Error':metrics.mean_absolute_error(y_test, predictions)})
    layer.log({'Mean Squarred Error': metrics.mean_squared_error(y_test, predictions)})
    layer.log({'Root Mean Squared Error': np.sqrt(metrics.mean_squared_error(y_test, predictions))})

    importance = gbm.feature_importance()
    importances_rfc_df = pd.DataFrame(importance, index=X.columns, columns=['Importance'])
    importances_rfc_df = importances_rfc_df.sort_values(by='Importance', ascending=False)
    importances_rfc_df = importances_rfc_df[importances_rfc_df['Importance'] > 0]
    importances_rfc_df = importances_rfc_df.head(10)

    plt.figure(figsize=(8,8))
    plt.xticks(rotation=60, fontsize = 20)
    sns.barplot(y=importances_rfc_df.index, x=importances_rfc_df['Importance'])
    layer.log({"Feature importance": plt.gcf()})
    layer.log({"Training metrics": lgb.plot_metric(evals)})

    return gbm

In [36]:
layer.run([train])

Output()

Run(id=value: "54812aa2-977d-4c1d-bf16-69d2dc7255b2"
, project_full_name=ProjectFullName(account_name='siwar', project_name='sample-air-quality'))

## Use the model to make predictions 

In [41]:
my_model = layer.get_model("siwar/sample-air-quality/models/air_model3:1.2").get_train()

Output()

In [42]:
from sklearn.preprocessing import LabelEncoder
X = test_df.drop(["ID","date"], axis=1)
labelencoder = LabelEncoder()
X = X.assign(device = labelencoder.fit_transform(X["device"]))

In [43]:
predictions = my_model.predict(X)

## Generate submission file

In [44]:
import pandas as pd
submission = pd.DataFrame(data=test_df["ID"], columns = ["ID"])
submission["pm2_5"] = predictions

In [45]:
submission.head()

Unnamed: 0,ID,pm2_5
0,ID_UOH62J0XHX,36.111023
1,ID_4OPWSB0UHJ,102.462394
2,ID_3SLMNNG1Z3,55.523602
3,ID_MFUHTXYPM4,73.747978
4,ID_TY1DAND8ZP,79.541626


In [46]:
submission.to_csv("submission.csv",index=False)

In [47]:
param = {'num_leaves': 100,
         'min_data_in_leaf': 50, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.075,
         "boosting": "gbdt",
         "feature_fraction": 0.35,
         "metric": 'auc',
         "lambda_l1": 1,
         "lambda_l2": 2,
         "random_state": 6,
         "verbosity": -1,
          'metric' : 'rmse',
          'num_iterations': 2600}

In [49]:
target_cols = ['pm2_5']
oofs_df = pd.DataFrame()
preds_df = pd.DataFrame()
for t_col in target_cols:
    oofs_df[t_col] = np.zeros(len(train_df))
    preds_df[t_col] = np.zeros(len(test_df))

In [51]:
import lightgbm as lgb

In [61]:
max_iter = 15
folds = StratifiedKFold(n_splits=max_iter, random_state=None)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, pd.qcut(target, 10, labels=False, duplicates='drop'))):
    print("\nfold n°{}".format(fold_))
    X_trn, X_val, X_test = train_df.iloc[trn_idx][features], train_df.iloc[val_idx][features], test_df[features]
    for t_col in target_cols:
        target = train_df[t_col]
        print(f"\n\n**** {t_col} ****\n")
        y_trn, y_val = target.iloc[trn_idx], target.iloc[val_idx]
        trn_data = lgb.Dataset(X_trn, y_trn)
        val_data = lgb.Dataset(X_val, y_val)
        
        clf = lgb.train(param, trn_data, valid_sets = [trn_data, val_data], verbose_eval=50, early_stopping_rounds = 200)

        oofs_df[t_col][val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
        current_test_pred = clf.predict(X_test, num_iteration=clf.best_iteration)
        current_test_pred[current_test_pred < 0] = 0
        preds_df[t_col] += current_test_pred / folds.n_splits


fold n°0


**** pm2_5 ****





Training until validation scores don't improve for 200 rounds.
[50]	training's rmse: 9.66536	valid_1's rmse: 11.1783
[100]	training's rmse: 6.68061	valid_1's rmse: 10.7421
[150]	training's rmse: 5.21562	valid_1's rmse: 10.7345
[200]	training's rmse: 4.3781	valid_1's rmse: 10.7715
[250]	training's rmse: 3.85107	valid_1's rmse: 10.7958
[300]	training's rmse: 3.48013	valid_1's rmse: 10.8244
Early stopping, best iteration is:
[105]	training's rmse: 6.48764	valid_1's rmse: 10.7145

fold n°1


**** pm2_5 ****

Training until validation scores don't improve for 200 rounds.
[50]	training's rmse: 9.48029	valid_1's rmse: 13.1815
[100]	training's rmse: 6.54153	valid_1's rmse: 12.7519
[150]	training's rmse: 5.11889	valid_1's rmse: 12.7341
[200]	training's rmse: 4.30175	valid_1's rmse: 12.7487
[250]	training's rmse: 3.79195	valid_1's rmse: 12.7522
[300]	training's rmse: 3.43557	valid_1's rmse: 12.7445
Early stopping, best iteration is:
[118]	training's rmse: 5.93784	valid_1's rmse: 12.7249

fold n°

In [62]:
predictions_test = preds_df['pm2_5']
predictions_test[predictions_test < 0] = 0

In [63]:
SUB_FILE_NAME = 'preds_lgbm_v1.csv'
sub_df = pd.DataFrame()
sub_df[ID_COL] = test_df[ID_COL]
sub_df[TARGET_COL] = predictions_test
sub_df.to_csv(SUB_FILE_NAME, index=False)
sub_df.head(10)

Unnamed: 0,ID,pm2_5
0,ID_UOH62J0XHX,127.801553
1,ID_4OPWSB0UHJ,279.503297
2,ID_3SLMNNG1Z3,162.762354
3,ID_MFUHTXYPM4,208.278616
4,ID_TY1DAND8ZP,200.487524
5,ID_1G270NSQ7O,230.053438
6,ID_OLV7RAEKGP,198.088408
7,ID_2JQD8OKWXK,125.418916
8,ID_3PC4T4W9JW,150.936099
9,ID_TYK7E1D8DN,175.817257


In [64]:
sub_df.describe()

Unnamed: 0,pm2_5
count,4254.0
mean,162.920564
std,37.765807
min,84.143463
25%,132.174767
50%,160.945968
75%,188.564449
max,299.628786
