In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from fastai.tabular import *
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold, StratifiedKFold
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import datetime
from tqdm.notebook import tqdm as tqdm_notebook

In [2]:
from functools import partial
import re
import calendar
import matplotlib.pyplot as plt

def add_cyclic_datepart(df, field_name:str, prefix:str=None, drop:bool=True, time:bool=False, add_linear:bool=False):
    "Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`."
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    series = field.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
    columns = [prefix + c for c in cyclic_dt_feat_names(time, add_linear)]
    df_feats = pd.DataFrame([item for item in series], columns=columns, index=series.index)
    for column in columns: df[column] = df_feats[column]
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df
def make_date(df, date_field:str):
    "Make sure `df[date_field]` is of the right date type."
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
def ifnone(a,b):
    "`a` if `a` is not None, otherwise `b`."
    return b if a is None else a
def cyclic_dt_features(d, time:bool=True, add_linear:bool=False):
    "Calculate the cos and sin of date/time cycles."
    tt,fs = d.timetuple(), [np.cos, np.sin]
    day_year,days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
    days_year = 366 if calendar.isleap(d.year) else 365
    rs = d.weekday()/7, (d.day-1)/days_month, (d.month-1)/12, (day_year-1)/days_year
    feats = [f(r * 2 * np.pi) for r in rs for f in fs]
    """if time and isinstance(d, datetime) and type(d) != date:
        rs = tt.tm_hour/24, tt.tm_hour%12/12, tt.tm_min/60, tt.tm_sec/60
        feats += [f(r * 2 * np.pi) for r in rs for f in fs]"""
    """if add_linear:
        if type(d) == date: feats.append(d.year + rs[-1])
        else:
            secs_in_year = (datetime(d.year+1, 1, 1) - datetime(d.year, 1, 1)).total_seconds()
            feats.append(d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year))"""
    return feats
def cyclic_dt_feat_names(time:bool=True, add_linear:bool=False):
    "Return feature names of date/time cycles as produced by `cyclic_dt_features`."
    fs = ['cos','sin']
    attr = [f'{r}_{f}' for r in 'weekday day_month month_year day_year'.split() for f in fs]
    if time: attr += [f'{r}_{f}' for r in 'hour clock min sec'.split() for f in fs]
    if add_linear: attr.append('year_lin')
    return attr

In [3]:
# this function will help to reduce momory 
# data will be smaller with the same value

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

In [5]:
train = pd.read_csv('../input/urban-air-pollution/Train.csv')
test = pd.read_csv('../input/urban-air-pollution/Test.csv')

train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')

ID_COL, TARGET_COL = 'Place_ID X Date', 'target'

In [6]:
df = pd.concat([train, test]).reset_index(drop=True)
features = [c for c in df.columns if c not in ['Date', 'target_count', 'target_min', 'Place_ID X Date', 'target_variance', 'Place_ID', 'target_max', 'target']]
simple_feats = [c for c in features if ('angle' not in c) & ('height' not in c) & ('altittude' not in c)]
len(simple_feats)

38

In [7]:
nan_cols = [c for c in df.columns if c not in ["target_max","target","target_min","target_variance", "target_count"]]
nan_cols = df[nan_cols].columns[df[nan_cols].isnull().any()].tolist()

for col in nan_cols:
    
    while df[col].isnull().sum()>0:
     
        df[col].fillna(df[["Place_ID", col]].groupby(["Place_ID"]).shift(periods=0).fillna(method='ffill', limit=1).fillna(method='bfill', limit=1)[col], inplace=True) 

df.isnull().sum()[df.isnull().sum()>0]

target             16136
target_min         16136
target_max         16136
target_variance    16136
target_count       16136
dtype: int64

In [8]:
df['placeID_freq'] = df['Place_ID'].map(df['Place_ID'].value_counts())
"""
for i in range(1, 20):
    df[f'prev_target_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()
    df[f'next_target_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()
for i in tqdm_notebook(range(1, 25)):
    df[f'magic_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(i).expanding().mean().fillna(method='ffill').fillna(method='bfill').sort_index()
    df[f'magic2_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(-i).expanding().mean().fillna(method='bfill').fillna(method='ffill').sort_index()
    """
for i in tqdm_notebook(range(1, 45)):
    c = 'Date' + str(i+1)
    df[c] = df['Date']  + datetime.timedelta(days=i)
    _ = add_cyclic_datepart(df, c, prefix = c)
    
for i in tqdm_notebook(range(1, 22)):
    tmp = df.sort_values(by='Date').groupby('Place_ID')[simple_feats].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()
    tmp_diff_prev = df[simple_feats] - tmp
    tmp.columns = [c + f'_prev_{i}' for c in tmp.columns]
    tmp_diff_prev.columns = [c + f'_prev_diff_{i}' for c in tmp_diff_prev.columns]
    df = pd.concat([df, tmp, tmp_diff_prev], axis=1)
    
    tmp = df.sort_values(by='Date').groupby('Place_ID')[simple_feats].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()
    tmp_diff_next = df[simple_feats] - tmp
    tmp.columns = [c + f'_next_{i}' for c in tmp.columns]
    tmp_diff_next.columns = [c + f'_next_diff_{i}' for c in tmp_diff_next.columns]
    df = pd.concat([df, tmp, tmp_diff_next], axis=1)
    
for attr in ['day', 'month', 'week', 'dayofweek', 'weekofyear', 'days_in_month', 'is_month_start', 'is_month_end', 'dayofyear']:
    df[attr] = getattr(df['Date'].dt, attr)
df['is_weekend'] = (df['dayofweek'] >= 5)*1
df['fortnight'] = df['day']%15
df['which_fortnight'] = df['day']//15

add_cyclic_datepart(df, "Date", prefix = "Current_Date_")

"\nfor i in range(1, 20):\n    df[f'prev_target_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(i).fillna(method='ffill').fillna(method='bfill').sort_index()\n    df[f'next_target_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(-i).fillna(method='bfill').fillna(method='ffill').sort_index()\nfor i in tqdm_notebook(range(1, 25)):\n    df[f'magic_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(i).expanding().mean().fillna(method='ffill').fillna(method='bfill').sort_index()\n    df[f'magic2_{i}'] = df.sort_values(by='Date')[TARGET_COL].shift(-i).expanding().mean().fillna(method='bfill').fillna(method='ffill').sort_index()\n    "

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]



Unnamed: 0,Place_ID X Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,...,fortnight,which_fortnight,Current_Date_weekday_cos,Current_Date_weekday_sin,Current_Date_day_month_cos,Current_Date_day_month_sin,Current_Date_month_year_cos,Current_Date_month_year_sin,Current_Date_day_year_cos,Current_Date_day_year_sin
0,010Q650 X 2020-01-02,010Q650,38.0,23.0,53.0,769.50,92.0,11.000000,60.200001,0.00804,...,2,0,-0.900969,0.433884,0.979530,0.201299,1.000000e+00,0.000000,0.999853,0.017166
1,010Q650 X 2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91.0,14.600000,48.799999,0.00839,...,3,0,-0.900969,-0.433884,0.918958,0.394356,1.000000e+00,0.000000,0.999411,0.034328
2,010Q650 X 2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96.0,16.400000,33.400002,0.00750,...,4,0,-0.222521,-0.974928,0.820763,0.571268,1.000000e+00,0.000000,0.998674,0.051479
3,010Q650 X 2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96.0,6.911948,21.300001,0.00391,...,5,0,0.623490,-0.781831,0.688967,0.724793,1.000000e+00,0.000000,0.997643,0.068615
4,010Q650 X 2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95.0,13.900001,44.700001,0.00535,...,6,0,1.000000,0.000000,0.528964,0.848644,1.000000e+00,0.000000,0.996318,0.085731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46688,ZZDJZMS X 2020-03-31,ZZDJZMS,,,,,,9.400001,86.200005,0.00413,...,1,2,0.623490,0.781831,0.979530,-0.201299,5.000000e-01,0.866025,0.025748,0.999668
46689,ZZDJZMS X 2020-04-01,ZZDJZMS,,,,,,17.000000,90.300003,0.00548,...,1,0,-0.222521,0.974928,1.000000,0.000000,6.123234e-17,1.000000,0.008583,0.999963
46690,ZZDJZMS X 2020-04-02,ZZDJZMS,,,,,,19.170641,86.200005,0.00656,...,2,0,-0.900969,0.433884,0.978148,0.207912,6.123234e-17,1.000000,-0.008583,0.999963
46691,ZZDJZMS X 2020-04-03,ZZDJZMS,,,,,,8.400001,66.400002,0.00457,...,3,0,-0.900969,-0.433884,0.913545,0.406737,6.123234e-17,1.000000,-0.025748,0.999668


In [9]:
df.isnull().sum()[df.isnull().sum()>0]

target             16136
target_min         16136
target_max         16136
target_variance    16136
target_count       16136
dtype: int64

In [10]:
nan_cols = ['Date', 'target_count', 'target_min', 'Place_ID X Date', 'target_variance', 'Place_ID',
                                               'target_max', 'target',  'month_year_cos','month_year_sin','day_year_cos','day_year_sin']

features = [c for c in df.columns if c not in nan_cols]
train = df[:train.shape[0]].reset_index(drop=True)
test = df[train.shape[0]:].reset_index(drop=True)
target = train[TARGET_COL]

len(features)

3639

In [11]:
train = reduce_mem_usage(train)
print('************************* \n')
test = reduce_mem_usage(test)

Memory usage of dataframe is 849.59 MB
Memory usage after optimization is: 235.81 MB
Decreased by 72.2%
************************* 

Memory usage of dataframe is 448.64 MB
Memory usage after optimization is: 124.93 MB
Decreased by 72.2%


In [12]:
del df
import gc
gc.collect()

23

In [13]:
param = {'num_leaves': 100,
         'min_data_in_leaf': 40, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.05,
         "boosting": "gbdt",
         "feature_fraction": 0.35,
         "metric": 'auc',
         "lambda_l1": 1,
         "lambda_l2": 1,
         "random_state": 6,
         "verbosity": -1,
         'metric' : 'rmse',
         'num_iterations': 2500}

In [21]:
target_cols = ['target']
oofs_df = pd.DataFrame()
preds_df = pd.DataFrame()
for t_col in target_cols:
    oofs_df[t_col] = np.zeros(len(train))
    preds_df[t_col] = np.zeros(len(test))

In [None]:
max_iter = 5
folds = StratifiedKFold(n_splits=max_iter, random_state=None)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, pd.qcut(target, 10, labels=False, duplicates='drop'))):
    print("\nfold n°{}".format(fold_))
    X_trn, X_val, X_test = train.iloc[trn_idx][features], train.iloc[val_idx][features], test[features]
    for t_col in target_cols:
        target = train[t_col]
        print(f"\n\n**** {t_col} ****\n")
        y_trn, y_val = target.iloc[trn_idx], target.iloc[val_idx]
        trn_data = lgb.Dataset(X_trn, y_trn)
        val_data = lgb.Dataset(X_val, y_val)
        
        clf = lgb.train(param, trn_data, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 150)

        oofs_df[t_col][val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
        current_test_pred = clf.predict(X_test, num_iteration=clf.best_iteration)
        current_test_pred[current_test_pred < 0] = 0
        preds_df[t_col] += current_test_pred / folds.n_splits
        break
    gc.collect()
    #break


fold n°0


**** target ****





Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 15.7947	valid_1's rmse: 32.2692
[200]	training's rmse: 10.1046	valid_1's rmse: 31.6905
[300]	training's rmse: 7.16896	valid_1's rmse: 31.462
[400]	training's rmse: 5.36385	valid_1's rmse: 31.3613
[500]	training's rmse: 4.14722	valid_1's rmse: 31.312
[600]	training's rmse: 3.29565	valid_1's rmse: 31.3066
[700]	training's rmse: 2.67335	valid_1's rmse: 31.2955
[800]	training's rmse: 2.21819	valid_1's rmse: 31.282
[900]	training's rmse: 1.87906	valid_1's rmse: 31.2757
[1000]	training's rmse: 1.62288	valid_1's rmse: 31.2788
Early stopping, best iteration is:
[917]	training's rmse: 1.83008	valid_1's rmse: 31.274


517


fold n°1


**** target ****

Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 16.7335	valid_1's rmse: 29.7155
[200]	training's rmse: 11.049	valid_1's rmse: 28.621
[300]	training's rmse: 7.79239	valid_1's rmse: 28.3439
[400]	training's rmse: 5.76316	valid_1's rmse: 28.2675
[500]	training's rmse: 4.35611	valid_1's rmse: 28.2363
[600]	training's rmse: 3.33095	valid_1's rmse: 28.2288
[700]	training's rmse: 2.56849	valid_1's rmse: 28.2318
Early stopping, best iteration is:
[635]	training's rmse: 3.04078	valid_1's rmse: 28.2152


132


fold n°2


**** target ****

Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 17.0959	valid_1's rmse: 28.209
[200]	training's rmse: 11.3945	valid_1's rmse: 27.2722
[300]	training's rmse: 8.16167	valid_1's rmse: 27.013
[400]	training's rmse: 6.10314	valid_1's rmse: 26.898
[500]	training's rmse: 4.65796	valid_1's rmse: 26.8482
[600]	training's rmse: 3.61795	valid_1's rmse: 26.8043
[700]	training's rmse: 2.8417	valid_1's rmse: 26.782
[800]	training's rmse: 2.25581	valid_1's rmse: 26.765
[900]	training's rmse: 1.81718	valid_1's rmse: 26.7464
[1000]	training's rmse: 1.47659	valid_1's rmse: 26.7364
[1100]	training's rmse: 1.21802	valid_1's rmse: 26.7271
[1200]	training's rmse: 1.01609	valid_1's rmse: 26.7222
[1300]	training's rmse: 0.857039	valid_1's rmse: 26.7193
[1400]	training's rmse: 0.731572	valid_1's rmse: 26.7175
[1500]	training's rmse: 0.633571	valid_1's rmse: 26.7161
[1600]	training's rmse: 0.555216	valid_1's rmse: 26.7136
[1700]	training's rmse:

46


fold n°3


**** target ****

Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 16.9936	valid_1's rmse: 29.68
[200]	training's rmse: 11.2674	valid_1's rmse: 28.6697
[300]	training's rmse: 8.00806	valid_1's rmse: 28.3356
[400]	training's rmse: 5.96138	valid_1's rmse: 28.2218
[500]	training's rmse: 4.54337	valid_1's rmse: 28.1448
[600]	training's rmse: 3.51923	valid_1's rmse: 28.1057
[700]	training's rmse: 2.7655	valid_1's rmse: 28.0912
[800]	training's rmse: 2.19461	valid_1's rmse: 28.0757


In [None]:
_ = plt.figure(figsize=(10, 10))
fi = pd.Series(index=features, data=clf.feature_importance())
_ = fi.sort_values()[-20:].plot(kind='barh')

In [None]:
rmse(target.values, oofs_df['target'].values)

In [None]:
predictions_test = current_test_pred
predictions_test[predictions_test < 0] = 0

In [None]:
SUB_FILE_NAME = 'preds_lgbm_v4.csv'
sub_df = pd.DataFrame()
sub_df[ID_COL] = test[ID_COL]
sub_df[TARGET_COL] = predictions_test
sub_df.to_csv(SUB_FILE_NAME, index=False)
sub_df.head(10)

In [None]:
sub_df[TARGET_COL].describe()

<a href='preds_lgbm_v4.csv'> submission </a>