In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables in one go; map() applies
# pd.read_csv to each path in order, so the unpacking matches the tuple order.
train_df, test_df = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-mar-2022/train.csv',
        '../input/tabular-playground-series-mar-2022/test.csv',
    ),
)

In [None]:
def feature_engineering(data):
    """Derive calendar and road-identity features from a raw traffic frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``row_id``, ``time``, ``x``, ``y`` and
        ``direction``. The input frame is left untouched; a copy is transformed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``time`` parsed to datetime, the derived columns
        ``month``, ``weekday``, ``hour``, ``minute``, ``is_month_start``,
        ``is_month_end``, ``is_weekend``, ``is_afternoon``, ``road`` and
        ``moment`` appended (in that order), and ``row_id`` / ``direction``
        removed.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    data = data.copy()

    data['time'] = pd.to_datetime(data['time'])
    stamp = data['time'].dt

    data['month'] = stamp.month
    data['weekday'] = stamp.weekday
    data['hour'] = stamp.hour
    data['minute'] = stamp.minute
    data['is_month_start'] = stamp.is_month_start.astype('int')
    data['is_month_end'] = stamp.is_month_end.astype('int')
    # dayofweek is Monday=0 ... Sunday=6, so the weekend is 5 (Sat) and 6 (Sun).
    data['is_weekend'] = (stamp.dayofweek >= 5).astype('int')
    # Strictly after the 12 o'clock hour: 12:00-12:59 is NOT flagged.
    data['is_afternoon'] = (stamp.hour > 12).astype('int')
    # A road is one (x, y, direction) combination, encoded as a single string key.
    data['road'] = data['x'].astype(str) + data['y'].astype(str) + data['direction']

    # Index of the 20-minute slot within the day (three slots per hour).
    data['moment'] = stamp.hour * 3 + stamp.minute // 20

    data = data.drop(['row_id', 'direction'], axis=1)

    return data

In [None]:
# Run the same feature pipeline over both splits so their columns stay aligned.
train_df, test_df = (feature_engineering(frame) for frame in (train_df, test_df))

In [None]:
# Keys identifying one recurring weekly time slot on one road.
GROUP_KEYS = ['road', 'weekday', 'hour', 'minute']
# Per-slot congestion statistics added as features, in column order.
STAT_COLS = ['min', 'max', 'median']

# One pass over the training targets yields all three statistics at once.
# NOTE(review): the statistics are computed over every row of train_df,
# including rows later used as the validation window, so the validation
# score sees information derived from its own targets — confirm this is intended.
congestion_stats = (
    train_df.groupby(GROUP_KEYS)['congestion']
    .agg(STAT_COLS)
    .astype(int)  # medians ending in .5 are truncated toward zero
    .reset_index()
)

# Dropping any stale statistic columns first keeps this cell safe to re-run
# (otherwise a second merge would create min_x / min_y style duplicates).
train_df = (
    train_df.drop(columns=STAT_COLS, errors='ignore')
    .merge(congestion_stats, on=GROUP_KEYS, how='left')
)
test_df = (
    test_df.drop(columns=STAT_COLS, errors='ignore')
    .merge(congestion_stats, on=GROUP_KEYS, how='left')
)

In [None]:
# Show up to 20 columns in DataFrame previews so the engineered features
# are not elided.
pd.set_option('display.max_columns', 20)

In [None]:
# First five training rows, including the engineered and merged columns.
train_df.head(n=5)

In [None]:
# First five test rows, including the engineered and merged columns.
test_df.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that must become integer codes for the models.
cate_features = ['road']
le = LabelEncoder()

for feature in cate_features:
    # The vocabulary is learned from the training column only; the same
    # mapping is then applied to both splits.
    le.fit(train_df[feature])
    train_df[feature], test_df[feature] = (
        le.transform(frame[feature]) for frame in (train_df, test_df)
    )

In [None]:
# Hold-out window: 12:00 through 23:40 on 1991-09-23, endpoints included.
tst_start = pd.to_datetime('1991-09-23 12:00')
tst_finish = pd.to_datetime('1991-09-23 23:40')

# Training split: every row strictly before the window opens.
X_train = train_df.loc[train_df['time'].lt(tst_start)]
y_train = X_train['congestion']
X_train = X_train.drop(columns=['congestion', 'time'])

# Validation split: rows inside the window. Rows after tst_finish end up in
# neither split.
X_valid = train_df.loc[train_df['time'].ge(tst_start) & train_df['time'].le(tst_finish)]
y_valid = X_valid['congestion']
X_valid = X_valid.drop(columns=['time', 'congestion'])

In [None]:
# Display the training feature matrix as the cell output.
X_train

In [None]:
# Display the training target (congestion) as the cell output.
y_train

In [None]:
from sklearn.metrics import mean_absolute_error

def mae_valid(model):
    """Fit ``model`` on the training split and score it on the validation split.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``; ``model`` is fitted in place via its ``fit`` method.

    Returns the mean absolute error of ``model.predict(X_valid)`` against
    ``y_valid``.
    """
    model.fit(X_train, y_train)
    return mean_absolute_error(y_valid, model.predict(X_valid))

**CatBoost**

In [None]:
from catboost import CatBoostRegressor
# CatBoost regressor that optimises and reports MAE, with training logs silenced.
model_cat = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    depth=8,
    n_estimators=800,
    logging_level='Silent',
)

In [None]:
# The training features exclude the raw timestamp, so remove it from the test
# features as well. errors='ignore' keeps the cell safe to re-run once 'time'
# has already been dropped.
test_df = test_df.drop(columns='time', errors='ignore')

In [None]:
# Display the full training frame (still contains 'time' and 'congestion').
train_df

In [None]:
# Display the test feature frame that is passed to the models' predict calls.
test_df

In [None]:
# Train on the pre-validation split and predict the test set in one chain
# (fit() hands back the fitted estimator).
cat_prediction = model_cat.fit(X_train, y_train).predict(test_df)

**LightGBM**

In [None]:
import lightgbm as lgb

# LightGBM settings: gradient-boosted trees fitted with the 'regression'
# objective and evaluated with MAE, trained on the GPU device.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    learning_rate=0.5,
    num_leaves=100,
    device='gpu',
)

# The validation set references the training set, as LightGBM expects for
# validation data.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)



In [None]:
# Display the training feature matrix fed to LightGBM.
X_train

In [None]:
# Display the training target fed to LightGBM.
y_train

In [None]:
# Filled in place by the record_evaluation callback:
# {set_name: {metric_name: [value per boosting round]}}.
lgb_results = {}
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval, lgb_train],
    valid_names=['eval', 'train'],
    num_boost_round=100,
    # The early_stopping_rounds / evals_result / verbose_eval keyword
    # arguments were removed from lgb.train() in LightGBM 4.0; the callbacks
    # below provide the same behaviour on both 3.3+ and 4.x.
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.record_evaluation(lgb_results),
        lgb.log_evaluation(period=0),  # period=0 disables per-round logging
    ],
)


In [None]:
# Display the per-round metric history recorded during LightGBM training.
lgb_results

In [None]:
import matplotlib.pyplot as plt

# Per-round MAE curves; LightGBM records the 'mae' metric under the key 'l1'.
loss_train = lgb_results['train']['l1']
loss_test = lgb_results['eval']['l1']

# Single-axes figure with both curves on a shared scale.
fig, ax1 = plt.subplots()
ax1.plot(loss_train, label='train loss')
ax1.plot(loss_test, label='test loss')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('mae')
ax1.legend()

plt.show()

In [None]:
# Predict the test set with the booster, spelling out the iteration count
# that predict() would otherwise pick up on its own.
lgb_prediction = model.predict(test_df, num_iteration=model.best_iteration)

In [None]:
# Blend the two models with an equal-weight average and drop the result into
# the sample submission's 'congestion' column.
submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/sample_submission.csv'
).assign(congestion=(cat_prediction + lgb_prediction) / 2)

# Round the blended value to the nearest whole number and store it as an integer.
submission['congestion'] = submission['congestion'].round().astype(int)
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was written to submission.csv.
submission