# Preparation

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Data Dictionary
- `row_id`: a unique identifier for this instance
- `time`: the 20-minute period in which each measurement was taken
- `x`: the east-west midpoint coordinate of the roadway
- `y`: the north-south midpoint coordinate of the roadway
- `direction`: the direction of travel of the roadway.
    - `EB` indicates "eastbound" travel
    - `SW` indicates a "southwest" direction of travel
- `congestion`: congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.

## Loading data

In [None]:
# Training data: indexed by row_id, with the `time` column parsed as datetimes.
train_df = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col='row_id',
    parse_dates=['time'],
)
train_df

In [None]:
# Test data: loaded with the same index and datetime parsing as the training file.
test_df = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col='row_id',
    parse_dates=['time'],
)
test_df

# Feature Engineering

In [None]:
# Add calendar features and a per-roadway key to both frames (the frames are
# modified in place, so the columns appear on train_df and test_df directly).
for df in [train_df, test_df]:
    df['weekday'] = df['time'].dt.weekday
    df['hour'] = df['time'].dt.hour
    df['minute'] = df['time'].dt.minute
    # Join the parts with an explicit separator so that two different (x, y)
    # pairs can never collapse into the same key: without it, x=1, y=12 and
    # x=11, y=2 would both yield a key starting with "112".
    df['road'] = df['x'].astype(str) + '_' + df['y'].astype(str) + '_' + df['direction']

In [None]:
# Preview the first rows, including the weekday/hour/minute/road columns added above.
train_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to convert from string labels to integer codes.
cate_features = ['road']
le = LabelEncoder()

for col in cate_features:
    # Learn the label -> integer mapping from the training values only,
    # then apply that same mapping to the test frame.
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [None]:
# Show train_df now that `road` holds integer labels instead of strings.
train_df

In [None]:
# Median congestion for each roadway at each (weekday, hour, minute) slot,
# converted to integers by the final cast.
slot_keys = ['road', 'weekday', 'hour', 'minute']
medians = train_df.groupby(slot_keys)['congestion'].median().astype(int)
medians

In [None]:
# Hold-out window (both ends inclusive); every row strictly before it is used for training.
tst_start = pd.to_datetime('1991-09-23 12:00')
tst_finish = pd.to_datetime('1991-09-23 23:40')

train_part = train_df.loc[train_df['time'] < tst_start]
valid_part = train_df.loc[train_df['time'].between(tst_start, tst_finish)]

# Targets are taken before the non-feature columns are removed.
y_train = train_part['congestion']
X_train = train_part.drop(columns=['congestion', 'time', 'direction'])

y_valid = valid_part['congestion']
X_valid = valid_part.drop(columns=['time', 'congestion', 'direction'])

In [None]:
# Show test_df while it still contains the `time` and `direction` columns.
test_df

In [None]:
# Remove the columns that were also excluded from the training features.
# NOTE: this rebinds test_df, so running the cell a second time raises KeyError.
test_df = test_df.drop(columns=['time', 'direction'])

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Inspect the hold-out feature matrix.
X_valid

In [None]:
# Inspect the training target (the `congestion` column).
y_train

## LightGBM

In [None]:
from sklearn.metrics import mean_absolute_error

def mae_valid(model):
    """Fit ``model`` on the training split and score it on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``; the model passed in is fitted in place.

    Parameters
    ----------
    model
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

In [None]:
import lightgbm as lgb

# Booster settings, keyed by LightGBM parameter name.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    learning_rate=0.5,
    num_leaves=100,
    device='gpu',
)

# Wrap both splits as LightGBM datasets; the hold-out set references the training set.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_valid, label=y_valid, reference=lgb_train)

In [None]:
# Filled in by the record_evaluation callback with the metric history of each
# named validation set.
lgb_results = {}

# Early stopping, metric recording and log control are passed as callbacks.
# The former keyword arguments (early_stopping_rounds, evals_result,
# verbose_eval) were deprecated in LightGBM 3.3 and removed in 4.0, where
# passing them raises a TypeError.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval, lgb_train],
    valid_names=['eval', 'train'],
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.record_evaluation(lgb_results),
        # A non-positive period disables per-iteration logging (was verbose_eval=-1).
        lgb.log_evaluation(period=-1),
    ],
)

In [None]:
# Metric history collected during training, keyed by validation-set name ('eval', 'train').
lgb_results

In [None]:
# Per-iteration MAE (recorded under the key 'l1') for the training and hold-out sets.
loss_train = lgb_results['train']['l1']
loss_test = lgb_results['eval']['l1']

import matplotlib.pyplot as plt

fig, ax1 = plt.subplots()

ax1.set_xlabel('Iteration')
ax1.set_ylabel('mae')

ax1.plot(loss_train, label='train loss')
# The 'eval' set is the hold-out slice of the training data, not the competition
# test set, so it is labelled as validation loss.
ax1.plot(loss_test, label='validation loss')

ax1.legend()
plt.show()

In [None]:
# Score the prepared test features with the trained booster.
lgb_prediction = model.predict(data=test_df)
lgb_prediction

In [None]:
# Start from the sample submission and overwrite its target column with the
# LightGBM predictions rounded to whole numbers (assigned by row position).
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
submission['congestion'] = (
    pd.Series(lgb_prediction, index=submission.index).round().astype(int)
)
submission.to_csv('submission_LGBM.csv', index=False)

In [None]:
# Show the frame that was written to submission_LGBM.csv.
submission

# AutoML + LightGBM

In [None]:
!pip install flaml[notebook];

In [None]:
from flaml import AutoML

In [None]:
automl = AutoML()
settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": 'r2',  # primary metric; regression options include 'mae', 'mse', 'r2'
    "estimator_list": ['lgbm'],  # restrict the search to LightGBM
    "task": 'regression',  # task type
    "log_file_name": 'houses_experiment.log',  # flaml log file
    "seed": 7654321,    # random seed
}
# Score candidate configurations on the same time-ordered hold-out that the
# plain LightGBM run uses, instead of leaving the validation split to FLAML.
automl.fit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_valid,
    y_val=y_valid,
    **settings,
)

In [None]:
# Summarise the search: chosen configuration, validation score, training time
# of the best run, and the fitted estimator itself.
print('Best hyperparameter config:', automl.best_config)
# r2 is recovered here as 1 - best_loss.
print('Best r2 on validation data: {0:.4g}'.format(1 - automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the tuned estimator's importance value for each feature.
best_estimator = automl.model.estimator
plt.barh(best_estimator.feature_name_, best_estimator.feature_importances_)

In [None]:
# Predict the test rows with the model selected by FLAML.
y_pred = automl.predict(test_df)
print(f'Predicted labels {y_pred}')

In [None]:
# Start from the sample submission and overwrite its target column with the
# FLAML predictions rounded to whole numbers (assigned by row position).
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
submission['congestion'] = (
    pd.Series(y_pred, index=submission.index).round().astype(int)
)
submission.to_csv('submission.csv', index=False)

In [None]:
# FLAML predictions as returned by predict (rounding is applied only to the submission column).
y_pred

In [None]:
# Show the frame that was written to submission.csv.
submission