In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
!pip install -U lightautoml


In [None]:
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
# --- Run configuration ---
TARGET_NAME = 'congestion'   # name of the column to predict
RANDOM_STATE = 21            # seed for NumPy's global RNG
N_THREADS = 4                # thread count handed to torch
TIMEOUT = 8 * 60 * 60        # 28800; eight hours when read as seconds

# Apply the thread limit and the seed right away.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

In [None]:
# Directory holding the competition CSV files.
INPUT_DIR = '../input/tabular-playground-series-mar-2022/'

# 'time' is read as plain text; it is parsed explicitly later on.
train = pd.read_csv(f'{INPUT_DIR}train.csv', dtype={'time': str})
print(train.shape)
train.head()

In [None]:
# Load the test split with 'time' kept as plain text, then show its shape and first rows.
test = pd.read_csv(f'{INPUT_DIR}test.csv', dtype={'time': str})
print(test.shape)
test.head()

In [None]:
# Load the sample submission; it is used later as the template for the output file.
sub = pd.read_csv(f'{INPUT_DIR}sample_submission.csv')
print(sub.shape)
sub.head()

## Feature Engineering 

In [None]:
def feature_engineering(data):
    """Derive calendar, clock and composite-key features from a traffic frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'time' (anything ``pd.to_datetime`` accepts),
        'x', 'y' and 'direction'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column except 'time', plus the
        derived feature columns.

    Notes
    -----
    The derived columns are assigned onto ``data`` itself, so the caller's
    frame is modified in place: it gains all new columns and its 'time'
    column is converted to datetimes. Only the returned frame lacks 'time'.
    Composite keys are built by plain string concatenation with no separator.
    """
    data['time'] = pd.to_datetime(data['time'])
    stamp = data['time'].dt

    # Calendar and clock components.
    data['month'] = stamp.month
    data['weekday'] = stamp.weekday
    data['hour'] = stamp.hour
    data['minute'] = stamp.minute
    data['is_month_start'] = stamp.is_month_start.astype('int')
    data['is_month_end'] = stamp.is_month_end.astype('int')
    data['hour+minute'] = stamp.hour * 60 + stamp.minute  # minutes since midnight
    data['is_weekend'] = (stamp.dayofweek > 4).astype('int')
    data['is_afternoon'] = (stamp.hour > 12).astype('int')  # hour 12 itself counts as 0

    # Text form of each building block, converted once and reused below.
    as_text = {
        column: data[column].astype('str')
        for column in ('hour', 'x', 'y', 'direction')
    }

    # Each key names its own ingredients, in concatenation order.
    composite_keys = (
        'x+y',
        'x+y+direction',
        'hour+direction',
        'hour+x+y',
        'hour+direction+x',
        'hour+direction+y',
        'hour+direction+x+y',
        'hour+x',
        'hour+y',
    )
    for key in composite_keys:
        head, *tail = key.split('+')
        joined = as_text[head]
        for part in tail:
            joined = joined + as_text[part]
        data[key] = joined

    return data.drop(['time'], axis=1)

In [None]:
# feature_engineering returns a new frame with the raw 'time' column dropped.
# Assigning that result to a loop variable (`for data in [...]: data = ...`)
# discards it and leaves 'time' in both frames, so rebind each frame by name.
# NOTE: this consumes 'time'; reload train/test before re-running this cell.
train = feature_engineering(train)
test = feature_engineering(test)

In [None]:
# Show the first three rows of the training frame.
train.head(3)

In [None]:
# Show the first three rows of the test frame.
test.head(3)

# Modelling ⚙️

**Now we need to define the task that the AutoML model needs to solve**

In [None]:
# Regression task using 'mae' as both the loss and the metric.
task = Task(
    'reg',
    loss='mae',
    metric='mae',
)

In [None]:
# Column roles passed to fit_predict: the target column and the columns to drop.
roles = dict(
    target=TARGET_NAME,
    drop=['row_id'],
)

In [None]:
# Build the AutoML preset restricted to the 'lgb' algorithm, with the
# configured timeout, CPU limit and reader settings.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, random_state=RANDOM_STATE),
    general_params=dict(use_algos=[['lgb']]),
)

In [None]:
# Fit on the training frame; the returned predictions are printed with their shape.
_pred = automl.fit_predict(train, roles=roles, verbose=3)
print(f'oof_pred:\n{_pred}\nShape = {_pred.shape}')

Get Feature Importance of the Variables

In [None]:
# Fetch the 'fast' feature scores and draw them as a bar chart, one bar per feature.
feat = automl.get_feature_scores('fast')
(
    feat
    .set_index('Feature')['Importance']
    .plot.bar(figsize=(15, 8), grid=True)
)

### Predicting for test data

In [None]:
# Predict on the test frame and print the result together with its shape.
test_pr = automl.predict(test)
print(f'Prediction for test data:\n{test_pr}\nShape = {test_pr.shape}')

# Time For Submission ⌚

In [None]:
# Directory prefix for the prediction CSV files; './' is the current working directory.
OUTPUT_DIR = './'

In [None]:
# Put the first column of the test predictions into the submission frame and
# save the raw, unrounded values.
# Every read and write in this cell goes through OUTPUT_DIR, so changing that
# prefix cannot make the read-back look in a different place than the write.
sub[TARGET_NAME] = test_pr.data[:, 0]
sub.to_csv(OUTPUT_DIR + 'lightautoml.csv', index=False)

sub = pd.read_csv(OUTPUT_DIR + 'lightautoml.csv')

# Round to whole numbers and check the values lie in [0, 100] before saving.
sub[TARGET_NAME] = sub[TARGET_NAME].round().astype(int)
assert (sub[TARGET_NAME] >= 0).all()
assert (sub[TARGET_NAME] <= 100).all()
sub.to_csv(OUTPUT_DIR + 'submission_52.csv', index=False)



In [None]:
# Re-read the rounded predictions saved by the previous cell. That file is
# named 'submission_52.csv' (underscore); asking for 'submission-52.csv'
# raises FileNotFoundError on a clean run.
sub = pd.read_csv(OUTPUT_DIR + 'submission_52.csv')

# Re-apply the integer rounding and the [0, 100] range check, then write the
# final submission file.
sub['congestion'] = sub['congestion'].round().astype(int)
assert (sub['congestion'] >= 0).all()
assert (sub['congestion'] <= 100).all()
sub.to_csv('submission.csv', index=False)
