In [None]:
import pandas as pd

# Load the training split, report its row count and preview the first rows.
train_csv = '../input/tabular-playground-series-mar-2022/train.csv'
train = pd.read_csv(train_csv)
print(train.shape[0])
train.head(n=5)

In [None]:
# Load the test split, report its row count and preview the first rows.
test_csv = '../input/tabular-playground-series-mar-2022/test.csv'
test = pd.read_csv(test_csv)
print(test.shape[0])
test.head(n=5)

In [None]:
# Parse the timestamps and derive the clock columns used as grouping keys.
# The datetime accessors replace the strftime -> int round-trip; astype('int')
# keeps the same integer dtype the string conversion produced.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'])
    frame['hour'] = frame['time'].dt.hour.astype('int')
    frame['minute'] = frame['time'].dt.minute.astype('int')

# Congestion statistics per (time-of-day slot, roadway), computed on train only.
slot_keys = ['hour', 'minute', 'x', 'y', 'direction']
slot_congestion = train.groupby(slot_keys)['congestion']

# Both statistics are truncated to integers, as before.
medians = slot_congestion.median().astype(int).rename('median').reset_index()
# std() is NaN for a group with a single observation and NaN cannot be cast to
# int, so such groups are treated as having zero spread instead of raising.
stds = slot_congestion.std().fillna(0).astype(int).rename('std').reset_index()

# Attach the train-derived statistics to both frames (left joins keep row order).
merge_keys = ['x', 'y', 'direction', 'hour', 'minute']
train = train.merge(medians, how='left', on=merge_keys).merge(stds, how='left', on=merge_keys)
test = test.merge(medians, how='left', on=merge_keys).merge(stds, how='left', on=merge_keys)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the 'median' and 'std' features with a min-max scaler that is fitted
# on train only and then reused unchanged for test.
scaled_columns = ['median', 'std']
mm = MinMaxScaler().fit(train[scaled_columns])
train[scaled_columns] = mm.transform(train[scaled_columns])
test[scaled_columns] = mm.transform(test[scaled_columns])

train.head(n=5)

In [None]:
# Preview the first rows of the test frame.
test.head(n=5)

In [None]:
# Stack train on top of test so the feature engineering below runs once for
# both. ignore_index=True gives every row a unique label; without it the two
# frames' labels both start at 0 and `data` carries duplicate index values,
# which is a hazard for label-based (.loc) assignments.
# The row order (train rows first, then test rows) is unchanged.
data = pd.concat([train, test], ignore_index=True)
print(len(data))
data.head()

In [None]:
# Load the sample submission; its target column is overwritten with predictions later.
sample_submission_csv = '../input/tabular-playground-series-mar-2022/sample_submission.csv'
sub = pd.read_csv(sample_submission_csv)
sub.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Build one string key per roadway from (x, y, direction) and encode it as an
# integer id. The '_' separators keep the key unambiguous when a coordinate has
# more than one character (without them x=1,y=12 and x=11,y=2 would both start
# with "112"). When x and y are single characters the keys sort exactly as the
# unseparated ones did, so the resulting codes are the same.
road_key = data['x'].astype(str) + '_' + data['y'].astype(str) + '_' + data['direction']
data['road'] = le.fit_transform(road_key)

data.head()

In [None]:
# Calendar and clock features derived from the timestamp.
data['time'] = pd.to_datetime(data['time'])
stamp = data['time'].dt
data['weekday'] = stamp.weekday.astype('int')
data['month'] = stamp.month.astype('int')
data['day'] = stamp.day.astype('int')
data['hour'] = stamp.hour.astype('int')
data['minute'] = stamp.minute.astype('int')

# 'moment' numbers the 20-minute slots of a day (three per hour);
# 'hour_minute' is the clock time read as an HHMM integer.
data['moment'] = data['time'].dt.hour * 3 + data['time'].dt.minute // 20
data['hour_minute'] = stamp.strftime('%H%M').astype('int')
data['is_afternoon'] = (data['hour'] >= 12).astype('int')

# 5/27, 7/4 and 9/2 are public holidays (matched as yymmdd strings).
data['date'] = stamp.strftime('%y%m%d')
holiday_dates = ['910527', '910704', '910902']
data['is_holiday'] = data['date'].isin(holiday_dates).astype('int')

# Saturday and Sunday (weekday 5 and 6).
data['is_dayoff'] = data['weekday'].isin([5, 6]).astype('int')

data.head()

In [None]:
# Drop the helper 'date' string and the raw roadway columns in one call.
data = data.drop(columns=['date', 'x', 'y', 'direction'])

In [None]:
# Remove the 'row_id' column, then preview the result.
data = data.drop(columns='row_id')
data.head(n=5)

In [None]:
# Split the stacked frame back by position: the first len(train) rows are the
# training rows, everything after them is the test rows.
n_train_rows = len(train)
train = data.iloc[:n_train_rows]
test = data.iloc[n_train_rows:]

In [None]:
# Morning level per calendar day and road: median congestion over the hours
# 6 to 11 inclusive, truncated to an integer.
df_mornings = train[train['hour'].between(6, 11)]
day_road_keys = ['month', 'day', 'road']
morning_avgs = (
    df_mornings.groupby(day_road_keys)['congestion']
    .median()
    .astype(int)
    .rename('morning_avg')
    .reset_index()
)
train = train.merge(morning_avgs, on=day_road_keys, how='left')
test = test.merge(morning_avgs, on=day_road_keys, how='left')

# Historical extremes per road, weekday and time-of-day slot (train only).
slot_keys = ['road', 'weekday', 'hour', 'minute']

mins = train.groupby(slot_keys)['congestion'].min().astype(int).rename('min').reset_index()
train = train.merge(mins, on=slot_keys, how='left')
test = test.merge(mins, on=slot_keys, how='left')

maxs = train.groupby(slot_keys)['congestion'].max().astype(int).rename('max').reset_index()
train = train.merge(maxs, on=slot_keys, how='left')
test = test.merge(maxs, on=slot_keys, how='left')

train.head(n=5)

In [None]:
# Lower quartile of train congestion per road, weekday and time-of-day slot,
# truncated to an integer and joined onto both frames.
slot_keys = ['road', 'weekday', 'hour', 'minute']
quantile25 = (
    train.groupby(slot_keys)['congestion']
    .quantile(.25)
    .astype(int)
    .rename('quantile25')
    .reset_index()
)
train = train.merge(quantile25, on=slot_keys, how='left')
test = test.merge(quantile25, on=slot_keys, how='left')

In [None]:
# Upper quartile of train congestion per road, weekday and time-of-day slot,
# truncated to an integer and joined onto both frames.
slot_keys = ['road', 'weekday', 'hour', 'minute']
quantile75 = (
    train.groupby(slot_keys)['congestion']
    .quantile(.75)
    .astype(int)
    .rename('quantile75')
    .reset_index()
)
train = train.merge(quantile75, on=slot_keys, how='left')
test = test.merge(quantile75, on=slot_keys, how='left')

In [None]:
from sklearn.decomposition import PCA

# Columns fed into the principal-component projection.
important_features = ['moment', 'median', 'min', 'max', 'morning_avg']

# Target and feature matrices; X and X_t are independent copies of the
# selected columns, so train/test themselves are not modified here.
y = train['congestion'].copy()
X = train.loc[:, important_features]
X_t = test.loc[:, important_features]

# Fit the projection on train and apply the same projection to test.
# Original note read "5 +0.012 public score" - presumably the effect of
# n_components=5 on the public leaderboard score; TODO confirm.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Wrap the components in frames named PC1, PC2, ...
component_names = [f"PC{k}" for k in range(1, X_pca.shape[1] + 1)]
X_pca = pd.DataFrame(X_pca, columns=component_names)
X_t_pca = pd.DataFrame(pca.transform(X_t), columns=component_names)

# Append the components column-wise; rows are matched on the index labels.
train = pd.concat([train, X_pca], axis=1)
test = pd.concat([test, X_t_pca], axis=1)

train.head(n=5)

In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(2 * pi * x / period)."""
    def _sine(values):
        return np.sin(values / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(2 * pi * x / period)."""
    def _cosine(values):
        return np.cos(values / period * 2 * np.pi)

    return FunctionTransformer(_cosine)

# Encode the slot-of-day index on the unit circle. 'moment' is built as
# hour * 3 + minute // 20, so a period of 72 spans one full day.
MOMENTS_PER_DAY = 72
for frame in (train, test):
    frame['moment_sin'] = sin_transformer(MOMENTS_PER_DAY).fit_transform(frame["moment"])
    frame['moment_cos'] = cos_transformer(MOMENTS_PER_DAY).fit_transform(frame["moment"])

In [None]:
# Drop 'hour' and 'minute' from both frames; test additionally loses its
# 'congestion' column (the target stays in train only).
train = train.drop(columns=['hour', 'minute'])
test = test.drop(columns=['hour', 'minute', 'congestion'])

In [None]:
def _lagged_congestion(frame, days, name):
    """Return time/road/<name> rows: each congestion value re-stamped `days` days later.

    Joining the result on ['time', 'road'] therefore attaches the congestion
    observed `days` days earlier on the same road.
    """
    lagged = frame[['time', 'road', 'congestion']].copy()
    lagged['time'] = lagged['time'] + pd.Timedelta(days, unit="d")
    return lagged.rename(columns={'congestion': name})


def _raw_median(frame):
    """Return the 'median' feature of `frame` mapped back to the congestion scale.

    'median' was MinMax-scaled together with 'std' by the scaler `mm` earlier in
    the notebook, so it cannot be used directly as a stand-in for a congestion
    value; the scaling is undone here first.
    """
    unscaled = mm.inverse_transform(frame[['median', 'std']])[:, 0]
    # The medians were integers before scaling; round() removes the float noise
    # introduced by the scale / unscale round trip.
    return pd.Series(unscaled, index=frame.index).round()


# Congestion one day and one week earlier, looked up from the training rows.
day = _lagged_congestion(train, 1, 'lag1')
week = _lagged_congestion(train, 7, 'lag7')

lag_keys = ['time', 'road']
train = train.merge(day, on=lag_keys, how='left').merge(week, on=lag_keys, how='left')
test = test.merge(day, on=lag_keys, how='left').merge(week, on=lag_keys, how='left')

# Rows without an earlier observation fall back to the slot median on the raw
# congestion scale. Previously the scaled 'median' column was used, which mixed
# [0, 1] values into a raw-congestion feature, and test rows got no fallback.
train_fallback = _raw_median(train)
test_fallback = _raw_median(test)
for lag_name in ('lag1', 'lag7'):
    train[lag_name] = train[lag_name].fillna(train_fallback)
    test[lag_name] = test[lag_name].fillna(test_fallback)

# The timestamp has served its purpose as a join key.
train = train.drop('time', axis=1)
test = test.drop('time', axis=1)

In [None]:
# Persist the engineered feature tables (without the index) and preview train.
for frame, csv_name in ((train, 'train_lgb.csv'), (test, 'test_lgb.csv')):
    frame.to_csv(csv_name, index=False)
train.head(n=5)

In [None]:
#test_lgb = test_lgb.drop('congestion', axis=1)

# Building the model

In [None]:
!pip install -U lightautoml

In [None]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
# Run configuration shared by the modelling cells below.
N_THREADS = 4               # CPU/thread budget handed to torch and LightAutoML
RANDOM_STATE = 500          # seed used for numpy and the LightAutoML reader
TIMEOUT = 5 * 60 * 60       # AutoML time budget (presumably seconds, i.e. 5 hours)
TARGET_NAME = 'congestion'  # name of the column to predict

In [None]:
import numpy as np
# Seed every random number generator in play so reruns are comparable;
# seeding numpy alone leaves torch's generator unseeded.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Cap the number of intra-op threads torch may use.
torch.set_num_threads(N_THREADS)

In [None]:
# Regression task that is both trained with and scored by mean absolute error.
task = Task(
    'reg',
    loss='mae',
    metric='mae',
)

In [None]:
# Column roles for LightAutoML: only the target column is declared explicitly.
roles = dict(target=TARGET_NAME)

In [None]:
# Configure the AutoML preset: the algorithm list is restricted to 'lgb', and
# the time and CPU limits come from the configuration constants.
reader_params = {'n_jobs': N_THREADS, 'random_state': RANDOM_STATE}
general_params = {'use_algos': [['lgb']]}

automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=reader_params,
    general_params=general_params,
)

In [None]:
# Fit on the training frame and print the returned predictions with their shape.
oof_pred = automl.fit_predict(train, roles=roles, verbose=1)
oof_report = 'oof_pred:\n{}\nShape = {}'
print(oof_report.format(oof_pred, oof_pred.shape))

In [None]:
# Bar chart of the 'fast' feature-importance scores reported by the fitted model.
fast_fi = automl.get_feature_scores('fast')
importance_by_feature = fast_fi.set_index('Feature')['Importance']
importance_by_feature.plot.bar(figsize=(20, 10), grid=True)

In [None]:
# Predict on the test frame and print the predictions together with their shape.
test_pred = automl.predict(test)
test_report = 'Prediction for test data:\n{}\nShape = {}'
print(test_report.format(test_pred, test_pred.shape))

In [None]:
# Write the first prediction column, rounded to whole numbers, into the submission.
rounded_predictions = np.round(test_pred.data[:, 0])
sub[TARGET_NAME] = rounded_predictions

In [None]:
# Save the submission file (without the index) and display the frame.
submission_csv = 'submission.csv'
sub.to_csv(submission_csv, index=False)
sub