In [None]:
# === for *** ===#
import os
import numpy as np
import numpy as np
import pandas as pd

# === for graph === #
import seaborn as sns
import matplotlib.pyplot as plt
import plotly

%matplotlib inline
%conda info -e

# read data

In [None]:
# Load the competition's training and test tables.
df_train, df_test = (
    pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv"),
    pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv"),
)

In [None]:
# Print the dtype / non-null summary of the training frame.
# info must be *called*; the bare attribute only echoes the bound method.
df_train.info()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

# Preprocessing

## Add new features

In [None]:
df_train['time'] = pd.to_datetime(df_train['time'])
df_train['month'] = df_train['time'].dt.month
df_train['weekday'] = df_train['time'].dt.weekday
df_train['hour'] = df_train['time'].dt.hour
df_train['minute'] = df_train['time'].dt.minute
df_train['is_month_start'] = df_train['time'].dt.is_month_start.astype('int')
df_train['is_month_end'] = df_train['time'].dt.is_month_end.astype('int')
df_train['is_weekend'] = (df_train['time'].dt.dayofweek > 5).astype('int')
df_train['is_afternoon'] = (df_train['time'].dt.hour > 12).astype('int')
df_train['road'] = df_train['x'].astype(str) + df_train['y'].astype(str) + df_train['direction']
df_train['moment']  = df_train['time'].dt.hour * 3 + df_train['time'].dt.minute // 20 

df_train.head()

In [None]:
df_test['time'] = pd.to_datetime(df_test['time'])
df_test['month'] = df_test['time'].dt.month
df_test['weekday'] = df_test['time'].dt.weekday
df_test['hour'] = df_test['time'].dt.hour
df_test['minute'] = df_test['time'].dt.minute
df_test['is_month_start'] = df_test['time'].dt.is_month_start.astype('int')
df_test['is_month_end'] = df_test['time'].dt.is_month_end.astype('int')
df_test['is_weekend'] = (df_test['time'].dt.dayofweek > 5).astype('int')
df_test['is_afternoon'] = (df_test['time'].dt.hour > 12).astype('int')
df_test['road'] = df_test['x'].astype(str) + df_test['y'].astype(str) + df_test['direction']
df_test['moment']  = df_test['time'].dt.hour * 3 + df_test['time'].dt.minute // 20 

df_test.head()

In [None]:
# Minimum congestion seen in the training data for every
# (road, weekday, hour, minute) combination, attached to both frames as the
# feature column 'min'.  Combinations absent from the training data are left
# as NaN by the left join.
group_keys = ['road', 'weekday', 'hour', 'minute']
mins = (
    df_train.groupby(group_keys)['congestion']
    .min()
    .astype(int)
    .rename('min')
    .reset_index()
)
# Drop a 'min' column left over from an earlier run of this cell; otherwise
# merge would emit 'min_x' / 'min_y' and those would end up in the features.
df_train = df_train.drop(columns=['min'], errors='ignore').merge(mins, on=group_keys, how='left')
df_test = df_test.drop(columns=['min'], errors='ignore').merge(mins, on=group_keys, how='left')

In [None]:
# Maximum congestion seen in the training data for every
# (road, weekday, hour, minute) combination, attached to both frames as the
# feature column 'max'.  Combinations absent from the training data are left
# as NaN by the left join.
group_keys = ['road', 'weekday', 'hour', 'minute']
maxs = (
    df_train.groupby(group_keys)['congestion']
    .max()
    .astype(int)
    .rename('max')
    .reset_index()
)
# Drop a 'max' column left over from an earlier run of this cell; otherwise
# merge would emit 'max_x' / 'max_y' and those would end up in the features.
df_train = df_train.drop(columns=['max'], errors='ignore').merge(maxs, on=group_keys, how='left')
df_test = df_test.drop(columns=['max'], errors='ignore').merge(maxs, on=group_keys, how='left')

In [None]:
# Median congestion (truncated to int) seen in the training data for every
# (road, weekday, hour, minute) combination, attached to both frames as the
# feature column 'median'.  Combinations absent from the training data are
# left as NaN by the left join.
group_keys = ['road', 'weekday', 'hour', 'minute']
medians = (
    df_train.groupby(group_keys)['congestion']
    .median()
    .astype(int)
    .rename('median')
    .reset_index()
)
# Drop a 'median' column left over from an earlier run of this cell; otherwise
# merge would emit 'median_x' / 'median_y' and those would end up in the features.
df_train = df_train.drop(columns=['median'], errors='ignore').merge(medians, on=group_keys, how='left')
df_test = df_test.drop(columns=['median'], errors='ignore').merge(medians, on=group_keys, how='left')

In [None]:
# Print the dtype / non-null summary of the training frame.
# info must be *called*; the bare attribute only echoes the bound method.
df_train.info()

In [None]:
# Print the dtype / non-null summary of the test frame.
# info must be *called*; the bare attribute only echoes the bound method.
df_test.info()

In [None]:
# Print the dtype / non-null summary of the training frame.
# info must be *called*; the bare attribute only echoes the bound method.
df_train.info()

In [None]:
# ------------------------------------------------------------------
# Integer-encode string columns so they can be used as model inputs:
#   road (str) --> road_encoded (int)
# The mapping is learned from the training frame only and then reused
# unchanged for the test frame.
# ------------------------------------------------------------------

from sklearn.preprocessing import LabelEncoder

lis_str2int = ['road']
le = LabelEncoder()

for col in lis_str2int:
    encoded_col = f"{col}_encoded"
    le.fit(df_train[col])
    df_train[encoded_col] = le.transform(df_train[col])
    df_test[encoded_col] = le.transform(df_test[col])

df_train["road_encoded"]

## Build x, y data for training

In [None]:
# List the column names currently present in the training frame.
df_train.columns

In [None]:
# Model inputs: every training column except the row id, the raw timestamp,
# the text columns (direction, road) and the target (congestion).
lis_colum_for_train = [
    col
    for col in df_train.columns
    if col not in ("row_id", "time", "direction", "congestion", "road")
]
lis_colum_for_train

In [None]:
# Feature matrix of the full training set as a NumPy array.
x_train_ = df_train.loc[:, lis_colum_for_train].to_numpy()
x_train_.shape

In [None]:
# Target vector (congestion) of the full training set as a NumPy array.
y_train_ = df_train.loc[:, "congestion"].to_numpy()
y_train_.shape

In [None]:
# Test feature matrix, using the same columns in the same order as training.
x_test = df_test.loc[:, lis_colum_for_train].to_numpy()
x_test.shape

In [None]:
# ------------------------------------------------------------
# Hold out 20% of the full training data as a validation set:
#   len(valid) = len(all train) * 0.2
# random_state is fixed so the split is repeatable.
# ------------------------------------------------------------

from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_,
    y_train_,
    random_state=2,
    test_size=0.2,
)

# AI model

## LightGBM

In [None]:
import lightgbm as lgb

# Gradient-boosted regression trees evaluated with mean absolute error.
params = {
    'boosting_type': 'gbdt',
    "objective" : "regression",
    "metric" : "mae",
    'learning_rate': 0.5,
    'num_leaves':100,
#     'device':'gpu'
}

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_valid , y_valid, reference=lgb_train)

# Filled in by the record_evaluation callback with the per-iteration metric
# of every validation set, keyed by the names given in valid_names.
lgb_results = {}
model = lgb.train(
    params = params,
    train_set = lgb_train,
    valid_sets = [lgb_eval ,lgb_train],
    valid_names=['eval', 'train'],
    num_boost_round = 100,
    # Early stopping and metric recording are configured through callbacks:
    # the early_stopping_rounds / evals_result / verbose_eval keyword
    # arguments of lgb.train are deprecated and absent from LightGBM 4.x.
    # No log_evaluation callback is added, so there is no per-iteration
    # logging (same as the former verbose_eval=-1).
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.record_evaluation(lgb_results),
    ],
)

In [None]:
# Show which evaluation-set names were recorded in lgb_results.
lgb_results.keys()

In [None]:
# Per-iteration MAE recorded during training ('l1' is the key under which the
# "mae" metric is stored in lgb_results).
loss_train = lgb_results['train']['l1']
# Curve of the 'eval' set, i.e. the 20% hold-out validation split -- not the
# competition test set.  The variable name is kept as-is.
loss_test = lgb_results['eval']['l1']

import matplotlib.pyplot as plt
fig, ax1 = plt.subplots()

ax1.set_xlabel('Iteration')
ax1.set_ylabel('mae')

ax1.plot(loss_train, label='train loss')
# Labelled as validation loss: these values come from the hold-out split.
ax1.plot(loss_test, label='valid loss')

ax1.legend()
plt.show()

In [None]:
# Predict congestion for the test feature matrix with the trained LightGBM model.
lgb_prediction = model.predict(x_test)

In [None]:
# Start from the sample submission and overwrite its "congestion" column with
# the LightGBM predictions (row order is taken as-is from both sides).
df_for_submit = (
    pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
    .assign(congestion=lgb_prediction)
)
# Writing the file is currently disabled:
# df_for_submit.to_csv('./Output/submission.csv', index=False)

In [None]:
# Print the dtype / non-null summary of the submission frame.
# info must be *called*; the bare attribute only echoes the bound method.
df_for_submit.info()