In [None]:
import numpy as np 
import pandas as pd 
import math
import datetime

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.ensemble import GradientBoostingRegressor


from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import LabelEncoder

from numpy import mean, median

from warnings import simplefilter
simplefilter("ignore")

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor

In [None]:
# Both competition CSVs are read the same way: `row_id` becomes the index and
# the `time` column is parsed into datetimes while loading.
csv_options = dict(index_col="row_id", parse_dates=['time'])
df_train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', **csv_options)
df_test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv', **csv_options)

# Snapshot the row_id indexes of both frames.
df_train_idx, df_test_idx = df_train.index, df_test.index

In [None]:
# Build one identifier per roadway from the (x, y) location and the direction.
# The parts are joined with a separator so that multi-digit coordinates cannot
# collide: without it, x=1,y=12 and x=11,y=2 would both yield "112<direction>".
df_train['road'] = df_train['x'].astype(str) + '_' + df_train['y'].astype(str) + '_' + df_train['direction']
df_test['road']  = df_test['x'].astype(str) + '_' + df_test['y'].astype(str) + '_' + df_test['direction']

# Integer-encode the identifier. The encoder is fitted on the training frame
# only and then applied to the test frame, so both share one mapping.
le = LabelEncoder()
df_train['road'] = le.fit_transform(df_train['road'])
df_test['road']  = le.transform(df_test['road'])

In [None]:
# Lag feature: the congestion observed on the same road 20 minutes earlier.
# Training timestamps are shifted forward by 20 minutes so that joining on
# `time` pairs every row with the reading taken one step before it.
lag_keys = ['time', 'direction', 'road']
minute = (
    df_train[lag_keys + ['congestion']]
    .assign(time=lambda frame: frame['time'] + pd.Timedelta(20, unit="m"))
    .rename(columns={'congestion': 'lag'})
)

# Left joins keep every original row; rows without a matching earlier reading get NaN.
df_train = df_train.merge(minute, how='left', on=lag_keys)
df_test = df_test.merge(minute, how='left', on=lag_keys)

In [None]:
# Preview the first 100 training rows after the lag merge.
df_train.head(100)

In [None]:
#df_train['time'][848834] - df_train['time'][0]

In [None]:
def add_datetime_features(df):
    """Add calendar and time-of-day columns derived from ``df['time']``.

    The frame is modified in place and nothing is returned. Columns added, in
    order: ``month``, ``day``, ``weekday``, ``weekend`` (weekday >= 5),
    ``hour``, ``minute``, ``afternoon`` (hour >= 12) and ``daytime_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``time`` column.
    """
    stamp = df['time'].dt
    weekday = stamp.weekday
    hour = stamp.hour
    minute = stamp.minute

    df['month'] = stamp.month
    df['day'] = stamp.day
    df['weekday'] = weekday
    df['weekend'] = weekday >= 5
    df['hour'] = hour
    df['minute'] = minute
    df['afternoon'] = hour >= 12

    # Index of the 20-minute slot within the day (minutes since midnight / 20,
    # truncated), i.e. hour and minute combined into a single feature.
    minutes_since_midnight = hour * 60 + minute
    df['daytime_id'] = (minutes_since_midnight / 20).astype(int)

In [None]:
# Derive the calendar / time-of-day columns on both frames (in place).
for frame in (df_train, df_test):
    add_datetime_features(frame)

In [None]:
# Median congestion per (road, 20-minute slot of the day), truncated to int.
# NOTE: this name shadows the `median` function imported from numpy earlier in
# the notebook; later cells refer to this Series under the same name.
median = (
    df_train
    .groupby(['road', 'daytime_id'])['congestion']
    .median()
    .astype(int)
)

In [None]:
# Attach the per-(road, daytime_id) median to every training row.
# `merge` already returns a new frame, so no separate copy of df_train is made
# first. df_train has its own `congestion` column, so pandas suffixes the pair
# as `congestion_x` (from df_train) and `congestion_y` (the median).
df = df_train.merge(median, left_on=['road', 'daytime_id'], right_index=True)

In [None]:
# Attach the per-(road, daytime_id) median (computed on the training data) to
# every test row. `merge` already returns a new frame, so no separate copy of
# df_test is made first.
df2 = df_test.merge(median, left_on=['road', 'daytime_id'], right_index=True)

In [None]:
# `df2` is df_test merged with the median Series, whose values arrive under the
# column name 'congestion'. Copy that column onto df_test as 'median'.
# NOTE(review): the assignment matches rows by index label, so it relies on the
# merge having kept df_test's index — confirm.
medt = df2['congestion']
df_test['median'] = medt

In [None]:
# Preview df_test now that the 'median' column has been added.
df_test.head()

In [None]:
# Preview the merged training frame (carries congestion_x / congestion_y).
df.head()

In [None]:
# 'congestion_y' is the median column produced by merging the median Series
# into the training frame; copy it onto df_train as 'median'.
med = df['congestion_y']
df_train['median'] = med

# Wherever no 20-minute-old reading was found, fall back to the slot median.
for frame in (df_train, df_test):
    frame['lag'] = frame['lag'].fillna(frame['median'])

In [None]:
# Preview df_train with the 'median' column and the filled-in 'lag' column.
df_train.head()

Below, I also want to build X_train, y_train, X_test, and y_test (the hold-out split is named X_valid / y_valid in the code).


In [None]:
# Hold-out window used for validation (both bounds inclusive).
tst_start = pd.to_datetime('1991-09-23 12:00')
tst_finish = pd.to_datetime('1991-09-23 23:40')


def _split_features_target(frame):
    """Return (features, target): the target is `congestion`; the features drop it plus `direction` and `time`."""
    return frame.drop(['congestion', 'direction', 'time'], axis=1), frame['congestion']


# Training set: everything strictly before the hold-out window.
# NOTE(review): rows later than tst_finish end up in neither split — confirm
# that this is intended.
X_train, y_train = _split_features_target(df_train[df_train['time'] < tst_start])

# Validation set: the rows inside the hold-out window.
X_valid, y_valid = _split_features_target(df_train[df_train['time'].between(tst_start, tst_finish)])

In [None]:
# Preview the training features produced by the time-based split.
X_train.head()

In [None]:
# Original version
# X_train = df_train.copy()
# y_train = df_train['congestion']
# X_train = X_train.drop(['congestion','x','y','direction','time'],axis=1)

In [None]:
# from sklearn.feature_selection import mutual_info_regression

# mi_scores = mutual_info_regression(X_train, y_train)
# mi_scores = pd.Series(mi_scores, name="MI_score", index=X_train.columns)
# mi_scores = mi_scores.sort_values(ascending=False)
# df_mi_scores = pd.DataFrame(mi_scores).reset_index().rename(columns={'index':'feature'})
# df_mi_scores

In [None]:
# Calendar columns and the road id are not fed to the model; the same set is
# removed from the training and the validation features.
unused_features = ['month','day','weekday',"road","weekend",'hour','minute',"afternoon","daytime_id"]
X_train = X_train.drop(columns=unused_features)
X_valid = X_valid.drop(columns=unused_features)

In [None]:
# Preview the features that remain after dropping the calendar/road columns.
X_train.head()

In [None]:
# y_train = df_train['congestion']
# train = df_train.drop(['time','direction','month','day','weekday','weekend','hour','minute','afternoon','road','daytime_id','congestion'],axis=1)

# Build the prediction frame from df_test by removing the raw time/direction
# columns together with the calendar and road helper columns.
test_only_dropped = ['time','direction','month','day','weekday','weekend','hour','minute','afternoon','road','daytime_id']
test = df_test.drop(columns=test_only_dropped)
test.head()

# Model

In [None]:
import lightgbm as lgb

# LightGBM configuration: gradient-boosted trees, regression objective,
# evaluated with MAE, trained on the GPU device.
params = dict(
    boosting_type='gbdt',
    objective="regression",
    metric="mae",
    learning_rate=0.07,
    num_iteration=200,
    num_leaves=200,
    device='gpu',
)

# Wrap the splits as LightGBM datasets; the validation set references the
# training set.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

In [None]:
# Per-iteration metric history, filled in by the record_evaluation callback as
# lgb_results[<valid name>][<metric name>].
lgb_results = {}

# Early stopping, metric recording and log silencing are passed as callbacks:
# the `early_stopping_rounds`, `evals_result` and `verbose_eval` keyword
# arguments are no longer accepted by `lgb.train` in LightGBM 4.x.
# NOTE(review): `params` also sets "num_iteration", an alias of
# num_boost_round; per LightGBM's alias handling the value in `params` is
# expected to take precedence over the argument below — confirm and keep one.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval, lgb_train],
    valid_names=['eval', 'train'],
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        lgb.record_evaluation(lgb_results),      # collect the metric curves
        lgb.log_evaluation(period=0),            # no per-iteration log lines
    ],
)

In [None]:
import matplotlib.pyplot as plt

# MAE curves recorded during training ('l1' is the key under which the
# metric is stored in lgb_results).
loss_train = lgb_results['train']['l1']
loss_test = lgb_results['eval']['l1']

# One axes showing both curves against the boosting iteration.
fig, ax1 = plt.subplots()
ax1.set(xlabel='Iteration', ylabel='mae')
for curve, caption in ((loss_train, 'train loss'), (loss_test, 'test loss')):
    ax1.plot(curve, label=caption)

plt.legend()
plt.show()

In [None]:
# Predict congestion for the prepared test features and display the raw
# (unrounded) predictions.
lgb_prediction = model.predict(test)
lgb_prediction

In [None]:
#cat = CatBoostRegressor(logging_level='Silent', eval_metric='MAE', loss_function='MAE', random_state=42)
#ada = AdaBoostRegressor(n_estimators=500,random_state=0)
#bag = BaggingRegressor(n_estimators=500,random_state=0)
#hist = HistGradientBoostingRegressor(learning_rate=0.001)

#cat.fit(train,y_train)
#ada.fit(train,y_train)
#bag.fit(train,y_train)
#hist.fit(train,y_train)

#cat_pred = cat.predict(test)
#ada_pred = ada.predict(test)
#bad_pred = bag.predict(test)
#hist_pred = hist.predict(test)

# Submission

In [None]:
# Load the submission template, using `row_id` as the index.
sample_submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv', index_col="row_id")

In [None]:
# Display the template as loaded.
sample_submission

In [None]:
# Round the predictions to the nearest integer and write them into the
# template's `congestion` column.
rounded_prediction = np.rint(lgb_prediction)
sample_submission['congestion'] = rounded_prediction.astype(int)
# sample_submission['cat'] = cat_pred
# sample_submission['ada'] = ada_pred
# sample_submission['bag'] = bad_pred
# sample_submission['hist'] = hist_pred
sample_submission

In [None]:
# sample_submission = sample_submission.drop(['lgb'],axis=1)
# sample_submission

In [None]:
# sample_submission['congestion'] = sample_submission.mean(axis=1)

In [None]:
# Write the submission file. `row_id` is the frame's index (set when the
# template was read), and to_csv is called with its default arguments.
# sample_submission = sample_submission.astype(int)
sample_submission.to_csv('submission.csv')#,index=False)

In [None]:
# Display the frame that was written to submission.csv.
sample_submission

In [None]:

# sub = sample_submission.copy()
# sub = sub.drop(['lgb'],axis=1)

In [None]:
# df_test = df_test.set_index(df_test_idx)
# df_test['congestion'] = sample_submission['congestion']

In [None]:
# submission_in = sub.copy()

# sep = df_train[(df_train.time.dt.hour >= 12) & (df_train.time.dt.weekday < 5) &
#             (df_train.time.dt.dayofyear >= 246)]
# lower = sep.groupby(['hour', 'minute','x','y','direction']).congestion.quantile(0.15).values
# upper = sep.groupby(['hour', 'minute','y','x','direction']).congestion.quantile(0.7).values

In [None]:
# submission_out = submission_in.copy()
# submission_out['congestion'] = submission_in.congestion.clip(lower, upper)

In [None]:
# submission_out['congestion'] = submission_out['congestion'].astype(int)

In [None]:
# submission_out = submission_out.reset_index()

In [None]:
# submission_out.to_csv('submission.csv',index=False)