In [None]:
import pandas as pd

# Read the competition training set, print its row count, and preview it.
train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-mar-2022/train.csv'
)
print(train.shape[0])
train.head(n=5)

In [None]:
# Read the competition test set, print its row count, and preview it.
test = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-mar-2022/test.csv'
)
print(test.shape[0])
test.head(n=5)

In [None]:
# Parse the timestamps and derive the time-of-day keys on both frames.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'])
    # The .dt accessors yield the integers directly, avoiding a
    # strftime -> string -> int round trip over every row.
    frame['hour'] = frame['time'].dt.hour
    frame['minute'] = frame['time'].dt.minute

# Median and standard deviation of the training congestion for every
# (hour, minute, x, y, direction) slot, both truncated to integers.
slot_keys = ['hour', 'minute', 'x', 'y', 'direction']
slot_stats = (
    train.groupby(slot_keys)['congestion']
    .agg(['median', 'std'])
    # std is NaN for a slot with a single observation, and casting NaN to
    # int raises; treat such a slot as having zero spread.
    .fillna({'std': 0})
    .astype(int)
    .reset_index()
)

# Kept as separate frames under their original names.
medians = slot_stats[slot_keys + ['median']]
stds = slot_stats[slot_keys + ['std']]

# Attach the training-derived statistics to both train and test rows.
train = train.merge(slot_stats, how='left', on=slot_keys)
test = test.merge(slot_stats, how='left', on=slot_keys)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training columns only, then apply the same
# fitted scaling to both the training and the test columns.
scaled_cols = ['median', 'std']
mm = MinMaxScaler().fit(train[scaled_cols])
train[scaled_cols] = mm.transform(train[scaled_cols])
test[scaled_cols] = mm.transform(test[scaled_cols])

train.head(n=5)

In [None]:
# Preview the first rows of the test frame.
test.head(n=5)

In [None]:
# Stack train on top of test so the shared features are engineered once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the test rows reuse the training index labels, which makes any later
# label-based selection or alignment on `data` ambiguous.
data = pd.concat([train, test], ignore_index=True)
print(len(data))
data.head()

In [None]:
# Read the sample submission; it is filled in and written out at the end.
sub = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-mar-2022/sample_submission.csv'
)
sub.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Build one identifier per roadway from (x, y, direction) and label-encode it.
# The parts are joined with a separator so distinct coordinates can never
# collapse into the same key (without one, x=1,y=12 and x=11,y=2 would both
# start with "112"). For single-digit coordinates the sorted order of the
# keys, and therefore the encoded ids, is the same as without the separator.
le = LabelEncoder()
data['road'] = (
    data['x'].astype(str) + '_' + data['y'].astype(str) + '_' + data['direction']
)
data['road'] = le.fit_transform(data['road'])

data.head()

In [None]:
# Calendar and time-of-day features derived from the timestamp.
data['time'] = pd.to_datetime(data['time'])
stamp = data['time'].dt

# Integer components come straight from the .dt accessors instead of being
# formatted to strings and parsed back.
data['weekday'] = stamp.weekday.astype('int')
data['month'] = stamp.month.astype('int')
data['day'] = stamp.day.astype('int')
data['hour'] = stamp.hour.astype('int')
data['minute'] = stamp.minute.astype('int')

# Index of the 20-minute slot within the day (0..71).
data['moment'] = stamp.hour * 3 + stamp.minute // 20
# HHMM as an integer, e.g. 13:20 -> 1320.
data['hour_minute'] = data['hour'] * 100 + data['minute']
data['is_afternoon'] = (data['hour'] >= 12).astype('int')

# 5/27, 7/4 and 9/2 are public holidays (matched as yymmdd strings).
holiday_dates = ['910527', '910704', '910902']
data['is_holiday'] = stamp.strftime('%y%m%d').isin(holiday_dates).astype('int')

# Saturday (5) and Sunday (6).
data['is_dayoff'] = data['weekday'].isin([5, 6]).astype('int')

# The raw location columns are dropped; `road` is computed from them in an
# earlier cell.
data = data.drop(['x', 'y', 'direction'], axis=1)

data.head()

In [None]:
# Remove the row identifier column from the combined frame.
data = data.drop(columns=['row_id'])
data.head(n=5)

In [None]:
# Split the combined frame back by position: the first len(train) rows are
# the training rows, the remainder are the test rows.
n_train_rows = len(train)
train = data.iloc[:n_train_rows]
test = data.iloc[n_train_rows:]

In [None]:
def _int_congestion_stat(frame, keys, how, label):
    """Aggregate `congestion` per `keys` with `how`, cast to int, and name the column `label`."""
    return (
        frame.groupby(keys)['congestion']
        .agg(how)
        .astype(int)
        .rename(label)
        .reset_index()
    )


day_road_keys = ['month', 'day', 'road']
weekly_slot_keys = ['road', 'weekday', 'hour', 'minute']

# Median congestion per (month, day, road) over the rows with hour 6..11.
df_mornings = train[train.hour.between(6, 11)]
morning_avgs = _int_congestion_stat(df_mornings, day_road_keys, 'median', 'morning_avg')
train = train.merge(morning_avgs, on=day_road_keys, how='left')
test = test.merge(morning_avgs, on=day_road_keys, how='left')

# Minimum training congestion per (road, weekday, hour, minute).
mins = _int_congestion_stat(train, weekly_slot_keys, 'min', 'min')
train = train.merge(mins, on=weekly_slot_keys, how='left')
test = test.merge(mins, on=weekly_slot_keys, how='left')

# Maximum training congestion per (road, weekday, hour, minute).
maxs = _int_congestion_stat(train, weekly_slot_keys, 'max', 'max')
train = train.merge(maxs, on=weekly_slot_keys, how='left')
test = test.merge(maxs, on=weekly_slot_keys, how='left')

# hour/minute were only needed as join keys; test also loses the target column.
train = train.drop(columns=['hour', 'minute'])
test = test.drop(columns=['hour', 'minute', 'congestion'])

train.head()

In [None]:
from sklearn.decomposition import PCA

# Columns fed into the PCA.
important_features = ['moment', 'median', 'min', 'max', 'morning_avg']

X = train[important_features].copy()
X_t = test[important_features].copy()
y = train['congestion'].copy()

# Create principal components: fit on train, apply the same projection to test.
# random_state is pinned because PCA's default solver choice may select the
# randomized SVD, which would make the components differ between runs.
pca = PCA(n_components=2, random_state=42)  # original note: "5 +0.012 public score"
X_pca = pca.fit_transform(X)
X_t_pca = pca.transform(X_t)

# Convert to dataframes that carry the index of the frame they are attached
# to, so the column-wise concat below aligns row for row.
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names, index=train.index)
X_t_pca = pd.DataFrame(X_t_pca, columns=component_names, index=test.index)

train = pd.concat([train, X_pca], axis=1)
test = pd.concat([test, X_t_pca], axis=1)

train.head()

In [None]:
def _raw_slot_median(frame):
    """Return `frame`'s slot median mapped back to the congestion scale.

    The `median` column was min-max scaled together with `std` by `mm`, so it
    cannot be used directly as a stand-in for a congestion value. The scaling
    is undone here; the result is rounded because the medians were integers
    before they were scaled.
    """
    unscaled = mm.inverse_transform(frame[['median', 'std']])[:, 0]
    return pd.Series(unscaled, index=frame.index).round()


# lag1: congestion of the same road at the same time one day earlier.
day = (
    train[['time', 'road', 'congestion']]
    .rename(columns={'congestion': 'lag1'})
    .assign(time=lambda shifted: shifted['time'] + pd.Timedelta(1, unit="d"))
)
train = train.merge(day, on=['time', 'road'], how='left')
test = test.merge(day, on=['time', 'road'], how='left')

# lag7: congestion of the same road at the same time one week earlier.
week = (
    train[['time', 'road', 'congestion']]
    .rename(columns={'congestion': 'lag7'})
    .assign(time=lambda shifted: shifted['time'] + pd.Timedelta(7, unit="d"))
)
train = train.merge(week, on=['time', 'road'], how='left')
test = test.merge(week, on=['time', 'road'], how='left')

# Rows without a matching earlier observation fall back to the slot median on
# the congestion scale. Test is filled the same way as train so both frames
# present the lag columns to the models consistently.
for frame in (train, test):
    fallback = _raw_slot_median(frame)
    frame['lag1'] = frame['lag1'].fillna(fallback)
    frame['lag7'] = frame['lag7'].fillna(fallback)

train = train.drop('time', axis=1)
test = test.drop('time', axis=1)

In [None]:
# Persist the engineered feature tables, then separate the target from train.
for frame, out_path in ((train, 'train_lgb.csv'), (test, 'test_lgb.csv')):
    frame.to_csv(out_path, index=False)

train_y = train.pop('congestion')
train.head(n=5)

# モデルの作成

In [None]:
# True: fit the models in this notebook. False: load pickled models instead.
train_mode: bool = True

In [None]:
if train_mode:
    import lightgbm as lgb
    from catboost import CatBoostRegressor
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_absolute_error
    import numpy as np
    import pickle

    # Upper bound on LightGBM boosting rounds; early stopping usually ends sooner.
    num_round = 15000

    # Hyperparameter settings
    params = {'objective': 'regression', 
              'learning_rate': 0.05,
              'seed': 42,  
              'verbose': -1,
              'metrics': 'mae', 
              'device' : 'cpu'}

    param_cat = {
            'iterations': 10000,
            'use_best_model': True,
            'loss_function' : 'MAE',
            'task_type' : 'CPU', 
            'grow_policy' : 'SymmetricTree',
            'learning_rate': 0.01,
            'l2_leaf_reg' : 0.2,
            'random_state': 0
         }

    valid_scores_lgb = []
    valid_scores_cat = []
    models_1 = []
    models_2 = []
    fold_num = 10

    # Columns CatBoost is told to treat as categorical.
    categorical_features = ['is_afternoon', 'is_holiday', 'is_dayoff']

    kf = KFold(n_splits=fold_num, shuffle=True, random_state=42)

    for fold, (train_indices, valid_indices) in enumerate(kf.split(train)):
        X_train, X_valid = train.iloc[train_indices], train.iloc[valid_indices]
        y_train, y_valid = train_y.iloc[train_indices], train_y.iloc[valid_indices]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid)

        # Early stopping is passed as a callback: the `early_stopping_rounds`
        # and `verbose_eval` keyword arguments were removed from lgb.train in
        # LightGBM 4, and 'mae' was never a valid `verbose_eval` value.
        # Training stops once the validation metric has not improved for 100
        # consecutive rounds, which guards against overfitting.
        model_lgb = lgb.train(
            params,
            lgb_train,
            num_boost_round=num_round,
            valid_sets=[lgb_eval],
            callbacks=[lgb.early_stopping(stopping_rounds=100)],
        )

        model_ct = CatBoostRegressor(verbose=False, eval_metric='MAE', **param_cat)
        model_ct.fit(X_train, y_train, eval_set=(X_valid,y_valid), cat_features= categorical_features)

        # Save and score the LightGBM model of this fold. The context manager
        # closes the file handle, which a bare open() call never does.
        file_1 = 'trained_model1_'+str(fold)+'.pkl'
        with open(file_1, 'wb') as model_file:
            pickle.dump(model_lgb, model_file)
        y_valid_pred = model_lgb.predict(X_valid)
        score = mean_absolute_error(y_valid, y_valid_pred)
        print(f'lightgbm fold {fold} MAE: {score}')
        valid_scores_lgb.append(score)

        # Save and score the CatBoost model of this fold.
        file_2 = 'trained_model2_'+str(fold)+'.pkl'
        with open(file_2, 'wb') as model_file:
            pickle.dump(model_ct, model_file)
        y_valid_pred = model_ct.predict(X_valid)
        score = mean_absolute_error(y_valid, y_valid_pred)
        print(f'catboost fold {fold} MAE: {score}')
        valid_scores_cat.append(score)

        models_1.append(model_lgb)
        models_2.append(model_ct)

    # Cross-validation score: mean of the per-fold validation MAEs.
    cv_score_lgb = np.mean(valid_scores_lgb)
    print(f'lgb CV score: {cv_score_lgb}')

    cv_score_cat = np.mean(valid_scores_cat)
    print(f'cat CV score: {cv_score_cat}')

In [None]:
if not train_mode:
    import pickle

    # Load the previously trained first-model (trained_model1_*) pickles.
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # model files from a source you trust.
    # NOTE(review): the training cell writes 10 folds, only 5 are read here;
    # confirm this matches the contents of the attached dataset.
    models_1 = []
    fold_num = 5
    for i in range(fold_num):
        model_path = '../input/trainedmodels7/trained_model1_'+str(i)+'.pkl'
        # The context manager closes the file handle after loading.
        with open(model_path, 'rb') as model_file:
            model_1 = pickle.load(model_file)
        models_1.append(model_1)

In [None]:
import numpy as np
# Predict the test set with every model in models_1 and average the
# predictions element-wise across models.
preds = [fold_model.predict(test) for fold_model in models_1]
model_pred_1 = np.mean(preds, axis=0)
model_pred_1

In [None]:
if not train_mode:
    import pickle

    # Load the previously trained second-model (trained_model2_*) pickles.
    # Each model used to be loaded twice from two different datasets, with the
    # second load overwriting the first; only the load that took effect
    # ('trainedmodels7') is kept, so the other dataset is no longer required.
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # model files from a source you trust.
    models_2 = []
    fold_num = 5

    for i in range(fold_num):
        model_path = '../input/trainedmodels7/trained_model2_'+str(i)+'.pkl'
        # The context manager closes the file handle after loading.
        with open(model_path, 'rb') as model_file:
            model_2 = pickle.load(model_file)
        models_2.append(model_2)

In [None]:
# Predict the test set with every model in models_2 and average the
# predictions element-wise across models.
preds = [fold_model.predict(test) for fold_model in models_2]
model_pred_2 = np.mean(preds, axis=0)
model_pred_2

In [None]:
# Blend the two averaged predictions with equal weight and write the submission.
# The blend is rounded to the nearest integer before the cast: a bare
# astype('int') truncates toward zero, which pushes every positive prediction
# down by up to one unit.
blend = 0.5*model_pred_1 + 0.5*model_pred_2
sub['congestion'] = blend.round().astype('int')
sub.to_csv('submission.csv', index=False)