In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names. Use the new name when the installed
# Matplotlib ships it and fall back to the legacy name on older installs.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
# from sklearn.ensemble import GradientBoostingRegressor
import lightgbm
# from lightgbm import LGBMRegressor 

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv')

In [None]:
# Display the raw training frame.
train

In [None]:
# Display the raw test frame.
test

In [None]:
# Build working frames without the 'row_id' identifier column. `drop` returns a
# new frame, so the raw `train` / `test` frames (and test['row_id'], which the
# submission cell reads) stay untouched.
# The column is named via `columns=` because pandas 2.0 no longer accepts the
# axis positionally (`drop('row_id', 1)` raises TypeError there).
df_train = train.drop(columns='row_id')
df_test = test.drop(columns='row_id')

In [None]:
# Parse the timestamp strings into datetime64 values so the `.dt` accessors used
# by the feature-engineering cells work.
# No explicit `format` is passed: later cells read day, hour and minute from this
# column, which a '%Y-%m' pattern cannot describe, and pandas >= 2.0 raises when
# a format does not match the whole string. Letting pandas infer the layout
# handles ISO-style timestamps on old and new pandas alike.
# NOTE(review): assumes the raw strings are ISO-like (e.g. "YYYY-MM-DD HH:MM:SS") - confirm against train.csv.
df_train['time'] = pd.to_datetime(df_train['time'])
# df_train = df_train.set_index('time')
df_train

In [None]:
# Parse the test timestamps into datetime64 values so the `.dt` accessors used
# by the feature-engineering cells work.
# No explicit `format` is passed: later cells read day, hour and minute from this
# column, which a '%Y-%m' pattern cannot describe, and pandas >= 2.0 raises when
# a format does not match the whole string. Letting pandas infer the layout
# handles ISO-style timestamps on old and new pandas alike.
# NOTE(review): assumes the raw strings are ISO-like (e.g. "YYYY-MM-DD HH:MM:SS") - confirm against test.csv.
df_test['time'] = pd.to_datetime(df_test['time'])
# df_test = df_test.set_index('time')
df_test

In [None]:
# df_train['time_step'] = np.arange(len(df_train.index))
# df_train

In [None]:
def details(df):
    """Summarise every column of ``df`` in a single table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with four columns: 'Missing Values'
        (null count), '% of Total Missing Values' (null count as a percentage
        of ``len(df)``), 'Data_Type' (the column dtype) and 'Unique values'
        (``nunique`` of the column).
    """
    missing = df.isnull().sum()
    summary = pd.concat(
        [missing, 100 * (missing / len(df)), df.dtypes, df.nunique()],
        axis=1,
    )
    # The concatenated pieces are unnamed, so label the four columns directly.
    summary.columns = [
        'Missing Values',
        '% of Total Missing Values',
        'Data_Type',
        'Unique values',
    ]
    return summary

In [None]:
# Null counts, null percentages, dtypes and unique-value counts for the training frame.
details(df_train)

In [None]:
# The same summary for the test frame.
details(df_test)

In [None]:
# Line plot of the target in row order (the x-axis is the frame's index).
plt.figure(figsize=(20, 6))
plt.plot(df_train['congestion'])
plt.title('congestion')
plt.show()

In [None]:
# Histogram of the target values.
plt.figure(figsize=(10, 5))
sns.histplot(df_train['congestion'])

In [None]:
# Summary statistics with extra percentiles in both tails.
df_train.describe(percentiles=(.01,.05,.25,.5,.75,.9,.95,.99))

In [None]:
# Row counts per value of 'direction', 'x' and 'y' (one cell each).
df_train['direction'].value_counts()

In [None]:
df_train['x'].value_counts()

In [None]:
df_train['y'].value_counts()

In [None]:
# Column means for every (x, y, direction) combination.
df_train.groupby(['x','y','direction']).mean()

In [None]:
df_train['weekday_no'] = df_train['time'].dt.dayofweek
df_train['week_day'] = df_train['time'].dt.day_name()
df_train['day_no'] = df_train['time'].dt.day
df_train['hr_day'] = df_train['time'].dt.hour
df_train['min_day'] = df_train['time'].dt.minute
df_train['month_name'] = df_train['time'].dt.month_name()
df_train['month'] = df_train['time'].dt.month
df_train['year_dayno'] = df_train['time'].dt.dayofyear
df_train

In [None]:
df_test['weekday_no'] = df_test['time'].dt.dayofweek
df_test['week_day'] = df_test['time'].dt.day_name()
df_test['day_no'] = df_test['time'].dt.day
df_test['hr_day'] = df_test['time'].dt.hour
df_test['min_day'] = df_test['time'].dt.minute
df_test['month_name'] = df_test['time'].dt.month_name()
df_test['month'] = df_test['time'].dt.month
df_test['year_dayno'] = df_test['time'].dt.dayofyear
df_test

In [None]:
# Row counts per value of each derived calendar feature in the training data
# (one cell per feature).
df_train['weekday_no'].value_counts()

In [None]:
df_train['week_day'].value_counts()

In [None]:
df_train['day_no'].value_counts()

In [None]:
df_train['hr_day'].value_counts()

In [None]:
df_train['min_day'].value_counts()

In [None]:
df_train['month_name'].value_counts()

In [None]:
df_train['month'].value_counts()

In [None]:
df_train['year_dayno'].value_counts()

In [None]:
# Display the training frame including the derived calendar columns.
df_train

In [None]:
# Split the training frame into features (X) and target (y), and build the
# matching test feature frame. 'weekday_no' and 'month' are dropped while their
# name-valued twins 'week_day' and 'month_name' stay (those are one-hot encoded
# in the next step); 'time' itself is dropped as well.
# `columns=` is spelled out because pandas 2.0 rejects the positional axis
# (`drop(cols, 1)` raises TypeError there).
X = df_train.drop(columns=['time', 'congestion', 'weekday_no', 'month'])
y = df_train['congestion']
X_test = df_test.drop(columns=['time', 'weekday_no', 'month'])

In [None]:
# One-hot encode the three categorical columns. The encoder is fitted on the
# training features only and re-used for the test features so both frames get
# the same dummy columns in the same order.
categorical_cols = ['direction', 'week_day', 'month_name']

# The encoder keeps its default (sparse) output and the result is densified with
# `.toarray()`. This avoids the `sparse=` keyword, which newer scikit-learn
# replaced with `sparse_output=`, so the cell runs on old and new versions.
ohe = OneHotEncoder()
train_dummies = ohe.fit_transform(X[categorical_cols]).toarray()
test_dummies = ohe.transform(X_test[categorical_cols]).toarray()

# `get_feature_names()` was removed from newer scikit-learn; prefer its
# replacement and only fall back on releases that predate it.
# NOTE: the replacement prefixes dummies with the source column name
# (e.g. 'direction_...') instead of the positional 'x0_...'.
if hasattr(ohe, 'get_feature_names_out'):
    dummy_names = ohe.get_feature_names_out()
else:
    dummy_names = ohe.get_feature_names()

# Carry over the source frames' indexes so the column-wise concat that follows
# lines rows up by label even if X / X_test ever lose their default index.
ohe_df1 = pd.DataFrame(train_dummies, columns=dummy_names, index=X.index)
ohe_df2 = pd.DataFrame(test_dummies, columns=dummy_names, index=X_test.index)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
# Axis arguments are passed by keyword (pandas 2.0 rejects the positional `1`
# in both `drop` and `pd.concat`) and nothing is mutated in place.
categorical_cols = ['direction', 'week_day', 'month_name']
X = pd.concat([X.drop(columns=categorical_cols), ohe_df1], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_cols), ohe_df2], axis=1)

In [None]:
# Display the training features after one-hot encoding.
X

In [None]:
# Column names of the encoded training features.
X.columns

In [None]:
# Display the test features after one-hot encoding.
X_test

In [None]:
# Standardise the numeric feature columns. The scaler is fitted on the training
# features and the same fitted scaler is then applied to the test features.
numeric_cols = ['x', 'y', 'day_no', 'hr_day', 'min_day', 'year_dayno']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Hold out 30% of the rows as a validation set; random_state pins the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=23)
# Show the shapes of the four resulting pieces.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Accumulators filled by the model_fit_evaluation* helpers, one entry per run:
# the algorithm label, the tuning label, and train/validation R2 and MAE.
model, resample = [], []
r2score_train, r2score_valid = [], []
error_train, error_valid = [], []

In [None]:
# neg_mean_absolute_error

In [None]:
# Candidate estimators. Every estimator that takes a seed is given random_state=23.
model_LR = LinearRegression()
model_DT = DecisionTreeRegressor(random_state=23)
model_RF = RandomForestRegressor(random_state=23)  # oob_score = True, bootstrap = True,
model_LGBM = lightgbm.LGBMRegressor(objective='regression', random_state=23)
# model_XGB = XGBRegressor(use_rmm=True, random_state=23)

# Echo each estimator's repr, one per line (model_XGB is disabled above).
for estimator in (model_LR, model_DT, model_RF, model_LGBM):
    print(estimator)

In [None]:
# Candidate values handed to RandomizedSearchCV for the decision tree; all three
# hyper-parameters are searched over the same grid of values.
_tree_grid = [5, 10, 20, 50, 100, 200]
params_DT = dict(
    max_depth=list(_tree_grid),
    min_samples_leaf=list(_tree_grid),
    min_samples_split=list(_tree_grid),
)
# 'min_samples_split': 100, 'min_samples_leaf': 10, 'max_depth': 20
# 'min_samples_split': 50, 'min_samples_leaf': 5, 'max_depth': 200


# Candidate values handed to RandomizedSearchCV for the random forest.
# `oob_score` and `bootstrap` are real booleans: the string 'True' only worked
# because any non-empty string is truthy, and scikit-learn releases that
# validate parameter types reject it with an InvalidParameterError.
# `max_samples` requires bootstrap sampling, which is why bootstrap is pinned on.
params_RF = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, 50, 100, 200],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'min_samples_split': [5, 10, 20, 50, 100, 200],
    'oob_score': [True],
    'bootstrap': [True],
    'max_samples': [0.4, 0.8],
}

# params_GBM = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [5, 10, 20, 50, 100, 200],
#     'min_samples_leaf': [5, 10, 20, 50, 100, 200],
#     'min_samples_split' : [5, 10, 20, 50, 100, 200],
#     'subsample': [0.3, 0.7],
#     'learning_rate': [0.01, 0.1]    
# }

#RF
#'n_estimators': 50, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_depth': 200
# 'n_estimators': 50, 'min_samples_split': 100, 'min_samples_leaf': 5, 'max_depth': 100
# 'oob_score': 'True', 'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 50, 'max_samples': 0.8, 'max_depth': 100, 'bootstrap': 'True'

# params_XGB = {
#     'n_estimators': [50], #[5, 10, 20, 50, 100, 200, 500],
#     'sampling_method': ['gradient_based'],
#     'tree_method': ['gpu_hist'],
#     'max_depth': [2, 5, 10, 15, 20, 30],  
#     'min_child_weight': [1, 2, 5],
# #     'early_stopping_rounds': [5],
# #     'subsample': [0.5,1],
#     'learning_rate': [0.01]
# #     alpha
# }

# Candidate values handed to RandomizedSearchCV for the LightGBM regressor.
params_LGBM = {
    'n_estimators': [50, 100, 200],
    'boosting_type': ['gbdt', 'dart'],
    'max_depth': [5, 10, 20, 50, 100, 200],
    'min_child_samples': [5, 10, 20, 50, 100, 200],
#     'early_stopping_round': [5],
    'subsample': [0.2, 0.5, 0.7, 1.0],
    # LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
    # (bagging_freq) is non-zero, and its default is 0. Bag on every iteration
    # so the `subsample` candidates above actually change the fitted model.
    'subsample_freq': [1],
    'learning_rate': [0.01, 0.1],
#     'lambda'
}
# 'subsample': 0.2, 'n_estimators': 200, 'min_child_samples': 50, 'max_depth': 20, 'learning_rate': 0.1, 'early_stopping': 5, 'boosting_type': 'dart'

In [None]:
# List the parameter names the LightGBM estimator exposes (reference for params_LGBM keys).
model_LGBM.get_params().keys()

In [None]:
def model_fit_evaluation1(model_model, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Fit an estimator as given and record its train/validation R2 and MAE.

    The four scores are printed and then appended, together with ``algo`` and
    ``sampling``, to the notebook-level lists ``model``, ``resample``,
    ``r2score_train``, ``r2score_valid``, ``error_train`` and ``error_valid``.

    Parameters
    ----------
    model_model : estimator
        Object exposing ``fit(X, y)`` (whose return value is used for
        prediction) and ``predict(X)``.
    X_train, y_train
        Training features and target.
    X_valid, y_valid
        Validation features and target.
    algo
        Label appended to ``model``.
    sampling
        Label appended to ``resample``.

    Returns
    -------
    None
    """
    fitted = model_model.fit(X_train, y_train)

    # Training-set scores
    y_train_pred = fitted.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Validation-set scores. The MAE is computed against the `y_valid` argument;
    # reading the notebook-global `y_val` here instead would silently ignore the
    # validation target the caller passed in.
    y_val_pred = fitted.predict(X_valid)
    r2_valid = r2_score(y_valid, y_val_pred)
    mae_val = mean_absolute_error(y_valid, y_val_pred)

    report = (
        ('R-Squared_train', r2_train),
        ('R-Squared_valid', r2_valid),
        ('Mean absolute error - train', mae_train),
        ('Mean absolute error - val', mae_val),
    )
    for heading, score in report:
        print(heading)
        print('='*60)
        print(score, "\n")

    # Record this run in the shared result lists.
    model.append(algo)
    resample.append(sampling)
    r2score_train.append(r2_train)
    r2score_valid.append(r2_valid)
    error_train.append(mae_train)
    error_valid.append(mae_val)

In [None]:
def model_fit_evaluation2(model_model, params, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Tune an estimator with a randomized search and record the best model's scores.

    Runs ``RandomizedSearchCV`` (5-fold, scored by negative MAE, seeded with 23)
    over ``params``, prints the search configuration and outcome, then scores
    the search's ``best_estimator_`` on the training and validation data. The
    four scores are printed and appended, together with ``algo`` and
    ``sampling``, to the notebook-level lists ``model``, ``resample``,
    ``r2score_train``, ``r2score_valid``, ``error_train`` and ``error_valid``.

    Parameters
    ----------
    model_model : estimator
        Base estimator handed to the search.
    params : dict
        Candidate values per hyper-parameter, passed to the search as-is.
    X_train, y_train
        Training features and target; the search is fitted on these.
    X_valid, y_valid
        Validation features and target.
    algo
        Label appended to ``model``.
    sampling
        Label appended to ``resample``.

    Returns
    -------
    None
    """
#     cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=23)
    search = RandomizedSearchCV(
        model_model,
        params,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1,
        random_state=23,
    )
    rcv = search.fit(X_train, y_train)

    print('\n')
    print('get params: ', rcv.get_params())
    print('best estimator : ', rcv.best_estimator_)
    print('best parameters: ', rcv.best_params_)
    print('best score: ', rcv.best_score_)
    print('\n')

    best = rcv.best_estimator_

    # Training-set scores
    y_train_pred = best.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Validation-set scores. The MAE is computed against the `y_valid` argument;
    # reading the notebook-global `y_val` here instead would silently ignore the
    # validation target the caller passed in.
    y_val_pred = best.predict(X_valid)
    r2_valid = r2_score(y_valid, y_val_pred)
    mae_val = mean_absolute_error(y_valid, y_val_pred)

    report = (
        ('R-Squared_train', r2_train),
        ('R-Squared_valid', r2_valid),
        ('Mean absolute error - train', mae_train),
        ('Mean absolute error - val', mae_val),
    )
    for heading, score in report:
        print(heading)
        print('='*60)
        print(score, "\n")

    # Record this run in the shared result lists.
    model.append(algo)
    resample.append(sampling)
    r2score_train.append(r2_train)
    r2score_valid.append(r2_valid)
    error_train.append(mae_train)
    error_valid.append(mae_val)

In [None]:
# Linear regression, fitted as configured (no search).
model_fit_evaluation1(model_LR, X_train, y_train, X_val, y_val, 'Linear Regression', 'without HPT')

In [None]:
%%time
# Decision tree, fitted as configured (no search); the cell is timed.
model_fit_evaluation1(model_DT, X_train, y_train, X_val, y_val, 'Decision Tree', 'without HPT')

In [None]:
# Decision tree tuned by randomized search over params_DT.
model_fit_evaluation2(model_DT, params_DT, X_train, y_train, X_val, y_val, 'Decision Tree', 'with HPT')

In [None]:
%%time
# Random forest, fitted as configured (no search); the cell is timed.
model_fit_evaluation1(model_RF, X_train, y_train, X_val, y_val, 'Random Forest', 'without HPT')

In [None]:
# Random forest tuned by randomized search over params_RF.
model_fit_evaluation2(model_RF, params_RF, X_train, y_train, X_val, y_val, 'Random Forest', 'with HPT')

In [None]:
%%time
# LightGBM, fitted as configured (no search); the cell is timed.
model_fit_evaluation1(model_LGBM, X_train, y_train, X_val, y_val, 'LGBM', 'without HPT')

In [None]:
# LightGBM tuned by randomized search over params_LGBM.
model_fit_evaluation2(model_LGBM, params_LGBM, X_train, y_train, X_val, y_val, 'LGBM', 'with HPT')

In [None]:
# %%time
# model_fit_evaluation1(model_XGB, X_train, y_train, X_val, y_val, 'XGB Regressor', 'without HPT')

In [None]:
# model_fit_evaluation2(model_XGB, params_XGB, X_train, y_train, X_val, y_val, 'XGB Regressor', 'with HPT')

In [None]:
# DT: 0, 7, 5.5, 6.3
# RF: 2.1, 5.8, 6.1, 6.2


In [None]:
# Gather the per-run labels and scores collected by the evaluation helpers into
# one comparison table (one row per evaluated run).
eval_columns = {
    'model': model,
    'resample': resample,
    'r2score_train': r2score_train,
    'r2score_valid': r2score_valid,
    'error_train': error_train,
    'error_valid': error_valid,
}
eval_df = pd.DataFrame(eval_columns)
eval_df

In [None]:
# Compare the shapes of the full training features and the test features.
X.shape, X_test.shape

In [None]:
# Final model: run the random-forest randomized search on the full training
# data (X, y, no hold-out) and use the search's best estimator to predict the
# test set.
rc1 = RandomizedSearchCV(
    model_RF,
    params_RF,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=23,
)
rcv1 = rc1.fit(X, y)
# rcv = rc.best_estimator_

print('\n')
for caption, value in (
    ('get params: ', rcv1.get_params()),
    ('best estimator : ', rcv1.best_estimator_),
    ('best parameters: ', rcv1.best_params_),
    ('best score: ', rcv1.best_score_),
):
    print(caption, value)
print('\n')

best_rf = rcv1.best_estimator_

# In-sample fit quality on the full training data
y_train_pred1 = best_rf.predict(X)
r2_train1 = r2_score(y, y_train_pred1)
mae_train1 = mean_absolute_error(y, y_train_pred1)

# Test-set predictions (not scored in this cell)
y_test_pred1 = best_rf.predict(X_test)
# r2_test = r2_score(y, y_test_pred)
# mae_val = mean_absolute_error(y, y_test_pred)

for heading, score in (
    ('R-Squared_train', r2_train1),
    ('Mean absolute error - train', mae_train1),
):
    print(heading)
    print('='*60)
    print(score, "\n")

In [None]:
# rc2 = RandomizedSearchCV(model_LGBM, params_LGBM, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1, random_state=23)
# rcv2 = rc2.fit(X, y)
# # rcv = rc.best_estimator_

# print('\n')
# print('get params: ', rcv2.get_params())
# print('best estimator : ', rcv2.best_estimator_)
# print('best parameters: ', rcv2.best_params_)
# print('best score: ', rcv2.best_score_)
# print('\n')

#     # Train set prediction
# y_train_pred2 = (rcv2.best_estimator_).predict(X)
# r2_train2 = r2_score(y, y_train_pred2)

# #Mean Absolute error - train
# mae_train2 = mean_absolute_error(y, y_train_pred2)

# # Test set prediction    
# y_test_pred2 = (rcv2.best_estimator_).predict(X_test)
# # r2_test = r2_score(y, y_test_pred)

# #Mean Absolute error - test
# # mae_val = mean_absolute_error(y, y_test_pred)

# print('R-Squared_train')
# print('='*60)
# print(r2_train2,"\n")   
# print('Mean absolute error - train')
# print('='*60)
# print(mae_train2,"\n") 

In [None]:
# Pair each test row_id (taken from the raw `test` frame) with its predicted congestion.
predictions = y_test_pred1
submission = pd.DataFrame({ 'row_id' : test['row_id'], 'congestion': predictions })
# Show the shape of the submission table.
submission.shape

In [None]:
# Display the submission table.
submission

In [None]:
# Write the submission to the working directory without the index column, then display it.
submission.to_csv('submission.csv', index = False)
submission