In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-mar-2022/train.csv
/kaggle/input/tabular-playground-series-mar-2022/test.csv


In [2]:
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv', index_col=0)
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv',index_col=0)

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 848835 entries, 0 to 848834
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time        848835 non-null  object
 1   x           848835 non-null  int64 
 2   y           848835 non-null  int64 
 3   direction   848835 non-null  object
 4   congestion  848835 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 38.9+ MB


# #Data Preprocessing
# 1. x, y, direction => 비슷한 위치 value니까 하나로 합친 후 encoding 수행
# 2. Time -> 주말/주중으로 구분

In [4]:
train_df.head()

Unnamed: 0_level_0,time,x,y,direction,congestion
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1991-04-01 00:00:00,0,0,EB,70
1,1991-04-01 00:00:00,0,0,NB,49
2,1991-04-01 00:00:00,0,0,SB,24
3,1991-04-01 00:00:00,0,1,EB,18
4,1991-04-01 00:00:00,0,1,NB,60


In [5]:
#time이 dtype이 object니까 timedata로 바꾸자.

train_df['time'] = pd.to_datetime(train_df['time'])
train_df['month'] = train_df['time'].dt.month
train_df['day'] = train_df['time'].dt.day
train_df['hour'] = train_df['time'].dt.hour
train_df['minute'] = train_df['time'].dt.minute

train_df.head()


Unnamed: 0_level_0,time,x,y,direction,congestion,month,day,hour,minute
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1991-04-01,0,0,EB,70,4,1,0,0
1,1991-04-01,0,0,NB,49,4,1,0,0
2,1991-04-01,0,0,SB,24,4,1,0,0
3,1991-04-01,0,1,EB,18,4,1,0,0
4,1991-04-01,0,1,NB,60,4,1,0,0


In [6]:
train_df['month'] = train_df['time'].dt.month.astype(int)
pd.to_datetime(train_df['month'], format='%m')
train_df['day'] = train_df['time'].dt.day.astype(int)
pd.to_datetime(train_df['day'],format='%d')
train_df['hour'] = train_df['time'].dt.hour.astype(int)
pd.to_datetime(train_df['hour'],format='%H')
train_df['minute'] = train_df['time'].dt.minute.astype(int)
pd.to_datetime(train_df['minute'],format='%M')


row_id
0        1900-01-01 00:00:00
1        1900-01-01 00:00:00
2        1900-01-01 00:00:00
3        1900-01-01 00:00:00
4        1900-01-01 00:00:00
                 ...        
848830   1900-01-01 00:40:00
848831   1900-01-01 00:40:00
848832   1900-01-01 00:40:00
848833   1900-01-01 00:40:00
848834   1900-01-01 00:40:00
Name: minute, Length: 848835, dtype: datetime64[ns]

In [7]:
train_df['dayofweek'] = train_df['time'].apply(lambda x: x.weekday()) #0 means Monday..
#주중, 주말 0 means weekday, 1 means weekend
train_df['weekday_weekend'] = train_df['dayofweek'].apply(lambda x: 0 if 0<= x <=4 else 1)

train_df.drop(columns = ['time','month', 'day'], inplace = True)

In [8]:
train_df['weekday_weekend'].value_counts()
train_df['dayofweek'].value_counts()

0    124020
6    121680
4    121290
1    121225
2    121095
5    119925
3    119600
Name: dayofweek, dtype: int64

In [9]:
test_df['time'] = pd.to_datetime(test_df['time'])
test_df['month'] = test_df['time'].dt.month
test_df['day'] = test_df['time'].dt.day
test_df['hour'] = test_df['time'].dt.hour
test_df['minute'] = test_df['time'].dt.minute

test_df['month'] = test_df['time'].dt.month.astype(int)
pd.to_datetime(test_df['month'], format='%m')
test_df['day'] = test_df['time'].dt.day.astype(int)
pd.to_datetime(test_df['day'], format='%d')
test_df['hour'] = test_df['time'].dt.hour.astype(int)
pd.to_datetime(test_df['hour'],format='%H')
test_df['minute'] = test_df['time'].dt.minute.astype(int)
pd.to_datetime(test_df['minute'],format='%M')

test_df['dayofweek'] = test_df['time'].apply(lambda x: x.weekday()) #0 means Monday..
#주중, 주말 0 means weekday, 1 means weekend
test_df['weekday_weekend'] = test_df['dayofweek'].apply(lambda x: 0 if 0<= x <=4 else 1)

test_df.drop(columns = ['time', 'month', 'day'], inplace = True)

test_df.head()

Unnamed: 0_level_0,x,y,direction,hour,minute,dayofweek,weekday_weekend
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
848835,0,0,EB,12,0,0,0
848836,0,0,NB,12,0,0,0
848837,0,0,SB,12,0,0,0
848838,0,1,EB,12,0,0,0
848839,0,1,NB,12,0,0,0


# Direction과 x,y 합쳤다가 망했으니 그러지 말자.

# Encoding Time!

In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoded = encoder.fit_transform(train_df['direction'])
train_df['direction'] = encoded




In [11]:
test_df['x'].value_counts()


2    936
1    792
0    612
Name: x, dtype: int64

In [12]:
test_df['direction'] = encoder.transform(test_df['direction'])

# DATA SPLIT

In [13]:
from sklearn.model_selection import train_test_split
y_train_df = train_df['congestion']
X_train_df = train_df.drop('congestion', axis=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_df, y_train_df, test_size=0.2, shuffle=False, random_state=0)

In [14]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(679068, 7) (679068,)
(169767, 7) (169767,)


# Modeling

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error

import time


In [16]:
linr = LinearRegression()
knn = KNeighborsRegressor(n_neighbors = 15)
rf = RandomForestRegressor(n_estimators=20, max_depth=10,random_state=0)
lgbm = LGBMRegressor(learning_rate=0.05, max_depth=6, n_estimators=300)
xgb = XGBRegressor(learning_rate=0.5,max_depth=10,n_estimators=100)
cat = CatBoostRegressor(iterations=2,learning_rate=1,depth=2)
models = [linr, knn, rf, lgbm, xgb, cat]

In [17]:
for model in models:
    start_time=time.time()
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    score=mean_absolute_error(val_pred, y_val)
    print(f'{model.__class__.__name__} : {score}, total time: {time.time()-start_time}')
    

LinearRegression : 13.534435768655669, total time: 0.1538848876953125
KNeighborsRegressor : 6.79658590892223, total time: 12.292938470840454
RandomForestRegressor : 7.072774287776391, total time: 14.640065908432007
LGBMRegressor : 7.207969572576376, total time: 4.869458913803101
XGBRegressor : 6.676641783029263, total time: 34.8335382938385
0:	learn: 16.3722408	total: 97.4ms	remaining: 97.4ms
1:	learn: 15.7993265	total: 131ms	remaining: 0us
CatBoostRegressor : 12.93215140298916, total time: 0.3412191867828369


# #Prediction

In [18]:
final_model = XGBRegressor(learning_rate=0.5,max_depth=15,n_estimators=200)
final_model.fit(X_train, y_train)
final_pred = final_model.predict(test_df)
final_pred = np.round(final_pred,0)
test_df['congestion'] = final_pred

In [19]:
test_df.head()

Unnamed: 0_level_0,x,y,direction,hour,minute,dayofweek,weekday_weekend,congestion
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
848835,0,0,0,12,0,0,0,45.0
848836,0,0,1,12,0,0,0,35.0
848837,0,0,4,12,0,0,0,56.0
848838,0,1,0,12,0,0,0,22.0
848839,0,1,1,12,0,0,0,73.0


In [20]:
submission = test_df.reset_index()[['row_id','congestion']]
submission.head()

Unnamed: 0,row_id,congestion
0,848835,45.0
1,848836,35.0
2,848837,56.0
3,848838,22.0
4,848839,73.0


In [21]:
submission.to_csv('submission.csv', index=False)