In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.express as px

In [3]:
# Read the competition CSVs; the 'time' column is parsed into datetimes on load.
train_df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2022/train.csv",
    parse_dates=['time'],
)
test_df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2022/test.csv",
    parse_dates=['time'],
)

# Every training column except the row identifier and the target.
elements = [col for col in train_df.columns if col not in ('row_id', 'congestion')]

# Display (rows, columns) of both frames.
train_df.shape, test_df.shape

In [4]:
# Preview the first rows of the training frame.
train_df.head()

In [5]:
# Distinct timestamps present in the training data.
train_df.time.unique() 
# Observed date range: 1991-04-01 ~ 1991-09-30 11:40:00
# Observed sampling interval: 20 minutes (00:00:00, 00:20:00, 00:40:00, ...)

In [6]:
# Preview the first rows of the test frame.
test_df.head()

In [7]:
# Distinct timestamps present in the test data.
test_df.time.unique() 
# Observed: the task is to predict congestion on 1991-09-30 between 12:00:00 and 23:40:00

In [8]:
# Line plot of every training row's congestion value against its timestamp.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(train_df['time'], train_df['congestion'])
ax.set(ylabel='Congestion', xlabel='Time', title='congestion level over time')
plt.show()
# Observation recorded from the plot: no missing data

In [9]:
# Print, per feature column, the percentage of rows whose value is null.
for col in elements:
    null_share = train_df[col].isnull().sum() / train_df[col].shape[0]
    print('column: {:>10}\t Percent of NAN value: {:.2f}%'.format(col, 100 * null_share))
# Observation recorded from the output: no null values

In [10]:
# drop holidays - the date we would predict on is not holiday
from datetime import date
import holidays

# Build the US holiday calendar once and reuse it: isHoliday is applied to
# every row of the data, and constructing a new calendar per call is wasted work.
_US_HOLIDAYS = holidays.US()


def isHoliday(d):
    """Return whether `d` is a US holiday according to the `holidays` package.

    Parameters
    ----------
    d : date-like
        Any value the `holidays` calendar accepts for an `in` lookup; this
        notebook passes the values of the parsed 'time' column.

    Returns
    -------
    bool
        True if `d` is found in the US holiday calendar, False otherwise.
    """
    return d in _US_HOLIDAYS

In [11]:
# Calendar-date part of every training timestamp.
train_df.time.dt.date

In [12]:
# Distinct calendar dates in the training data that isHoliday flags.
train_holiday_mask = train_df["time"].apply(isHoliday)
train_df.loc[train_holiday_mask, "time"].dt.date.unique()
# Recorded result: holiday - May 27, Jun 4, Sep 2
# NOTE(review): "Jun 4" is presumably a typo for Jul 4 (US Independence Day) - confirm against the cell output

In [13]:
# (rows, columns) of the training frame, for comparison after holiday rows are removed.
train_df.shape

In [14]:
# Remove every training row that falls on a US holiday.
# The holiday check is evaluated once per distinct calendar date instead of
# once per row; the rows selected are the same because the check only
# depends on the date part of the timestamp.
train_dates = train_df["time"].dt.date
holiday_dates = {day for day in train_dates.unique() if isHoliday(day)}
train_df.drop(train_df[train_dates.isin(holiday_dates)].index, inplace=True)
train_df.shape
# Recorded result: 14040 rows dropped

In [15]:
# Distinct calendar dates in the test data that isHoliday flags.
test_holiday_mask = test_df["time"].apply(isHoliday)
test_df.loc[test_holiday_mask, "time"].dt.date.unique()
# Recorded result: there is no holiday in the test data

In [16]:
# Distinct values of the 'direction' column.
train_df.direction.unique()
# Observed: 8 directions

In [17]:
# Number of training rows per (x, y) coordinate pair.
train_df.groupby(['x','y']).size()
# Observed: 12 distinct (x, y) locations

In [18]:
# Number of training rows per (x, y, direction) combination.
train_df.groupby(['x','y','direction']).size()
# Observed: 65 (x, y, direction) combinations, each with the same number of rows

In [19]:
# Number of test rows per (x, y, direction) combination.
test_df.groupby(['x','y','direction']).size()
# Observed: 65 (x, y, direction) combinations, each with the same number of rows

In [20]:
# Frequency of each distinct congestion value, most frequent first.
train_df.congestion.value_counts()


In [21]:
#np.unique(train_df.congestion)

#from collections import Counter
#Counter(np.unique(train_df.congestion)).values() # counts the elements' frequency

In [22]:
# Bar chart of how often each congestion level (0-100) occurs in the training data.
congestion_levels = range(101)
# Reindex onto the full 0-100 range so a level that never occurs gets an
# explicit zero; without this the bar heights would not line up with the
# 101 x positions and plt.bar would fail on the length mismatch.
level_counts = train_df.congestion.value_counts().reindex(congestion_levels, fill_value=0)
# Levels drawn in red; all other levels are green.
# NOTE(review): presumably these were picked by eye as unusually frequent values - confirm.
highlighted_levels = {0, 15, 20, 21, 29, 34}

plt.figure(figsize=(15, 10))
plt.bar(congestion_levels, level_counts, width=1,
        color=['red' if level in highlighted_levels else 'green' for level in congestion_levels])
plt.ylabel('Count')
plt.xlabel('Congestion')
plt.show()

In [23]:
# Explore congestion per direction: one row per timestamp, one column per direction.
# aggfunc='first' keeps only the first congestion value found for each
# (time, direction) pair; any further rows sharing that pair (for example
# other x/y locations) are not represented in this table.
train_df_pivot = train_df.pivot_table(values='congestion', index=train_df.time, columns='direction', aggfunc='first')
train_df_pivot.head()
# train_df_eb = train_df.loc[train_df.direction=='EB'].filter(['time', 'congestion'], axis=1)

In [24]:
import plotly.express as px
# Area chart of the pivoted congestion values, one facet per direction, two facets per row.
fig = px.area(train_df_pivot, facet_col="direction", facet_col_wrap=2,             
              title='congestion for direction over time')
fig.show()

# Observation: SE - congestion is quite stable

In [25]:
# Turn the 'time' index of the pivot table back into a regular column.
train_df_pivot = train_df_pivot.reset_index(drop=False)
train_df_pivot.head()

In [26]:
# Line chart of the pivoted congestion values over time, one trace per column.
fig = px.line(train_df_pivot, x="time", y=train_df_pivot.columns,
              hover_data={"time": "|%B %d, %Y %H:%M"},
              title='congestion for direction over time')
# One x-axis tick per month, labelled with abbreviated month name and year.
fig.update_xaxes(dtick="M1", tickformat="%b\n%Y")
fig.show()

In [27]:
# Split the timestamp into year / month / day columns on both frames.
for frame in (train_df, test_df):
    frame["year"] = frame.time.dt.year
    frame["month"] = frame.time.dt.month
    frame["day"] = frame.time.dt.day

# Unused idea kept for reference: derive a season label from the month.
#train_df["season"] = np.where(train_df.time.dt.month.isin([3,4,5]), "spring",
#                    np.where(train_df.time.dt.month.isin([6,7,8]), "summer", 
#                    np.where(train_df.time.dt.month.isin([9,10,11]), "autumn", 
#                    np.where(train_df.time.dt.month.isin([12,1,2]), "winter", "none"))))
#train_df.drop("time", axis=1)

train_df.head(), train_df.shape

In [28]:
# normalize 
#congestion = train_df["congestion"]
#base_norm = (congestion - congestion.min()) / (congestion.max() - congestion.min())
#train_df["congestion_norm"] = base_norm
#train_df.head()

## random forest

In [29]:
from sklearn.preprocessing import LabelEncoder
# Encode the 'direction' labels as integers.
# The encoder is fitted on the training data only and then re-used for the
# test data, so the same label always maps to the same integer in both
# frames. transform() raises ValueError if the test data contains a label
# that was not present in training, which surfaces a mismatch instead of
# silently assigning inconsistent codes.
le = LabelEncoder()

train_df['direction_num'] = le.fit_transform(train_df['direction'])
test_df['direction_num'] = le.transform(test_df['direction'])

In [30]:
# Regression target: the congestion column of the training frame.
y_train = train_df['congestion']

In [31]:
# Columns fed to the models.
# NOTE(review): no time-of-day feature (hour/minute) is included, so rows
# that share a date, location and direction have identical inputs - confirm
# this is intended given the data is sampled several times per day.
feature_names = ['year', 'month', 'day', 'x', 'y', 'direction_num']
X_train = train_df[feature_names]

In [32]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_error

# Hold-out validation: a single random 70/30 split with a fixed seed
# (this is not k-fold cross-validation).
# NOTE(review): the split is random rather than chronological - confirm that is
# acceptable for this time-ordered data.
train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size=0.3, random_state = 1)

In [33]:
import time
from sklearn.ensemble import RandomForestRegressor
# Random forest baseline.
# random_state is fixed so the fitted model, the validation MAE and anything
# derived from it are reproducible across runs.
rfModel = RandomForestRegressor(n_estimators=100, random_state=1)

# Report the CPU time spent fitting (time.process_time measures process CPU time).
start_time = time.process_time()
rfModel.fit(train_X, train_y)
print(time.process_time() - start_time, " seconds")

# Mean absolute error on the hold-out rows; stored in rfMbe.
rfPredict = rfModel.predict(val_X)
rfMbe = mean_absolute_error(val_y, rfPredict)
print("MAE:", rfMbe)

In [34]:
# Predict on the test set with the fitted random forest.
# This rebinds rfPredict, which previously held the validation predictions.
X_test = test_df[feature_names]
rfPredict = rfModel.predict(X_test)

## LightGBM

In [35]:
import lightgbm as lgb

# LightGBM model with default settings, fitted on the same hold-out split.
# NOTE(review): this is a classifier, so each distinct congestion value is
# treated as a separate class, while the score reported below is MAE -
# confirm a regressor was not intended.
lgbModel = lgb.LGBMClassifier()

# Report the CPU time spent fitting.
start_time = time.process_time()
lgbModel.fit(train_X, train_y)
print(time.process_time() - start_time, " seconds")

# Score the hold-out rows: mean absolute error and exact-match accuracy.
lgbPredict = lgbModel.predict(val_X)
lgbMbe = mean_absolute_error(val_y, lgbPredict)
lgbAccuracy = metrics.accuracy_score(val_y, lgbPredict)

print("MAE:", lgbMbe)
print("Accuracy:", lgbAccuracy)


In [36]:
# Predict on the test set with the fitted LightGBM model.
# NOTE(review): `Predict` is not referenced by the submission cells visible in
# this notebook, which use rfPredict - confirm whether it is still needed.
X_test = test_df[feature_names]
Predict = lgbModel.predict(X_test)

## submission

In [37]:
# Load the sample submission file to use as the template for the output.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv")
submission.head()

In [38]:
# Fill the template with the random-forest predictions, rounded to whole numbers.
# Values are assigned by position, so this assumes the template rows are in
# the same order as the test rows - TODO confirm.
rounded_predictions = pd.Series(rfPredict, index=submission.index).round().astype(int)
submission["congestion"] = rounded_predictions

# Sanity check: every prediction must lie within 0..100.
assert submission["congestion"].ge(0).all()
assert submission["congestion"].le(100).all()
submission.head()

In [39]:
# Write the submission to the current directory without the index column;
# the value of rfMbe, formatted to one decimal place, is embedded in the file name.
submission.to_csv("./submission_{0:.1f}.csv".format(rfMbe), index=False)