In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths_in_dir = (os.path.join(dirname, entry) for entry in filenames)
    for file_path in paths_in_dir:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings("ignore")

# 1. Read the Data

In [None]:
# Load the competition's training and test tables (in that order) from the input mount.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-mar-2022/train.csv",
        "/kaggle/input/tabular-playground-series-mar-2022/test.csv",
    )
)

# 2. Data Exploration

In [None]:
# Preview the first rows of the training table (rendered as the cell output).
df_train.head()

In [None]:
# Preview the first rows of the test table (rendered as the cell output).
df_test.head()

In [None]:
# Print pandas' structural summary of the training table.
df_train.info()

In [None]:
# Show the distinct values that x, y and direction take in the training table.
for caption, column in (
    ("The unique x features :", 'x'),
    ("The unique y features :", 'y'),
    ("The unique direction features :", 'direction'),
):
    print(caption, df_train[column].unique())

In [None]:
# Count the distinct values of each raw column, first in train and then in test,
# and report both counts per column.
num_time_train, num_time_test = (
    len(frame['time'].unique()) for frame in (df_train, df_test)
)
print(f'The # of unique time features in train : {num_time_train}')
print(f'The # of unique time features in test: {num_time_test}')

num_x_train, num_x_test = (
    len(frame['x'].unique()) for frame in (df_train, df_test)
)
print(f'The # of unique x features in train : {num_x_train}')
print(f'The # of unique x features in test: {num_x_test}')

num_y_train, num_y_test = (
    len(frame['y'].unique()) for frame in (df_train, df_test)
)
print(f'The # of unique y features in train : {num_y_train}')
print(f'The # of unique y features in test: {num_y_test}')

num_direction_train, num_direction_test = (
    len(frame['direction'].unique()) for frame in (df_train, df_test)
)
print(f'The # of unique direction features in train : {num_direction_train}')
print(f'The # of unique direction features in test: {num_direction_test}')

# 3. Feature Engineering for EDA

## 3-1. x-y location encoder

In [None]:
# Fold each (x, y) pair into one integer code, 10 * x + y, stored in column 'x+y'.
# Reading a code back, the tens part is x and the ones part is y:
#    0 -> x = 0, y = 0
#    1 -> x = 0, y = 1
#    2 -> x = 0, y = 2
#    3 -> x = 0, y = 3
#   ...
#   23 -> x = 2, y = 3
# (this decoding assumes y stays below 10 -- TODO confirm against the data)
for frame in (df_train, df_test):
    frame['x+y'] = frame['x'] * 10 + frame['y']

print("The unique of x+y in training data :", df_train['x+y'].unique())
print("The unique of x+y in testing data :", df_test['x+y'].unique())

In [None]:
# One-hot encode the combined location code.
# The 'xy' prefix gives the indicator columns string labels ('xy_0', 'xy_1', ...).
# Without it get_dummies labels them with the bare integer codes, which are easy
# to duplicate with other integer-labelled columns and leave the model matrix
# with a mix of int and str column names.
x_y_dummies_train = pd.get_dummies(df_train['x+y'], prefix='xy')
x_y_dummies_test = pd.get_dummies(df_test['x+y'], prefix='xy')

# Train and test are encoded independently, so force the test indicators onto
# exactly the training columns (same set, same order): a location absent from
# test becomes an all-zero column, and a location never seen in training is dropped.
x_y_dummies_test = x_y_dummies_test.reindex(columns=x_y_dummies_train.columns, fill_value=0)

# Append the indicator columns to the right of the original columns.
df_train_dum = pd.concat([df_train, x_y_dummies_train], axis = 1)
df_test_dum = pd.concat([df_test, x_y_dummies_test], axis = 1)

## 3-2. Direction features encoder

In [None]:
# One-hot encode the direction column; each indicator column is named after
# the direction value it flags.
x_y_dummies_train_direction = pd.get_dummies(df_train['direction'])
x_y_dummies_test_direction = pd.get_dummies(df_test['direction'])

# Train and test are encoded independently, so force the test indicators onto
# exactly the training columns (same set, same order): a direction absent from
# test becomes an all-zero column, and one never seen in training is dropped.
x_y_dummies_test_direction = x_y_dummies_test_direction.reindex(
    columns=x_y_dummies_train_direction.columns, fill_value=0
)

# Append the direction indicators to the frames that already carry the location indicators.
df_train_dum_dir = pd.concat([df_train_dum, x_y_dummies_train_direction], axis = 1)
df_test_dum_dir = pd.concat([df_test_dum, x_y_dummies_test_direction], axis = 1)

In [None]:
# The raw location and direction columns have been one-hot encoded above,
# so remove them (and the intermediate 'x+y' code) from both frames.
encoded_source_columns = ['x', 'y', 'direction', 'x+y']
df_train_dum_dir = df_train_dum_dir.drop(encoded_source_columns, axis = 1)
df_test_dum_dir = df_test_dum_dir.drop(encoded_source_columns, axis = 1)

## 3-3. Time feature engineering

### 3-3-1. Month, Day, Hour, Minute features

In [None]:


# hour
def split_hour(time):
    """Return the hour field of a timestamp string, as text.

    Takes the second space-separated token of ``time`` and returns the part
    before its first colon, e.g. "1991-04-01 00:20:00" -> "00".
    """
    clock_part = time.split(" ")[1]
    hour_text = clock_part.split(":")[0]
    return hour_text

# Add the hour text of every timestamp as column 'hour' in both frames.
for frame in (df_train_dum_dir, df_test_dum_dir):
    frame['hour'] = frame['time'].apply(split_hour)

# minute
def split_minute(minute):
    """Return the minute field of a timestamp string, as text.

    Despite its name, ``minute`` is the whole timestamp string; the result is
    its second colon-separated field, e.g. "1991-04-01 00:20:00" -> "20".
    """
    colon_fields = minute.split(":")
    return colon_fields[1]
# Add the minute text of every timestamp as column 'minute' in both frames.
for frame in (df_train_dum_dir, df_test_dum_dir):
    frame['minute'] = frame['time'].apply(split_minute)

# day
def split_day(day):
    """Return the day-of-month field of a timestamp string, as text.

    Despite its name, ``day`` is the whole timestamp string; the result is its
    third dash-separated field cut at the first space,
    e.g. "1991-04-01 00:20:00" -> "01".
    """
    dash_fields = day.split("-")
    day_and_clock = dash_fields[2]
    return day_and_clock.split(" ")[0]
# Add the day-of-month text of every timestamp as column 'day' in both frames.
for frame in (df_train_dum_dir, df_test_dum_dir):
    frame['day'] = frame['time'].apply(split_day)


# Convert the 'time' column to datetimes in place, then read the calendar
# month off the converted values into a new 'month' column.
for frame in (df_train_dum_dir, df_test_dum_dir):
    frame['time'] = pd.to_datetime(frame['time'])
    # Month
    frame['month'] = frame['time'].dt.month

### 3-3-2. Weekday, Weekend features

In [None]:
# Day-of-week index taken from the parsed timestamps
# (pandas' `dt.weekday` numbers Monday as 0 through Sunday as 6).
for frame in (df_train_dum_dir, df_test_dum_dir):
    frame['weekday'] = frame['time'].dt.weekday

In [None]:
# Weekend flag: 1 when the weekday index is 5 or higher (Saturday/Sunday under
# pandas' Monday = 0 numbering), otherwise 0.
# A single vectorised comparison replaces the two row-by-row Python loops that
# built the same 0/1 values into a temporary list.
df_train_dum_dir['weekend'] = (df_train_dum_dir['weekday'] >= 5).astype(int)
df_test_dum_dir['weekend'] = (df_test_dum_dir['weekday'] >= 5).astype(int)

### 3-3-3. Hour + Minute features

In [None]:
# Join the hour and minute text into one time-of-day key
# (hour "00" and minute "20" give "0020").
for frame in (df_train_dum_dir, df_test_dum_dir):
    frame['hour+minute'] = frame['hour'] + frame['minute']

# 4. EDA 

## 4-1. X+Y Feature's EDA

In [None]:
# Bar chart of the mean congestion for each encoded location.
sns.set()
# Aggregate only 'congestion': df_train also holds text columns, and
# pandas >= 2.0 raises a TypeError when .mean() meets them instead of
# silently dropping them as older versions did.
x_y_group = df_train.groupby('x+y')[['congestion']].mean()
sns.barplot(x = x_y_group.index, y = x_y_group.congestion)
plt.title('The relation between x+y and congestion')
plt.show()

## 4-2. Direction Feature's EDA

In [None]:
# Bar chart of the mean congestion for each direction.
sns.set()
# Aggregate only 'congestion': df_train also holds text columns, and
# pandas >= 2.0 raises a TypeError when .mean() meets them instead of
# silently dropping them as older versions did.
x_y_group = df_train.groupby('direction')[['congestion']].mean()
sns.barplot(x = x_y_group.index, y = x_y_group.congestion)
plt.title('The relation between direction and congestion')
plt.show()

## 4-3. Weekend, Weekday Feature's EDA

In [None]:
# Bar chart of the mean congestion on weekdays (0) versus weekends (1).
sns.set()
plt.figure(figsize = (10, 10))
# Aggregate only 'congestion': the frame also holds text columns (hour, minute,
# day, hour+minute), and pandas >= 2.0 raises a TypeError when .mean() meets
# them instead of silently dropping them as older versions did.
x_y_group = df_train_dum_dir.groupby('weekend')[['congestion']].mean()
sns.barplot(x = x_y_group.index, y = x_y_group.congestion)
plt.title('The relation between weekend and congestion')
plt.show()

In [None]:
# Bar chart of the mean congestion for each day-of-week index.
sns.set()
plt.figure(figsize = (10, 10))
# Aggregate only 'congestion': the frame also holds text columns (hour, minute,
# day, hour+minute), and pandas >= 2.0 raises a TypeError when .mean() meets
# them instead of silently dropping them as older versions did.
x_y_group = df_train_dum_dir.groupby('weekday')[['congestion']].mean()
sns.barplot(x = x_y_group.index, y = x_y_group.congestion)
plt.title('The relation between weekday and congestion')
plt.show()

## 4-4. Hour+Minute Feature's EDA

In [None]:
# Line chart of the mean congestion for each hour+minute time-of-day key.
sns.set()
plt.figure(figsize = (30, 10))
# Aggregate only 'congestion': the frame also holds text columns (hour, minute,
# day), and pandas >= 2.0 raises a TypeError when .mean() meets them instead
# of silently dropping them as older versions did.
x_y_group = df_train_dum_dir.groupby('hour+minute')[['congestion']].mean()
sns.lineplot(x = x_y_group.index, y = x_y_group.congestion)
plt.title('The relation between hour-minute and congestion')
# Rotate the tick labels so the many time-of-day keys stay legible.
plt.xticks(rotation = 90)
plt.show()

## 4-5. Month Feature's EDA

In [None]:
# Line chart of the mean congestion for each calendar month.
sns.set()
plt.figure(figsize = (10, 10))
# Aggregate only 'congestion': the frame also holds text columns (hour, minute,
# day, hour+minute), and pandas >= 2.0 raises a TypeError when .mean() meets
# them instead of silently dropping them as older versions did.
x_y_group = df_train_dum_dir.groupby('month')[['congestion']].mean()
sns.lineplot(x = x_y_group.index, y = x_y_group.congestion)
plt.title('The relation between month and congestion')
plt.xticks(rotation = 90)
plt.show()

# 5. Data Preprocessing for Model Training

In [None]:
# Remove the row identifier, the raw timestamp and the day / hour / minute /
# weekday columns; every remaining column is carried into the modelling frames.
non_model_columns = ['day', 'hour', 'minute', 'row_id', 'time', 'weekday']
train_data = df_train_dum_dir.drop(non_model_columns, axis = 1)
test_data = df_test_dum_dir.drop(non_model_columns, axis = 1)

In [None]:
# One-hot encode the 'hour+minute' key. The encoder learns its categories from
# the training rows only; the test rows are transformed with those same
# categories, and both sparse results are expanded to dense arrays.
OHE_encoder = OneHotEncoder()
hour_minute_train = train_data[['hour+minute']]
hour_minute_test = test_data[['hour+minute']]
train_data_hour_min_one = OHE_encoder.fit_transform(hour_minute_train).toarray()
test_data_hour_min_one = OHE_encoder.transform(hour_minute_test).toarray()

In [None]:
# Wrap the dense one-hot arrays in DataFrames.
# Columns are labelled from the encoder's fitted categories
# ('hour+minute_<value>') rather than left as the default integers 0..k-1,
# which are anonymous and clash with any other integer-labelled columns once
# the frames are concatenated.
hour_minute_labels = [f"hour+minute_{category}" for category in OHE_encoder.categories_[0]]

# Reuse each source frame's index so the later column-wise concat lines the
# rows up by label even if that index is not the default 0..n-1.
train_data_hour_min_one = pd.DataFrame(
    train_data_hour_min_one, columns = hour_minute_labels, index = train_data.index
)
test_data_hour_min_one = pd.DataFrame(
    test_data_hour_min_one, columns = hour_minute_labels, index = test_data.index
)

In [None]:
# Place the hour+minute indicator columns to the right of the model frames.
# Note that this rebinds df_train / df_test, which until here named the raw tables.
df_train = pd.concat(objs = [train_data, train_data_hour_min_one], axis = 'columns')
df_test = pd.concat(objs = [test_data, test_data_hour_min_one], axis = 'columns')

In [None]:
# The text key has been replaced by its indicator columns, so drop it.
train_data = df_train.drop(columns = ['hour+minute'])
test_data = df_test.drop(columns = ['hour+minute'])

# scikit-learn (>= 1.2) raises a TypeError at fit/predict time when a
# DataFrame's column labels mix str with non-str types. Cast every label to
# str on both frames so the feature names are uniform and identical between
# train and test.
train_data.columns = train_data.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

# 6. Model Training and Testing!

In [None]:
# Separate the model inputs from the regression target.
data = train_data.drop(columns = 'congestion')
target = train_data['congestion']

# Split train_data into training (80%) and validation (20%) parts.
# The fixed random_state makes the shuffle, and therefore the validation score
# computed from this split, repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(data, target, train_size = 0.8, random_state = 10)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

sns.set()

# Configure the gradient-boosting regressor, then fit it on the training split.
GBR = GradientBoostingRegressor(
    n_estimators = 2000,
    learning_rate = 0.06,
    max_depth = 20,
    min_samples_leaf = 2,
    subsample = 0.2,
    random_state = 10,
)
GBR.fit(x_train, y_train)

# Predicted versus actual congestion on the held-out split, with a red
# y = x reference line drawn from 0 to 90.
y_pred_GBR = GBR.predict(x_test)
reference_ticks = list(range(0, 100, 10))
plt.scatter(y_test, y_pred_GBR)
plt.plot(reference_ticks, reference_ticks, color = 'r')
plt.xlabel("Reality")
plt.ylabel("Predicted")
plt.title('GradientBoostingRegressor')
plt.show()
plt.clf()

# For a scikit-learn regressor, score() is the coefficient of determination (R^2).
print(GBR.score(x_test, y_test))

# 7. Predicting on the test_data

In [None]:
# Predict congestion for the prepared test features.
y_pred = GBR.predict(test_data)

# Load the sample submission, overwrite its 'congestion' column with the
# predictions (rows are matched by position), and save the result without the index.
df_submission = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv"
).assign(congestion = y_pred)
df_submission.to_csv('submission.csv', index = False)

In [None]:
# Display the submission table as the cell output.
df_submission