# Competition

[Kaggle Link](https://www.kaggle.com/competitions/tabular-playground-series-mar-2022/data)

# Tanım

- Tarih verilmiş
- x, y verilmiş 
- yön verilmiş
- trafik verilmiş/isteniyor
  
# Preprocessing

- Yön verisini One-Hot yapmak
- NaN yok
- Normalize bir veri
- Time verisini kullanılabilecek bir hale getirmek
  - Her 20 dakikayı ayırmak
  - Her saati ayırmak
  - Her günü ayırmak
  - Gün verisinden kurtulup sadece veri üstünde çalışmak
  
# Gruplar

## Preprocessing

- Ayfer Sinem Çoban
- Onur Ümit Şener
  
## Modelling

- Ata Güneş
- Mertcan Duran
- Oğulcan Akca
  
## Presentation

- Başak Topçuoğlu
- Saitcan Yıldırım


# Code

## Imports

In [1]:
import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

## Data Init

In [2]:
raw_data = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv')

In [3]:
INTERVAL_20_MINUTES = 65
INTERVAL_HOUR = 260
INTERVAL_DAY = 4680
INTERVAL_WEEK = 32760
TOTAL_ROWS = raw_data.shape[0]
INTERVALS = (INTERVAL_20_MINUTES,
             INTERVAL_HOUR,
             INTERVAL_DAY,
             INTERVAL_WEEK
             )

## Datetime Conversion


In [4]:
raw_data['time'] = pd.to_datetime(raw_data['time'])

In [5]:
df = raw_data.drop('row_id', axis=1).copy()

## Adding Hours, Minutes, Months

In [6]:
hours_list = []
minutes_list = []
month_list = []
season_list = []
for t in df['time']:
    hours_list.append(t.hour)
    minutes_list.append(t.minute // 20)
    month_list.append(t.month)


In [7]:
time_df = pd.DataFrame({'hours': hours_list,
                        'minutes': minutes_list,
                        'month': month_list,
                        })
df = pd.concat([df, time_df], axis=1)

In [8]:
df = df[['time', 'hours', 'minutes', 'month', 'x', 'y', 'direction', 'congestion']]

In [9]:
df.sample(5)

Unnamed: 0,time,hours,minutes,month,x,y,direction,congestion
632869,1991-08-14 13:40:00,13,2,8,1,2,NE,61
143893,1991-05-01 20:20:00,20,1,5,2,1,SE,34
573378,1991-08-01 20:00:00,20,0,8,0,3,NE,50
473649,1991-07-11 11:20:00,11,1,7,2,3,EB,44
735144,1991-09-05 16:40:00,16,2,9,2,3,EB,50


In [10]:
df.tail()

Unnamed: 0,time,hours,minutes,month,x,y,direction,congestion
848830,1991-09-30 11:40:00,11,2,9,2,3,NB,54
848831,1991-09-30 11:40:00,11,2,9,2,3,NE,28
848832,1991-09-30 11:40:00,11,2,9,2,3,SB,68
848833,1991-09-30 11:40:00,11,2,9,2,3,SW,17
848834,1991-09-30 11:40:00,11,2,9,2,3,WB,24


### Adding Days

In [11]:
weekdays = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
days = {}
days_list = [None] * TOTAL_ROWS
for i in range(7):
    days[i] = range(i, TOTAL_ROWS, 7)
for j in range(7):   
    for k in days[j]:
        days_list[k] = weekdays[j]
df.insert(1, 'days', days_list)

In [12]:
df.sample(5)

Unnamed: 0,time,days,hours,minutes,month,x,y,direction,congestion
346845,1991-06-14 08:00:00,Wednesday,8,0,6,0,1,SB,62
463875,1991-07-09 09:20:00,Sunday,9,1,7,1,3,NE,44
769331,1991-09-13 04:00:00,Thursday,4,0,9,2,2,SE,40
43231,1991-04-10 05:40:00,Sunday,5,2,4,0,1,WB,32
523842,1991-07-22 06:00:00,Friday,6,0,7,0,2,EB,45


In [13]:
df.tail()

Unnamed: 0,time,days,hours,minutes,month,x,y,direction,congestion
848830,1991-09-30 11:40:00,Thursday,11,2,9,2,3,NB,54
848831,1991-09-30 11:40:00,Friday,11,2,9,2,3,NE,28
848832,1991-09-30 11:40:00,Saturday,11,2,9,2,3,SB,68
848833,1991-09-30 11:40:00,Sunday,11,2,9,2,3,SW,17
848834,1991-09-30 11:40:00,Monday,11,2,9,2,3,WB,24


## One-Hot

In [14]:
# df_oh['time'].str.get_dummies(' ') # Another method for one-hot, might be useful

In [15]:
df

Unnamed: 0,time,days,hours,minutes,month,x,y,direction,congestion
0,1991-04-01 00:00:00,Monday,0,0,4,0,0,EB,70
1,1991-04-01 00:00:00,Tuesday,0,0,4,0,0,NB,49
2,1991-04-01 00:00:00,Wednesday,0,0,4,0,0,SB,24
3,1991-04-01 00:00:00,Thursday,0,0,4,0,1,EB,18
4,1991-04-01 00:00:00,Friday,0,0,4,0,1,NB,60
...,...,...,...,...,...,...,...,...,...
848830,1991-09-30 11:40:00,Thursday,11,2,9,2,3,NB,54
848831,1991-09-30 11:40:00,Friday,11,2,9,2,3,NE,28
848832,1991-09-30 11:40:00,Saturday,11,2,9,2,3,SB,68
848833,1991-09-30 11:40:00,Sunday,11,2,9,2,3,SW,17


In [16]:
df_oh = pd.get_dummies(df, dtype=int)
df_oh.drop('time', axis=1, inplace=True)

In [17]:
df_oh

Unnamed: 0,hours,minutes,month,x,y,congestion,days_Friday,days_Monday,days_Saturday,days_Sunday,...,days_Tuesday,days_Wednesday,direction_EB,direction_NB,direction_NE,direction_NW,direction_SB,direction_SE,direction_SW,direction_WB
0,0,0,4,0,0,70,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,4,0,0,49,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,0,0,4,0,0,24,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,4,0,1,18,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,4,0,1,60,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848830,11,2,9,2,3,54,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
848831,11,2,9,2,3,28,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
848832,11,2,9,2,3,68,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
848833,11,2,9,2,3,17,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [18]:
df_oh.columns


Index(['hours', 'minutes', 'month', 'x', 'y', 'congestion', 'days_Friday',
       'days_Monday', 'days_Saturday', 'days_Sunday', 'days_Thursday',
       'days_Tuesday', 'days_Wednesday', 'direction_EB', 'direction_NB',
       'direction_NE', 'direction_NW', 'direction_SB', 'direction_SE',
       'direction_SW', 'direction_WB'],
      dtype='object')

# Test


In [19]:
raw_data = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv')

In [20]:
INTERVAL_20_MINUTES = 65
INTERVAL_HOUR = 260
INTERVAL_DAY = 4680
INTERVAL_WEEK = 32760
TOTAL_ROWS = raw_data.shape[0]
INTERVALS = (INTERVAL_20_MINUTES,
             INTERVAL_HOUR,
             INTERVAL_DAY,
             INTERVAL_WEEK
             )

In [21]:
raw_data['time'] = pd.to_datetime(raw_data['time'])

In [22]:
df = raw_data.drop('row_id', axis=1).copy()

In [23]:
hours_list = []
minutes_list = []
month_list = []
season_list = []
for t in df['time']:
    hours_list.append(t.hour)
    minutes_list.append(t.minute // 20)
    month_list.append(t.month)


In [24]:
time_df = pd.DataFrame({'hours': hours_list,
                        'minutes': minutes_list,
                        'month': month_list,
                        })
df = pd.concat([df, time_df], axis=1)

In [25]:
df = df[['time', 'hours', 'minutes', 'month', 'x', 'y', 'direction']]

In [26]:
weekdays = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
days = {}
days_list = [None] * TOTAL_ROWS
for i in range(7):
    days[i] = range(i, TOTAL_ROWS, 7)
for j in range(7):   
    for k in days[j]:
        days_list[k] = weekdays[j]
df.insert(1, 'days', days_list)

In [27]:
df_oh_t = pd.get_dummies(df, dtype=int)
df_oh_t.drop('time', axis=1, inplace=True)

In [28]:
df_oh_t

Unnamed: 0,hours,minutes,month,x,y,days_Friday,days_Monday,days_Saturday,days_Sunday,days_Thursday,days_Tuesday,days_Wednesday,direction_EB,direction_NB,direction_NE,direction_NW,direction_SB,direction_SE,direction_SW,direction_WB
0,12,0,9,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,12,0,9,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,12,0,9,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,12,0,9,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
4,12,0,9,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,23,2,9,2,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2336,23,2,9,2,3,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2337,23,2,9,2,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2338,23,2,9,2,3,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


# Export

Run this code to get the _.csv_ of the result

In [29]:
# df_oh.to_csv('df.csv')

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense


# Split the data into input features and target variable
X = df_oh.drop("congestion", axis=1).values
y = df_oh["congestion"].values

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

# Scale the input features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Reshape the input features to match the expected input shape of the LSTM network
X_train_reshaped = np.reshape(X_train_scaled, (X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_val_reshaped = np.reshape(X_val_scaled, (X_val_scaled.shape[0], X_val_scaled.shape[1], 1))

# Build the LSTM network
model = Sequential()
model.add(LSTM(64, input_shape=(X_train_reshaped.shape[1], 1)))
model.add(Dense(1))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train_reshaped, y_train, batch_size=36, epochs=10, validation_data=(X_val_reshaped, y_val))

# Load and preprocess the test data
test_data = df_oh_t
# Perform any necessary preprocessing steps on the test data

# Scale the test data
test_data_scaled = scaler.transform(test_data)

# Reshape the test data
test_data_reshaped = np.reshape(test_data_scaled, (test_data_scaled.shape[0], test_data_scaled.shape[1], 1))

# Make predictions on the test data
predictions = model.predict(test_data_reshaped)

# Save or use the predictions as needed
submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')
submission.congestion = predictions
submission.to_csv("sample_submission.csv",index=False)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




