In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the Libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

# Loading the dataset

In [None]:
# Train data: read the CSV into `df` and show its (rows, columns) shape
train_csv = '/kaggle/input/tabular-playground-series-mar-2022/train.csv'
df = pd.read_csv(train_csv)
df.shape

In [None]:
# Preview the first rows of the training frame
df.head()

In [None]:
# Test data: read the CSV into `df1` and show its (rows, columns) shape
test_csv = '/kaggle/input/tabular-playground-series-mar-2022/test.csv'
df1 = pd.read_csv(test_csv)
df1.shape

In [None]:
# Preview the first rows of the test frame
df1.head()

In [None]:
# `row_id` looks like a plain index, so remove it (in place) from both
# the training frame and the test frame.
df.drop(columns='row_id', inplace=True)
df1.drop(columns='row_id', inplace=True)

# Basic Exploratory Data Analysis

In [None]:
# checking for null values
# (number of missing entries in each column of the training frame)
df.isna().sum()

In [None]:
# checking for duplicate values
# (number of rows that exactly repeat an earlier row)
df.duplicated().sum()

In [None]:
# convert time column to datetime
# (only the training frame is converted here; `df1['time']` is left as loaded)
df['time'] = pd.to_datetime(df['time'])

In [None]:
# descriptive stats - numerical (the datetime column is summarised like a number)
# `datetime_is_numeric` is only accepted by pandas 1.1-1.5. pandas >= 2.0 removed
# the keyword (datetimes are always treated as numeric there) and raises TypeError
# when it is passed, so fall back to the plain call in that case.
try:
    numeric_summary = df.describe(datetime_is_numeric=True)
except TypeError:
    numeric_summary = df.describe()
numeric_summary

In [None]:
# descriptive stats - categorical
# (summary restricted to the object-dtype columns)
df.describe(include='object')

In [None]:
# unique items
# (number of distinct values in each column)
df.nunique()

In [None]:
# count to unique ratio
# (per column: non-null row count divided by the number of distinct values)
df.count()/df.nunique()

In [None]:
# Distribution plot: one panel per non-object column, filled into a 3x2 grid
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 10))
j = 1
for i in df.select_dtypes(exclude=['object']).columns:
    ax = fig.add_subplot(3, 2, j)
    # NOTE(review): `distplot` is deprecated in recent seaborn releases - confirm
    # the installed version before switching to `histplot`/`displot`.
    sns.distplot(df[i], color='blue', ax=ax)
    j += 1

In [None]:
# Box plot of x, y and congestion, one panel each on the first row of a 3x3 grid
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 10))
j = 1
for i in ('x', 'y', 'congestion'):
    ax = fig.add_subplot(3, 3, j)
    sns.boxplot(data=df, x=i, palette='Set2', ax=ax)
    j += 1

In [None]:
# Count plot: horizontal bars with the number of rows per `direction` value
sns.set_style("dark")
fig = plt.figure(figsize=(25, 10))
j = 1
for i in ('direction',):
    ax = fig.add_subplot(3, 1, j)
    sns.countplot(data=df, y=i, palette='Set2', ax=ax)
    j += 1

In [None]:
# congestion vs direction: violin plot of congestion for each direction value
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.violinplot(data=df, x='direction', y='congestion', palette='Set3', ax=ax)
j += 1

In [None]:
# congestion Vs (x,y): two side-by-side box plots, congestion grouped by x and by y
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax_by_x = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='x', y='congestion', palette='rainbow', ax=ax_by_x)
j += 1
ax_by_y = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='y', y='congestion', palette='rainbow', ax=ax_by_y)

In [None]:
# Congestion over the period of time: mean congestion per timestamp.
# `dft` is reused by the cells below that zoom into a month / day / part of a day.
dft = (
    df.groupby('time')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.lineplot(data=dft, x='time', y='congestion', palette='Set1', ax=ax)

In [None]:
# Congestion in a month: keep the per-timestamp means whose string form contains '1991-04'
in_month = dft['time'].astype(str).str.contains('1991-04')
dft1 = dft[in_month]
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.lineplot(data=dft1, x='time', y='congestion', palette='Set1', ax=ax)

In [None]:
# Congestion in a day: keep the per-timestamp means whose string form contains '1991-04-01'
in_day = dft['time'].astype(str).str.contains('1991-04-01')
dft1 = dft[in_day]
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.lineplot(data=dft1, x='time', y='congestion', palette='Set1', ax=ax)

In [None]:
# Congestion within part of a day: keep timestamps whose string form contains
# '1991-04-01 1', i.e. any hour starting with the digit 1 (10:00 through 19:59).
# NOTE(review): the original heading said "in a hour" - if a single hour was
# intended, the pattern needs the full two-digit hour (e.g. '1991-04-01 10').
in_window = dft['time'].astype(str).str.contains('1991-04-01 1')
dft1 = dft[in_window]
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.lineplot(data=dft1, x='time', y='congestion', palette='Set1', ax=ax)

In [None]:
# unique items in time - checking how it is
# (shows only the first 25 distinct timestamps)
df['time'].unique()[0:25]

In [None]:
# checking value counts 
# (distinct values among the per-timestamp row counts; a single value means
# every timestamp occurs equally often)
df['time'].value_counts().unique()

# Each timestamp (day, month, year, time) has 65 occurrences

# Feature Engineering

In [None]:
from calendar import monthcalendar
import datetime
def week_of_month(year, month, day):
    """Return the 1-based seven-day bucket of the month that `day` falls in.

    Days 1-7 map to 1, days 8-14 to 2, ..., days 29-31 to 5. The bucket is
    counted from the first day of the month and does not depend on the
    weekday the month starts on.

    Parameters
    ----------
    year, month, day : int
        Components of a calendar date.

    Returns
    -------
    int
        Bucket number in the range 1..5.

    Raises
    ------
    ValueError
        If (year, month, day) is not a valid calendar date.
    """
    # Constructed only for validation: rejects impossible dates exactly as the
    # original did through its (otherwise unused) datetime.date computations.
    datetime.date(year, month, day)
    return (day - 1) // 7 + 1

In [None]:
# Splitting the timestamp into calendar features (day, month, week, weekday, ...)

# Parse once and reuse; `pd.to_datetime` leaves an already-datetime column unchanged.
train_ts = pd.to_datetime(df['time'])

df['day'] = train_ts.dt.day
df['year'] = train_ts.dt.year
df['month'] = train_ts.dt.month
# NOTE: despite its name, `hour` holds the full time of day (`datetime.time`),
# not the hour number.
df['hour'] = train_ts.dt.time
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` gives the same ISO week number. Cast back to int64 to
# keep the dtype that `.dt.week` used to produce.
df['week'] = train_ts.dt.isocalendar().week.astype('int64')
df['day_name'] = train_ts.dt.day_name()
# Seven-day bucket within the month (1..5), computed by `week_of_month`.
df['month_week'] = train_ts.apply(lambda x: week_of_month(x.year, x.month, x.day))
# 1 for Saturday/Sunday, 0 otherwise.
df['weekend'] = np.where(df['day_name'].isin(['Saturday', 'Sunday']), 1, 0)

df.head()

In [None]:
# Same calendar features for the test frame. `df1['time']` is still the raw
# column as loaded, so parse it once here instead of once per feature.
test_ts = pd.to_datetime(df1['time'])

df1['day'] = test_ts.dt.day
df1['year'] = test_ts.dt.year
df1['month'] = test_ts.dt.month
# NOTE: despite its name, `hour` holds the full time of day (`datetime.time`),
# not the hour number.
df1['hour'] = test_ts.dt.time
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` gives the same ISO week number. Cast back to int64 to
# keep the dtype that `.dt.week` used to produce.
df1['week'] = test_ts.dt.isocalendar().week.astype('int64')
df1['day_name'] = test_ts.dt.day_name()
# Seven-day bucket within the month (1..5), computed by `week_of_month`.
df1['month_week'] = test_ts.apply(lambda x: week_of_month(x.year, x.month, x.day))
# 1 for Saturday/Sunday, 0 otherwise.
df1['weekend'] = np.where(df1['day_name'].isin(['Saturday', 'Sunday']), 1, 0)


In [None]:
# Distinct-value count per column, now including the engineered calendar features
df.nunique()

In [None]:
# The data covers a single year, so `year` carries no information:
# remove it (in place) from both the training frame and the test frame.
df.drop(columns='year', inplace=True)
df1.drop(columns='year', inplace=True)

# Understanding the data better

In [None]:
# congestion vs month and day: two side-by-side box plots
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax_month = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='month', y='congestion', palette='rainbow', ax=ax_month)
j += 1
ax_day = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='day', y='congestion', palette='rainbow', ax=ax_day)

In [None]:
# congestion vs week and day_name: two side-by-side box plots
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax_week = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='week', y='congestion', palette='rainbow', ax=ax_week)
j += 1
ax_day_name = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='day_name', y='congestion', palette='rainbow', ax=ax_day_name)

In [None]:
# congestion vs month_week and weekend: two side-by-side box plots
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax_month_week = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='month_week', y='congestion', palette='rainbow', ax=ax_month_week)
j += 1
ax_weekend = fig.add_subplot(3, 2, j)
sns.boxplot(data=df, x='weekend', y='congestion', palette='rainbow', ax=ax_weekend)

In [None]:
# congestion vs `hour` (which holds the time of day): one box per distinct value
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.boxplot(data=df, x='hour', y='congestion', palette='rainbow', ax=ax)

In [None]:
# Average congestion on a monthly basis: print the table, then draw it as a line
dft = (
    df.groupby('month')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
print(dft)
sns.lineplot(data=dft, x='month', y='congestion', palette='Set1', ax=ax)

In [None]:
# Average congestion per week number, drawn as a line
dft = (
    df.groupby('week')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.lineplot(data=dft, x='week', y='congestion', palette='Set1', ax=ax)

In [None]:
# Average congestion per day of the month, drawn as a line
dft = (
    df.groupby('day')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.lineplot(data=dft, x='day', y='congestion', palette='Set1', ax=ax)

In [None]:
# Average congestion vs day_name, drawn as horizontal bars
dft = (
    df.groupby('day_name')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.barplot(data=dft, y='day_name', x='congestion', palette='Set3', ax=ax)

In [None]:
# Average congestion vs month_week, drawn as horizontal bars
dft = (
    df.groupby('month_week')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.barplot(data=dft, y='month_week', x='congestion', palette='Set1', orient='h', ax=ax)

In [None]:
# Average congestion vs Weekend flag (1 = Saturday/Sunday), drawn as horizontal bars
dft = (
    df.groupby('weekend')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
sns.barplot(data=dft, y='weekend', x='congestion', palette='Set1', orient='h', ax=ax)

In [None]:
# Average congestion per `hour` value (time of day), drawn as vertical bars.
# The time objects are converted to strings so they can serve as bar labels.
dft = (
    df.groupby('hour')['congestion']
      .mean()
      .reset_index()
)
sns.set_style("whitegrid")
fig = plt.figure(figsize=(25, 15))
j = 1
ax = fig.add_subplot(3, 1, j)
hour_labels = dft['hour'].astype(str)
sns.barplot(x=hour_labels, y=dft['congestion'], palette='rainbow', ax=ax)