In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import calendar

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

In [None]:
import os

# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the competition's training and test tables (paths are relative to the notebook).
df = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Use row_id as the index of both frames (modified in place).
for frame in (df, test):
    frame.set_index('row_id', inplace=True)

In [None]:
# Look at one raw 'time' value to see its string format before parsing it.
df['time'].iloc[0]

In [None]:
def get_date_part(x):
    """Return the first whitespace-separated token of a timestamp string.

    Named differently from the `get_date` helper defined further down in this
    cell (which extracts the day number), so the two no longer share one name.
    """
    return x.split()[0]


# Keep only the leading date token of each timestamp; it is split on '-' below.
df['date'] = df['time'].apply(get_date_part)
test['date'] = test['time'].apply(get_date_part)

def get_yr(x):
    """Return the first '-'-separated field of `x` as an int (the year)."""
    year_text = x.partition('-')[0]
    return int(year_text)

def get_month(x):
    """Return the second '-'-separated field of `x` as an int (the month)."""
    fields = x.split('-')
    month_text = fields[1]
    return int(month_text)

def get_date(x):
    """Return the third '-'-separated field of `x` as an int (the day of the month)."""
    fields = x.split('-')
    day_text = fields[2]
    return int(day_text)

# Derive integer year / month / day-of-month columns from the date string.
# 'date' is overwritten last because the other two columns are parsed from it.
for frame in (df, test):
    frame['year'] = frame['date'].apply(get_yr)
    frame['month'] = frame['date'].apply(get_month)
    frame['date'] = frame['date'].apply(get_date)

In [None]:
def reduce_time(x):
    """Return the second whitespace-separated token of `x` (the clock part of a timestamp)."""
    tokens = x.split()
    return tokens[1]

# Replace each full timestamp with just its clock part.
for frame in (df, test):
    frame['time'] = frame['time'].apply(reduce_time)

def get_hour(x):
    """Return the first ':'-separated field of `x` as an int (the hour)."""
    hour_text = x.partition(':')[0]
    return int(hour_text)

def get_minute(x):
    """Return the second ':'-separated field of `x` as an int (the minute)."""
    fields = x.split(':')
    minute_text = fields[1]
    return int(minute_text)

# Split the clock string into integer hour and minute columns on both frames.
for frame in (df, test):
    frame['hour'] = frame['time'].apply(get_hour)
    frame['minute'] = frame['time'].apply(get_minute)


In [None]:
# Display the training frame with the newly added date and time columns.
df

In [None]:
# The clock string has been split into hour/minute, so remove it from both frames.
for frame in (df, test):
    frame.drop('time', axis=1, inplace=True)

In [None]:
# Display the training frame after the 'time' column has been dropped.
df

In [None]:
def weekend_or_weekday(year, month, date):
    """Return 1 when the given calendar day is Monday-Friday and 0 for Saturday/Sunday.

    `date` is the day of the month; the three arguments are passed to `datetime`.
    """
    # datetime.weekday() numbers Monday as 0, so values above 4 are the weekend.
    day_index = datetime(year, month, date).weekday()
    return 0 if day_index > 4 else 1
    
# Row-wise weekday flag: 1 for Monday-Friday, 0 for the weekend.
for frame in (df, test):
    frame['weekday'] = frame.apply(
        lambda row: weekend_or_weekday(row['year'], row['month'], row['date']),
        axis=1,
    )

In [None]:
def get_day(year, month, date):
    """Return the weekday name (from `calendar.day_name`) of the given calendar day."""
    return calendar.day_name[datetime(year, month, date).weekday()]

# Row-wise weekday name for both frames.
for frame in (df, test):
    frame['day'] = frame.apply(
        lambda row: get_day(row['year'], row['month'], row['date']),
        axis=1,
    )

In [None]:
def am_or_pm(x):
    """Return 1 when hour `x` is 12 or later, otherwise 0."""
    return 1 if x >= 12 else 0
    
# Flag afternoon/evening rows (hour >= 12) on both frames.
for frame in (df, test):
    frame['PM'] = frame['hour'].apply(am_or_pm)

In [None]:
# Preview the training frame with all engineered calendar features.
df.head()

In [None]:
# Distinct month values present in the training data.
df['month'].unique()

In [None]:
# Columns to plot: everything except the target, the year and the minute.
cnts = df.drop(columns=['congestion', 'year', 'minute']).columns.tolist()

# 5x2 grid: one count plot per column in row-major order; the last panel stays blank.
fig, ax_arr = plt.subplots(nrows=5, ncols=2, figsize=(25, 30))
panels = ax_arr.flatten()
for slot, panel in enumerate(panels[:-1]):
    feature = cnts[slot]
    sns.countplot(x=feature, data=df, ax=panel)
    panel.set_xlabel(feature)
panels[-1].axis('off')

**Inferences:**
* Higher-valued coordinates occur more frequently in the dataset.
* NW and SE are the least travelled directions.

In [None]:
# Columns to plot: everything except the target, the year and the minute.
cnts = df.drop(columns=['congestion', 'year', 'minute']).columns.tolist()

# 5x2 grid: congestion box plot per column in row-major order; the last panel stays blank.
fig, ax_arr = plt.subplots(nrows=5, ncols=2, figsize=(25, 30))
panels = ax_arr.flatten()
for slot, panel in enumerate(panels[:-1]):
    feature = cnts[slot]
    sns.boxplot(x=feature, y='congestion', data=df, ax=panel)
    panel.set_xlabel(feature)
    panel.set_ylabel('Congestion')
panels[-1].axis('off')

In [None]:
# Columns to plot: everything except the target, the year and the minute.
cnts = df.drop(columns=['congestion', 'year', 'minute']).columns.tolist()

# 5x2 grid: congestion violin plot per column in row-major order; the last panel stays blank.
fig, ax_arr = plt.subplots(nrows=5, ncols=2, figsize=(25, 30))
panels = ax_arr.flatten()
for slot, panel in enumerate(panels[:-1]):
    feature = cnts[slot]
    sns.violinplot(x=feature, y='congestion', data=df, ax=panel)
    panel.set_xlabel(feature)
    panel.set_ylabel('Congestion')
panels[-1].axis('off')

From the violin plots above, we can infer that congestion is not normally distributed.

In [None]:
# sns.catplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(...) call only leaves an empty extra figure and its size/dpi
# never reach the plot. Size the plot through catplot's height/aspect arguments
# (12 in tall, 15 in wide) and set the dpi on the figure catplot created.
sns.catplot(x='day', y='congestion', data=df, hue='PM', kind='violin', split=True,
            height=12, aspect=15 / 12)
plt.gcf().set_dpi(150);

# Time Series Visualization

Much of the code below is repeated from cell to cell. I have written a function to replace it and will upload it in an upcoming version.

In [None]:
# Mean congestion for each month. One grouped aggregation replaces the row-by-row
# Python accumulation, which is very slow on a large frame and gives the same means.
months = list(range(4, 10))
monthly_mean = df.groupby('month')['congestion'].mean()
avg_congestion_per_month = {m: monthly_mean.loc[m] for m in months}

# plotting the average congestion
plt.figure(figsize=(10,6))
plt.plot(months, list(avg_congestion_per_month.values()), marker='o', linestyle='--')
plt.xlabel('MONTHS')
plt.ylabel('AVG CONGESTION')
plt.title('VARIATION OF CONGESTION WRT MONTH');

The month of June has a low congestion value.

In [None]:
# Average congestion per month, split by weekday (weekday == 1) and weekend (weekday == 0).
months = list(range(4, 10))

# Each (month, weekday-flag) group is averaged over its own rows. The earlier version
# divided the weekday totals by the *weekend* row count and never divided the weekend
# totals at all, so it plotted a mis-scaled mean against a raw sum.
mean_by_month_daytype = df.groupby(['month', 'weekday'])['congestion'].mean()
avg_congestion_per_month_wd = {m: mean_by_month_daytype.loc[(m, 1)] for m in months}
avg_congestion_per_month_we = {m: mean_by_month_daytype.loc[(m, 0)] for m in months}

plt.figure(figsize=(10,6))
plt.plot(months, list(avg_congestion_per_month_wd.values()), label='Weekday', marker='s', linestyle='--', c='b')
plt.plot(months, list(avg_congestion_per_month_we.values()), label='Weekend', marker='o', linestyle='--', c='r')
plt.xlabel('MONTH')
plt.ylabel('AVG CONGESTION')
plt.title('Weekday and Weekend comparison')
plt.legend(loc='best');

There is a considerable difference in the avg congestion for weekday and weekend.

In [None]:
# Average congestion per month, split by part of the day (PM == 1 for hour >= 12, else AM).
months = list(range(4, 10))

# One grouped mean replaces the row-by-row accumulation; each group is averaged over its own rows.
mean_by_month_half = df.groupby(['month', 'PM'])['congestion'].mean()
avg_congestion_per_month_pm = {m: mean_by_month_half.loc[(m, 1)] for m in months}
avg_congestion_per_month_am = {m: mean_by_month_half.loc[(m, 0)] for m in months}

plt.figure(figsize=(10,6))
plt.plot(months, list(avg_congestion_per_month_pm.values()), label='PM', marker='o', linestyle='--', c='r')
# The AM averages were computed but never drawn, although the title promises a comparison.
plt.plot(months, list(avg_congestion_per_month_am.values()), label='AM', marker='s', linestyle='--', c='b')
plt.xlabel('MONTH')
plt.ylabel('AVG CONGESTION')
plt.title('AM and PM comparison')
plt.legend(loc='best');

In [None]:
# Mean congestion for each hour of the day. One grouped aggregation replaces the
# row-by-row Python accumulation, which is very slow on a large frame.
hours = list(range(24))
hourly_mean = df.groupby('hour')['congestion'].mean()
avg_congestion_per_hour = {h: hourly_mean.loc[h] for h in hours}

plt.plot(hours, list(avg_congestion_per_hour.values()), marker='o')
plt.xlabel('Hour')
plt.ylabel('Avg Congestion')
plt.title('Avg Congestion per hour');

In [None]:
# Mean congestion for every (x, y) coordinate pair, month by month.
months = range(4, 10)
comb_x_y = {month: [] for month in months}
# One column label per pair, in the same x-major order the means are appended in.
cols = [f'x_{x_coord}_y_{y_coord}' for x_coord in range(3) for y_coord in range(4)]

for month in months:
    for x_coord in range(3):        # x-coordinates 0, 1, 2
        for y_coord in range(4):    # y-coordinates 0, 1, 2, 3
            selected = df.loc[
                (df['x'] == x_coord) & (df['y'] == y_coord) & (df['month'] == month),
                'congestion',
            ]
            comb_x_y[month].append(selected.sum() / len(selected))

# Months become rows and coordinate pairs become columns, ready for plotting.
comb_x_y_df = pd.DataFrame(comb_x_y).transpose()
comb_x_y_df.columns = cols
comb_x_y_df

In [None]:
# One panel per (x, y) column of comb_x_y_df, filled in row-major order on a 6x2 grid.
fig, ax_arr = plt.subplots(figsize=(25, 30), nrows=6, ncols=2)

for position, panel in enumerate(ax_arr.flatten()):
    c = comb_x_y_df.columns[position]
    # Column labels look like 'x_<i>_y_<j>': characters 2 and 6 are the two coordinates.
    x_coord, y_coord = c[2], c[6]
    panel.plot(range(4, 10), comb_x_y_df[c], marker='o', linestyle='--')
    panel.set_xlabel('MONTH')
    panel.set_ylabel('AVG CONGESTION')
    panel.set_title(f'x={x_coord} and y={y_coord}')
plt.tight_layout()

In [None]:
# Mean congestion for every (weekday name, hour) combination.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']
week_dict = {}
for day_name in day_names:
    on_this_day = df['day'] == day_name
    hourly_means = []
    for hour in range(24):
        selected = df.loc[on_this_day & (df['hour'] == hour), 'congestion']
        hourly_means.append(selected.sum() / len(selected))
    week_dict[day_name] = hourly_means

# Rows are hours 0-23, columns are weekday names.
week_avg_cong_df = pd.DataFrame(data=week_dict)
week_avg_cong_df

In [None]:
# One line per weekday: mean congestion against hour of day.
plt.figure(figsize=(10, 6))
plt.grid()
for day_name in week_dict:
    plt.plot(range(24), week_avg_cong_df[day_name], label=day_name)
plt.xlabel('HOUR')
plt.ylabel('AVG CONGESTION')
plt.legend();

In [None]:
# Mean congestion per travel direction for each month.
months = range(4, 10)
directions = list(df['direction'].unique())

direction_dict = {}
for month in months:
    in_month = df['month'] == month
    monthly_means = []
    for direction in directions:
        selected = df.loc[in_month & (df['direction'] == direction), 'congestion']
        monthly_means.append(selected.sum() / len(selected))
    direction_dict[month] = monthly_means

# Months become rows, directions become columns.
direction_dict_df = pd.DataFrame(direction_dict).transpose()
direction_dict_df.columns = directions

plt.figure(figsize=(15, 8))
plt.grid()
for direction in directions:
    plt.plot(months, direction_dict_df[direction], label=direction)
plt.xlabel('MONTH')
plt.ylabel('AVG CONGESTION')
plt.legend();

In [None]:
# Mean congestion for each day-of-month (1-31) within each month.
months = range(4, 10)
month_days = list(range(1, 32))

days_dict = {}
for month in months:
    in_month = df['month'] == month
    daily_means = []
    for date in month_days:
        selected = df.loc[in_month & (df['date'] == date), 'congestion']
        # NOTE(review): a day number with no rows in a month gives 0 / 0 here — confirm
        # that the resulting value (a gap in the plot) is intended.
        daily_means.append(selected.sum() / len(selected))
    days_dict[month] = daily_means

# Months become rows, day numbers become columns.
days_dict_df = pd.DataFrame(days_dict).transpose()
days_dict_df.columns = month_days

plt.figure(figsize=(15, 8))
plt.grid()
for date in month_days:
    plt.plot(months, days_dict_df[date], label=date)
plt.xlabel('MONTH')
plt.ylabel('AVG CONGESTION')
plt.legend(loc=(1.1, 0.2));

In [None]:
# Number of distinct travel directions in the training data.
df['direction'].nunique()

# Preprocessing

In [None]:
# Display the training frame as it stands before preprocessing.
df

In [None]:
# The year, month and date columns are not used as model features: remove them from both frames.
for frame in (df, test):
    frame.drop(['year', 'month', 'date'], axis=1, inplace=True)

In [None]:
# Separate the target column from the predictor columns.
y = df['congestion']
X = df.drop(columns='congestion')

In [None]:
# Treat the grid coordinates as categories (strings, so get_dummies one-hot encodes them)
# and divide hour and minute by their largest clock values (23 and 59).
for frame in (X, test):
    frame['x'] = frame['x'].astype(str)
    frame['y'] = frame['y'].astype(str)
    frame['hour'] = frame['hour'] / 23
    frame['minute'] = frame['minute'] / 59

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each string column with its own encoder: fitted on the training
# values, then applied unchanged to the test values.
for column in ('direction', 'day'):
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    test[column] = le.transform(test[column])

In [None]:
# Columns produced by one-hot encoding X that are missing from the encoded test frame
# (presumably expected to be empty — this checks one direction only).
pd.get_dummies(X).columns.difference(pd.get_dummies(test).columns)

In [None]:
# One-hot encode the remaining string columns.
X = pd.get_dummies(X)
# Encoding the two frames independently does not guarantee matching columns, so align
# the test frame to the training columns: same set, same order, and 0 for any dummy
# column that only occurs in training. This is a no-op when the columns already match.
test = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)

In [None]:
# Compare the shapes of the encoded training and test matrices.
X.shape,test.shape

In [None]:
# Hold out 20% of the training rows for validation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Prep

In [None]:
def initial_run(models, X_train, y_train, cv=5):
    """Cross-validate each model and tabulate its negative-MAE fold scores.

    Parameters
    ----------
    models : iterable of estimators passed one at a time to `cross_val_score`.
    X_train, y_train : training features and target.
    cv : number of folds; also the number of score columns.

    Returns
    -------
    pandas.DataFrame with one row per model (indexed by `str(model)`) and columns
    1..cv. Each row's scores are sorted ascending, so column k holds the k-th
    smallest score rather than the score of fold k.
    """
    labels = []
    rows = []
    for model in models:
        fold_scores = cross_val_score(
            estimator=model,
            X=X_train,
            y=y_train,
            cv=cv,
            scoring='neg_mean_absolute_error',
        )
        rows.append(np.sort(fold_scores))
        labels.append(str(model))
    return pd.DataFrame(data=rows, index=labels, columns=list(range(1, cv + 1)))

In [None]:
# Cross-validate the regularised linear models with their default settings.
models_1 = [Lasso(), Ridge(), ElasticNet()]
initial_run(
    models=models_1,
    X_train=X_train,
    y_train=y_train,
)

In [None]:
# Cross-validate a single decision tree. The tree is seeded so its scores are
# reproducible across runs (same seed as the train/test split).
models_2 = [DecisionTreeRegressor(random_state=42)]
initial_run(models=models_2, X_train=X_train, y_train=y_train)

In [None]:
#models_4 =[ExtraTreesRegressor(n_estimators = 30)]
#initial_run(models= models_4,X_train = X_train,y_train=y_train)

In [None]:
# Cross-validate a 30-tree random forest. The forest is seeded so its scores are
# reproducible across runs (same seed as the train/test split).
models_3 = [RandomForestRegressor(n_estimators=30, random_state=42)]
initial_run(models=models_3, X_train=X_train, y_train=y_train)

In [None]:
# Cross-validate plain linear regression.
models_6 = [LinearRegression()]
initial_run(
    models=models_6,
    X_train=X_train,
    y_train=y_train,
)

In [None]:
#models_5 = [AdaBoostClassifier()]
#initial_run(models = models_5, X_train=X_train,y_train=y_train)

In [None]:
#models_6 = [CatBoostRegressor()]
#initial_run(models = models_6, X_train = X_train,y_train=y_train)

In [None]:
# Fit the linear regression on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Select exactly the columns the model was trained on, in training order. This keeps
# the cell re-runnable after 'congestion' has been added to `test` further down, and
# does not rely on the test frame happening to share the training column order.
pred = lr.predict(test[X_train.columns])

In [None]:
# Round the predictions to the nearest whole number.
pred = np.around(pred, decimals=0)

In [None]:
# Attach the predictions to the test frame so they carry its index.
# Note: this adds a column to `test` in place.
test['congestion'] = pred

In [None]:
# Write the predicted congestion, together with the frame's index, to the submission file.
submission = test['congestion'].to_frame()
submission.to_csv('submission.csv')