In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### **Load Data**

In [None]:
import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the exact dataset paths used by the read_csv calls below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# index_col=0 turns the first CSV column into the DataFrame index instead of a
# feature column; it is recovered as 'row_id' via reset_index() when the
# submission is built.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv', index_col=0)
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv', index_col=0)

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Preview the first five training rows.
train_df.head(5)

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Preview the first five test rows.
test_df.head(5)

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

### **Data Preprocessing**

In [None]:
# year, month, day, hour separation
#
# Parse the whole column with a single vectorised pd.to_datetime call and read
# the calendar fields through the .dt accessor. This yields the same values as
# converting and inspecting each element in a Python-level lambda, without the
# per-row overhead.

train_df['time'] = pd.to_datetime(train_df['time'])

train_df['year'] = train_df['time'].dt.year
train_df['month'] = train_df['time'].dt.month
train_df['day'] = train_df['time'].dt.day
train_df['hour'] = train_df['time'].dt.hour

In [None]:
# Add feature : Weekend or weekday

from datetime import datetime
def weekday(year, month, date):
    """Flag a calendar date as a working day.

    Parameters
    ----------
    year, month, date : int
        Calendar year, month and day-of-month passed to ``datetime``.

    Returns
    -------
    int
        1 for Monday-Friday, 0 for Saturday or Sunday.
    """
    # datetime.weekday() counts Monday as 0, so 5 and 6 are the weekend.
    is_weekend = datetime(year, month, date).weekday() > 4
    return 0 if is_weekend else 1
    
# Vectorised equivalent of calling weekday() once per row: dayofweek is
# 0 (Monday) .. 6 (Sunday), so `< 5` gives 1 for Mon-Fri and 0 for Sat/Sun.
# pd.to_datetime is a no-op when 'time' is already datetime-typed and keeps
# this line working if the column still holds strings.
train_df['weekday'] = (pd.to_datetime(train_df['time']).dt.dayofweek < 5).astype(int)

In [None]:
# Drop features : time, year
# The raw timestamp and the year are removed; month, day, hour and weekday
# stay as the time-derived features.

train_df = train_df.drop(columns=['time', 'year'])
train_df.head()

In [None]:
# Label Encoding : Direction

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training directions and replace the column with its
# integer codes in one step. `le` is kept because the test frame is encoded
# with the same fitted mapping later.
le = LabelEncoder()
train_df['direction'] = le.fit_transform(train_df['direction'])
train_df.head()

In [None]:
# Feature, Target separation

# Everything except the target column is a feature.
X_train_df = train_df.drop(columns=['congestion'])
y_train_df = train_df['congestion']

In [None]:
# Preview the feature frame (target column removed).
X_train_df.head()

In [None]:
# Preview the target series.
y_train_df.head()

In [None]:
# Target Distribution
# Histogram of the congestion target using pandas' default binning.

y_train_df.hist()

In [None]:
# train, validation data split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. random_state pins the shuffle so the
# split, and every score computed from it below, is the same on each
# Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X_train_df, y_train_df, test_size=0.2, random_state=42)

In [None]:
# Test Data Preprocessing
#
# Mirrors the training-set steps so test_df ends up with the same feature
# columns, in the same order, as X_train_df: the original columns followed by
# month, day, hour, weekday. All steps are vectorised; no per-row Python
# callbacks.

# month, day, hour separation (year is not extracted because it is not used as a feature)
test_df['time'] = pd.to_datetime(test_df['time'])

test_df['month'] = test_df['time'].dt.month
test_df['day'] = test_df['time'].dt.day
test_df['hour'] = test_df['time'].dt.hour


# Add feature : Weekend or weekday
# dayofweek is 0 (Monday) .. 6 (Sunday): 1 for Mon-Fri, 0 for Sat/Sun,
# the same encoding weekday() returns.
test_df['weekday'] = (test_df['time'].dt.dayofweek < 5).astype(int)


# Drop features : time
test_df = test_df.drop(columns=['time'])


# Label Encoding
# Reuse the encoder fitted on the training directions so both frames share
# one label -> integer mapping.
test_df['direction'] = le.transform(test_df['direction'])

In [None]:
# Preview the preprocessed test frame.
test_df.head()

In [None]:
# (rows, columns) of the preprocessed test frame.
test_df.shape

### **Model**

#### **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares fitted on the training split and scored on
# the held-out validation split.
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_val)

# RMSE is reported next to MSE so the error is in the target's own units.
mse = mean_squared_error(y_val, pred)
rmse = np.sqrt(mse)
print('MSE : {0:.3f}, RMSE : {1:.3f}'.format(mse, rmse))

In [None]:
# Prediction error confirmation

def get_top_error_data(y_val, pred, n_tops=5):
    """Print the rows with the largest absolute prediction error.

    Parameters
    ----------
    y_val : pandas.Series
        True target values; only ``y_val.values`` is used, so the printed
        frame is indexed 0..n-1 rather than by the original index.
    pred : array-like
        Predictions, rounded with ``np.round`` before the comparison.
    n_tops : int, default 5
        Number of worst rows to print.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    errors = (
        pd.DataFrame(y_val.values, columns=['real_congestion'])
        .assign(predicted_congestion=np.round(pred))
        .assign(diff=lambda frame: (frame['real_congestion'] - frame['predicted_congestion']).abs())
    )
    worst_first = errors.sort_values('diff', ascending=False)

    print(worst_first.head(n_tops))

# Five worst validation rows for the linear-regression predictions held in `pred`.
get_top_error_data(y_val, pred, n_tops=5)

In [None]:
# Coefficient visualization
# Horizontal bars of the fitted linear-regression coefficients, largest first.

coef = pd.Series(lr_reg.coef_, index=X_train.columns)
coef_sort = coef.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=coef_sort.values, y=coef_sort.index, ax=ax)

#### **Ridge, Lasso, ElasticNet**

In [None]:
# Model train with cross validation

from sklearn.model_selection import cross_val_score

def model_train(models, X_train, y_train, cv=5):
    """Cross-validate each model and tabulate its scores.

    Parameters
    ----------
    models : iterable of estimators
        Each one is passed to ``cross_val_score`` in turn.
    X_train, y_train
        Features and target, forwarded unchanged to ``cross_val_score``.
    cv : default 5
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per model, labelled ``str(model)``. Columns ``1..k`` hold that
        model's ``neg_mean_absolute_error`` scores sorted in ascending order,
        so column 1 is the worst split, not the first split. ``'mean score'``
        is the row-wise mean. An empty ``models`` yields an empty frame that
        still has the ``'mean score'`` column.
    """
    scores = []
    idx = []
    for model in models:
        score = cross_val_score(estimator=model, X=X_train, y=y_train, cv=cv, scoring='neg_mean_absolute_error')
        scores.append(np.sort(score))
        idx.append(str(model))

    # Build the table once, after the loop. Rebuilding it on every iteration
    # was wasted work and left `score_df` undefined when `models` was empty.
    score_df = pd.DataFrame(data=scores, index=idx)
    # Label columns 1..k from the number of scores actually returned, so this
    # does not depend on `cv` being an int.
    score_df.columns = list(range(1, score_df.shape[1] + 1))

    score_df['mean score'] = score_df.mean(axis=1)
    return score_df

In [None]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet

# Regularised linear models with their default hyperparameters, compared with
# model_train's default cv=5.
models_1 = [Lasso(), Ridge(), ElasticNet()]
model_train(models=models_1, X_train=X_train, y_train=y_train)

#### **DecisionTreeRegressor**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# random_state fixes the tree's internal feature shuffling so the
# cross-validation scores are the same on every run.
models_2 = [DecisionTreeRegressor(random_state=42)]
model_train(models=models_2, X_train=X_train, y_train=y_train)

#### **RandomForestRegressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the bootstrap sampling and feature selection so the
# cross-validation scores are the same on every run.
models_3 = [RandomForestRegressor(n_estimators=10, random_state=42)]
model_train(models=models_3, X_train=X_train, y_train=y_train)

### **Train & Prediction**

In [None]:
# Final model. random_state makes the fitted forest, and therefore the
# submission file, reproducible across runs.
# NOTE(review): this fits on the 80% training split (X_train), not on the full
# X_train_df; consider refitting on all labelled rows before submitting.
rf_reg = RandomForestRegressor(n_estimators=10, random_state=42)
rf_reg.fit(X_train, y_train)

In [None]:
# Predict from the feature columns only. This cell adds 'congestion' to
# test_df, so on a re-run that column is dropped first (errors='ignore' makes
# the drop a no-op on the first run); otherwise the model would be handed an
# extra column it was not trained on.
pred = rf_reg.predict(test_df.drop(columns='congestion', errors='ignore'))
# Round to whole numbers, presumably because the congestion target is
# integer-valued (confirm against the training data).
pred = np.round(pred, 0)
test_df['congestion'] = pred

In [None]:
# Show test_df, which now carries the rounded predictions in 'congestion'.
test_df

In [None]:
# reset_index() turns the row_id index back into a regular column; only the id
# and the prediction are kept for the submission.
submission = test_df.reset_index().loc[:, ['row_id', 'congestion']]
submission.head()

In [None]:
# index=False: row_id is already a regular column, so the positional index is
# not written to the file.
submission.to_csv('submission.csv', index=False)