In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training data, the test data and the sample submission in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-mar-2022/train.csv",
        "/kaggle/input/tabular-playground-series-mar-2022/test.csv",
        "/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv",
    )
)

In [None]:
# (rows, columns) of the training frame as loaded from train.csv.
train.shape

In [None]:
# Peek at the first rows of the training frame.
train.head()

In [None]:
# Peek at the last rows of the training frame.
train.tail()

In [None]:
# Column dtypes and non-null counts before any preprocessing.
train.info()

In [None]:
# Parse the raw `time` strings into datetimes on both frames.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'])

In [None]:
# Distinct `direction` values in train, then in test.
for frame in (train, test):
    print(frame.direction.unique())

In [None]:
# Distinct `x` values in train, then in test.
for frame in (train, test):
    print(frame.x.unique())

In [None]:
# Distinct `y` values in train, then in test.
for frame in (train, test):
    print(frame.y.unique())

In [None]:
# Histogram with a KDE overlay of the `congestion` column in the training frame.
sns.displot(
    data=train,
    x='congestion',
    kind="hist",
    kde=True,
    height=7,
)

In [None]:
# Re-check dtypes now that `time` has been parsed with pd.to_datetime.
train.info()

In [None]:
# Per-column count of missing values in train, then in test.
for frame in (train, test):
    print(frame.isna().sum())


In [None]:
# deriving new features from date
def new_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar/clock features derived from the datetime column ``time``.

    The frame is modified in place (existing callers rely on this) and is also
    returned so the function can be chained or used with ``DataFrame.pipe``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime64 ``time`` column (the ``.dt`` accessor is used).

    Returns
    -------
    pd.DataFrame
        The same object as ``df``, with the columns ``year``, ``month``,
        ``day``, ``day_of_year``, ``day_of_month``, ``day_of_week``,
        ``weekday``, ``hour`` and ``minutes`` added.

    Raises
    ------
    KeyError
        If ``df`` has no ``time`` column.
    AttributeError
        If ``time`` is not datetime-like.
    """
    stamp = df['time']
    df['year'] = stamp.dt.year
    df['month'] = stamp.dt.month
    df['day'] = stamp.dt.day
    df['day_of_year'] = stamp.dt.dayofyear
    # NOTE: despite its name this column holds the number of days in the month
    # (28-31), not the day within the month; that value is already in `day`.
    # The column name is kept so the engineered feature set is unchanged.
    df['day_of_month'] = stamp.dt.days_in_month
    df['day_of_week'] = stamp.dt.dayofweek
    # NOTE: `.dt.weekday` is an alias of `.dt.dayofweek`, so this column is an
    # exact duplicate of `day_of_week`; kept for backward compatibility.
    df['weekday'] = stamp.dt.weekday
    df['hour'] = stamp.dt.hour
    df['minutes'] = stamp.dt.minute
    return df
    
# Derive the same date features on both frames (each is modified in place).
for frame in (train, test):
    new_date_features(frame)

In [None]:
# dropping time and row id from both train and test
# `row_id` and the raw `time` column are removed from both frames;
# the date features derived from `time` stay.
cols_to_drop = ['row_id', 'time']
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
# def reduce_memory_usage(df):
    
#     start_memory = df.memory_usage().sum() / 1024**2
#     print(f"Memory usage of dataframe is {start_memory} MB")
    
#     for col in df.columns:
#         col_type = df[col].dtype
        
#         if col_type != 'object':
#             c_min = df[col].min()
#             c_max = df[col].max()
            
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)
            
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     pass
#         else:
#             df[col] = df[col].astype('category')
    
#     end_memory = df.memory_usage().sum() / 1024**2
#     print(f"Memory usage of dataframe after reduction {end_memory} MB")
#     print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
#     return df

In [None]:
# train = reduce_memory_usage(train)
# test = reduce_memory_usage(test)

In [None]:
# Columns and dtypes of the test frame after feature engineering and column drops.
test.info()

In [None]:
!pip install pycaret

In [None]:
from pycaret.regression import *

In [None]:
 # Fix the random seed (session_id) so the train/hold-out split, CV folds and
 # hyperparameter search are reproducible across "Restart & Run All".
 s = setup(
     train,
     target='congestion',
     fold=5,
     use_gpu=True,
     silent=True,
     session_id=42,
 )

In [None]:
# Rank candidates by MAE, the same metric tune_model optimises below;
# compare_models otherwise sorts by R2.
# Extra Trees ('et') is excluded because it was crashing the notebook due to memory limitations.
best = compare_models(exclude=['et'], sort='MAE')

In [None]:
# Hyperparameter search on the best model, scored by MAE.
# n_iter can be increased for better results i.e better hyperparameters.
tuned_rf = tune_model(
    best,
    optimize='MAE',
    n_iter=10,
    choose_better=True,
)

In [None]:
# Show the tuned estimator and its hyperparameters. The name says "rf", but the
# estimator is whatever compare_models selected as `best`.
print(tuned_rf)

In [None]:
# PyCaret's evaluation display for the tuned model (presumably an interactive widget — confirm in the PyCaret docs).
evaluate_model(tuned_rf)

In [None]:
# Residuals plot for the tuned model.
plot_model(tuned_rf, plot = 'residuals')

In [None]:
# plot_model(rf, plot = 'feature')

In [None]:
# finalize_model returns a NEW model refit on the full dataset (hold-out
# included); it does not update its argument. Rebind the name so the refit
# model is the one used for the predictions that follow.
tuned_rf = finalize_model(tuned_rf)

In [None]:
# Predict on the competition test frame; the predictions are read from the
# 'Label' column of the result in a later cell.
unseen_predictions = predict_model(tuned_rf, data=test)


In [None]:
# Display the prediction frame (pandas truncates long frames when rendering).
unseen_predictions

In [None]:
# Thanks to @ambrosm for the idea of rounding the congestion

# Snap predictions to the nearest integer (congestion is presumably
# integer-valued — confirm against train). Rounding to nearest is used instead
# of np.ceil, which always rounds up and biases every prediction upward.
unseen_predictions['Label'] = unseen_predictions['Label'].round()
unseen_predictions.head()

In [None]:
# `row_id` was dropped from `test`, so the ids come from the sample submission.
# All three frames must line up row-for-row; check this explicitly rather than
# letting a length mismatch silently truncate the submission.
n_predictions = len(unseen_predictions)
assert len(test.index) == n_predictions, "test and prediction row counts differ"
assert len(submission) == n_predictions, "sample submission and prediction row counts differ"

submission = pd.DataFrame({
    'row_id': submission['row_id'].to_numpy(),
    'congestion': unseen_predictions['Label'].to_numpy(),
})
submission.to_csv('base.csv', index = False)
submission.head()

In [None]:
# Histogram with a KDE overlay of the submitted `congestion` values.
sns.displot(
    data=submission,
    x='congestion',
    kind="hist",
    kde=True,
    height=7,
)