# Importing Libraries and Loading datasets

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor

In [None]:
# Train and test share the same read options: `row_id` becomes the index and
# `time` is parsed into datetimes. The sample submission is read as-is.
read_options = dict(index_col="row_id", parse_dates=['time'])
train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', **read_options)
test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv', **read_options)
sub = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')

# Explore Data

In [None]:
# Show the first five rows of the training frame.
train.head(n=5)

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# List the column names of the training frame.
print("Columns: \n{0}".format(train.columns.tolist()))

# Basic Data Check

In [None]:
# Report (rows, columns) for both frames.
train_shape, test_shape = train.shape, test.shape
print('Train data shape:', train_shape)
print('Test data shape:', test_shape)

## Missing values

In [None]:
# Count NaNs per column, then print only the columns that actually contain
# missing values (an empty Series means nothing is missing).
# `isna().sum()` keeps one count per column; `isna().any().sum()` would
# collapse to a single scalar, which the `[... > 0]` filter cannot break down
# by column.
missing_values_train = train.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

## Duplicates

In [None]:
# Count rows that repeat an earlier row (first occurrence is not counted).
duplicates_train = train.duplicated(keep='first').sum()
duplicates_test = test.duplicated(keep='first').sum()

print('Duplicates in train data: {0}'.format(duplicates_train))

print('Duplicates in test data: {0}'.format(duplicates_test))

# Feature Engineering
## Credits to https://www.kaggle.com/martynovandrey/tps-mar-22-fe-model-selection

In [None]:
def add_road_feature(df):
    """Combine ``x``, ``y`` and ``direction`` into a single ``road`` key.

    The ``road`` column is written onto ``df`` itself (the caller's frame is
    modified), and a new frame without ``x``, ``y`` and ``direction`` is
    returned.
    """
    x_text = df['x'].astype(str)
    y_text = df['y'].astype(str)
    # Plain concatenation with no separator, e.g. x=0, y=2, 'EB' -> '02EB'.
    df['road'] = x_text + y_text + df['direction']
    return df.drop(columns=['x', 'y', 'direction'])
train, test = add_road_feature(train), add_road_feature(test)

# Encode the road strings as integers. The encoder is fitted on train only,
# so a road that appears only in test makes `transform` raise.
le = LabelEncoder()
le.fit(train['road'])
train['road'] = le.transform(train['road'])
test['road'] = le.transform(test['road'])

## Date and time features

In [None]:
def add_datetime_features(df):
    """Add calendar/clock columns derived from ``df['time']``, in place.

    Creates ``month``, ``weekday``, ``day``, ``hour`` and ``minute`` (in that
    order) on ``df``. Returns nothing.
    """
    stamps = df['time'].dt
    for part in ('month', 'weekday', 'day', 'hour', 'minute'):
        df[part] = getattr(stamps, part)
# The helper works in place, so there is nothing to reassign.
for frame in (train, test):
    add_datetime_features(frame)

## Congestion Min, Max, Median

In [None]:
# Historical congestion statistics per (road, weekday, hour, minute) slot,
# computed on the training data and attached to both frames.
group = ['road', 'weekday', 'hour', 'minute']
congestion = train.groupby(group).congestion
def add_feature(feature, feature_name):
    """Left-join one aggregated congestion table onto train and test.

    feature: frame holding the `group` key columns plus a `congestion`
        column with the aggregated value.
    feature_name: name the aggregated column gets in the merged frames.

    Returns the merged (train, test) pair; the module-level `train` and
    `test` are read but not modified here.
    """
    feature = feature.rename(columns={'congestion': feature_name})
    # NOTE(review): merging on columns yields frames with a fresh integer
    # index instead of row_id; the submission step seems to rely on that
    # when assigning predictions to `sub` - confirm before changing.
    return train.merge(feature, on=group, how='left'), test.merge(feature, on=group, how='left')
# The "min" feature must come from the group minimum (it previously reused
# the maximum, which made "min" an exact copy of "max").
train, test = add_feature(pd.DataFrame(congestion.min().astype(int)).reset_index(), "min")
train, test = add_feature(pd.DataFrame(congestion.max().astype(int)).reset_index(), "max")
train, test = add_feature(pd.DataFrame(congestion.median().astype(int)).reset_index(), "median")

## Drop unwanted columns

In [None]:
# The date parts and the raw timestamp are removed from both frames now that
# the aggregated congestion features have been merged in.
helper_columns = ['month', 'day', 'weekday', 'hour', 'minute', 'time']
train.drop(columns=helper_columns, inplace=True)
test.drop(columns=helper_columns, inplace=True)
train.head()

## Reduce memory usage

In [None]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in `candidates` whose range holds [c_min, c_max].

    `limits_of` is `np.iinfo` or `np.finfo`. Returns None when no candidate
    fits (also the case when c_min/c_max are NaN, since every comparison
    with NaN is False).
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the type's limit still fits.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of `df` in place to reduce its memory footprint.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    that can hold the column's minimum and maximum. Float columns are cast to
    float16 or float32 when their range allows it, otherwise to float64.
    Columns of any other dtype are converted to `category`.

    Note that float16 is chosen purely by value range; values inside that
    range can still lose precision when stored as float16.

    df: the DataFrame to shrink; it is modified in place.
    verbose: when True, print the resulting size and the percentage saved.

    Returns the same `df` object.
    """
    # 'int8' is listed so that columns which are already int8 are recognised
    # as numeric instead of being converted to category.
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            df[col] = df[col].astype('category')
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            target = _smallest_fitting_dtype(
                c_min, c_max, (np.int8, np.int16, np.int32, np.int64), np.iinfo
            )
            # No fit (e.g. an empty column): leave the column untouched.
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            target = _smallest_fitting_dtype(
                c_min, c_max, (np.float16, np.float32), np.finfo
            )
            # Anything that fits neither smaller float type stays float64.
            if target is None:
                target = np.float64
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
# Both frames are shrunk in place; the returned frame is the same object, so
# the result does not need to be reassigned.
reduce_mem_usage(train, verbose=True)
reduce_mem_usage(test, verbose=True)

# Modelling

In [None]:
# Target is the `congestion` column; every remaining train column is a predictor.
y = train['congestion']
X = train.drop(columns='congestion')
test_X = test

# CatBoost with default hyper-parameters and training output silenced.
model = CatBoostRegressor(silent=True)
model.fit(X, y)

# Wrap the raw prediction arrays in Series that carry the input frames' indexes.
train_predictions = pd.Series(data=model.predict(X), index=X.index)
test_predictions = pd.Series(data=model.predict(test_X), index=test_X.index)

# Submission

In [None]:
# Round the predictions to whole numbers before writing them out.
# Credits to https://www.kaggle.com/ambrosm/tpsmar22-don-t-forget-to-round
# NOTE(review): this assignment aligns `test_predictions` with `sub` by index
# label, not by position - confirm both carry the same index.
rounded_predictions = test_predictions.round()
sub["congestion"] = rounded_predictions.astype(int)
sub.to_csv('submission.csv', index=False)
sub