In [None]:
#Packages

import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

import dill # for serializing the pipeline and the trained model

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin

In [None]:
# Every competition file is read from the same input folder; naming it once
# keeps the three paths in sync and gives a single place to repoint the data.
DATA_DIR = "../input/tabular-playground-series-mar-2022"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
# The sample submission is loaded as a template; its `congestion` column is
# overwritten with model predictions at the end of the notebook.
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Target column on one side, every remaining column as predictors on the other.
y = train["congestion"]
X = train.drop(columns=["congestion"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the rows carry a `time` column, so a random split mixes
# timestamps between train and validation — confirm this is intended.
# NOTE(review): any identifier column present in train.csv stays in X and will
# be treated as a numeric feature downstream — verify against the data.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Maps each `direction` code to a two-element list [coord_0, coord_1].
# The values look like compass steps (east/west on the first element,
# north/south on the second) — confirm against the data description.
dir_dict = {
    'EB': [1, 0],
    'NB': [0, 1],
    'SB': [0, -1],
    'WB': [-1, 0],
    'NE': [1, 1],
    'SW': [-1, -1],
    'NW': [-1, 1],
    'SE': [1, -1],
}


def feature_engineering(data):
    """Return a copy of ``data`` with calendar, direction and location features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``time`` (anything ``pd.to_datetime``
        accepts), ``direction`` (keys of ``dir_dict``), ``x`` and ``y``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns (with ``time`` parsed to
        datetime) followed by the engineered columns, in the order they are
        created below. The caller's frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing, or ``direction`` contains a value
        that is not a key of ``dir_dict``.
    """
    # Work on a copy: the frames passed in are slices produced by
    # train_test_split or the raw test frame, and writing columns into them
    # would mutate the caller's data (and can raise SettingWithCopyWarning).
    data = data.copy()

    data['time'] = pd.to_datetime(data['time'])
    stamp = data['time'].dt

    # Plain calendar components.
    data['month'] = stamp.month
    data['weekday'] = stamp.weekday
    data['hour'] = stamp.hour
    data['minute'] = stamp.minute

    # Numeric encoding of the direction code; an unknown code raises KeyError.
    data['converted_direction_coord_0'] = data['direction'].map(lambda code: dir_dict[code][0])
    data['converted_direction_coord_1'] = data['direction'].map(lambda code: dir_dict[code][1])

    data['is_month_start'] = stamp.is_month_start.astype('int')
    data['is_month_end'] = stamp.is_month_end.astype('int')

    # Minutes elapsed since midnight.
    data['hour+minute'] = data['hour'] * 60 + data['minute']

    # weekday is 0 for Monday, so values above 4 are Saturday and Sunday.
    data['is_weekend'] = (data['weekday'] > 4).astype('int')
    # NOTE(review): strictly greater than 12, so the 12:00-12:59 hour is not
    # flagged as afternoon — confirm that is the intended cut-off.
    data['is_afternoon'] = (data['hour'] > 12).astype('int')

    # String-concatenated location keys; these become categorical columns.
    # NOTE(review): there is no separator between the parts, so multi-digit
    # coordinates could collide (x=1,y=12 and x=11,y=2 both give "112").
    xy = data['x'].astype('str') + data['y'].astype('str')
    data['x+y'] = xy
    data['x+y+direction'] = xy + data['direction'].astype('str')
    data['x+y+direction0'] = xy + data['converted_direction_coord_0'].astype('str')

    # Interaction features that were commented out in the original and are
    # still not generated: x+y+direction1, hour+direction, hour+x+y,
    # hour+direction+x, hour+direction+y, hour+direction+x+y, hour+x, hour+y.

    return data


def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn renamed the ``sparse`` keyword to ``sparse_output`` and later
    removed the old spelling, so the new name is tried first and the old one
    is used only when the installed version does not know the new keyword.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        # Older scikit-learn: only the original ``sparse`` keyword exists.
        return OneHotEncoder(sparse=False, handle_unknown="ignore")


def preprocess_customer_metrics( df ):
    """Engineer features, then fit and apply the encoding pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw predictor frame; it is passed through ``feature_engineering``.

    Returns
    -------
    tuple
        ``(X, full_pipeline)`` where ``X`` is the output of
        ``full_pipeline.fit_transform`` on the engineered frame (numeric
        columns followed by one-hot encoded object columns) and
        ``full_pipeline`` is the fitted ``FeatureUnion``, to be reused on
        validation and test data.
    """
    df_cleaned = feature_engineering(df)

    # The raw timestamp is not fed to the model; only the derived columns are.
    df_cleaned = df_cleaned.drop(columns=["time"])

    # Column roles are decided by dtype: numeric columns pass through
    # unchanged, object (string) columns are one-hot encoded.
    num_attribs = list(df_cleaned.select_dtypes(include=np.number).columns)
    cat_attribs = list(df_cleaned.select_dtypes(include=np.object_).columns)

    # NOTE(review): this class is local to the function, so the fitted
    # pipeline holds instances of a function-local class; that is presumably
    # why dill rather than pickle is used when the pipeline is saved.
    class DataFrameSelector(BaseEstimator, TransformerMixin):
        """Select a fixed list of DataFrame columns and return their values."""

        def __init__(self, attribute_names):
            self.attribute_names = attribute_names

        def fit(self, X, y=None):
            # Stateless: nothing is learned from the data.
            return self

        def transform(self, X):
            return X[self.attribute_names].values

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('one_hot_encoder', _dense_one_hot_encoder()),
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

    X = full_pipeline.fit_transform(df_cleaned)

    return X, full_pipeline

def save_pipeline_to_disk( full_pipeline, path ):
    """Serialize ``full_pipeline`` with dill into the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten.
    """
    with open(path, mode='wb') as sink:
        dill.dump(full_pipeline, sink)
        
def preprocess_validation_set( val, full_pipeline ):
    """Encode a held-out frame with an already fitted pipeline.

    ``val`` goes through ``feature_engineering``, its ``time`` column is
    dropped, and the result of ``full_pipeline.transform`` is returned. The
    pipeline is only applied here, never re-fitted.
    """
    engineered = feature_engineering(val)
    features = engineered.drop(columns=["time"])
    return full_pipeline.transform(features)


# Re-binds dir_dict to the same direction -> [coord_0, coord_1] mapping that is
# already defined ahead of feature_engineering earlier in this cell, so this
# second definition is redundant.
dir_dict = dict(
    EB=[1, 0],
    NB=[0, 1],
    SB=[0, -1],
    WB=[-1, 0],
    NE=[1, 1],
    SW=[-1, -1],
    NW=[-1, 1],
    SE=[1, -1],
)



In [None]:
# Fit the preprocessing pipeline on the training split and encode that split.
# NOTE(review): X_train is rebound from the raw DataFrame to the encoded output,
# so re-running this cell without first re-running the split cell is expected
# to fail (the function needs the original DataFrame columns).
X_train, full_pipeline = preprocess_customer_metrics( X_train )

In [None]:
# Persist the fitted preprocessing pipeline next to the notebook.
pipelines_folder = "./"
save_pipeline_to_disk(full_pipeline, f'{pipelines_folder}/pipeline.pickle')

In [None]:
# Encode the validation split with the pipeline fitted on the training split.
# NOTE(review): X_val is rebound to the encoded output, so this cell cannot be
# re-run on its own without re-creating the split first.
X_val = preprocess_validation_set(X_val, full_pipeline)

In [None]:
# Shell out to nvidia-smi to check the GPU before training; the model below is
# configured with tree_method="gpu_hist".
!nvidia-smi

In [None]:
# Gradient-boosted regressor with a generous tree budget; early stopping on the
# validation set (below) decides how many of the 10000 rounds are actually used.
# NOTE(review): tree_method="gpu_hist" requires a GPU-enabled XGBoost build, and
# newer XGBoost releases deprecate it — verify against the installed version.
model = XGBRegressor(n_estimators=10000, max_depth=6, tree_method = "gpu_hist")
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Validation split used both for the reported MAE and for early stopping.
eval_set = [(X_val, y_val)]

# Stop when validation MAE has not improved for 50 consecutive rounds.
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
# deprecated in newer XGBoost releases in favour of constructor arguments —
# confirm which form the installed version expects before changing it.
model.fit(X_train, y_train, eval_metric="mae", eval_set=eval_set, verbose=True, early_stopping_rounds=50)

In [None]:
def transform_data( df, full_pipeline):
    """Engineer features for ``df`` and encode them with a fitted pipeline.

    Returns whatever ``full_pipeline.transform`` produces for the engineered
    frame; the pipeline is applied as-is and is not re-fitted.
    """
    return full_pipeline.transform(feature_engineering(df))

In [None]:
# Encode the test frame with the pipeline fitted on the training split.
X_test = transform_data(test, full_pipeline)

In [None]:
# Predict congestion for every encoded test row and print the values and shape.
test_pred = model.predict(X_test)
print(f'Prediction for test data:\n{test_pred}\nShape = {test_pred.shape}')

In [None]:
# Display the prediction array (the same values were printed in the cell above).
test_pred

In [None]:
# Fill the submission template with the predictions and write it to disk.
# Assumes the sample submission rows are in the same order as test.csv — TODO confirm.
submission['congestion'] = test_pred
# NOTE(review): the file name mentions LightAutoML, but the predictions written
# here come from the XGBRegressor trained above — consider renaming the file.
submission.to_csv('LightAutoML_TabularAutoML.csv', index=False)
submission

In [None]:
# Summary statistics of the submission frame, as a sanity check on the predictions.
submission.describe()