In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
import xgboost as xgb
from xgboost import XGBRFRegressor, XGBRegressor
%matplotlib inline

In [None]:
# Load the competition data; the first CSV column is used as the index.
train_df = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv", index_col=0)
test_df = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv", index_col=0)

# Report the (rows, columns) shape of each frame.
for caption, frame in (("Train shape: ", train_df), ("Test shape: ", test_df)):
    print(caption, frame.shape)

In [None]:
# Total number of missing values in the train and test frames, shown as a tuple.
train_df.isna().sum().sum(), test_df.isna().sum().sum()

In [None]:
# Number of fully duplicated rows in the train and test frames, shown as a tuple.
train_df.duplicated().sum(), test_df.duplicated().sum()

In [None]:
# Preview the first rows of the raw training data.
train_df.head()

In [None]:
# Combine x, y and direction into a single 'road' identifier, then drop the
# three source columns. The same steps are applied to both frames, in place.
road_parts = ['x', 'y', 'direction']
for frame in (train_df, test_df):
    as_text = frame[road_parts].astype(str)
    frame['road'] = as_text['x'] + as_text['y'] + as_text['direction']
    frame.drop(road_parts, axis=1, inplace=True)

In [None]:
# Feature engineering of the time data
def add_datetime_features(df):
    """Add calendar and time-of-day columns to ``df`` in place.

    The ``time`` column is parsed with ``pd.to_datetime`` and overwritten.
    The following columns are then added, in this order:

    - ``month``, ``day``, ``weekday``: calendar parts of ``time``
    - ``weekend``: 1 when ``weekday >= 5``, else 0
    - ``hour``, ``minute``: clock parts of ``time``
    - ``afternoon``: 1 when ``hour >= 12``, else 0
    - ``moment``: index of the 20-minute slot within the day
      (``hour * 3 + minute // 20``)

    Returns nothing; the frame passed in is modified.
    """
    df['time'] = pd.to_datetime(df['time'])
    stamp = df['time'].dt  # one accessor reused for every derived column

    df['month'] = stamp.month
    df['day'] = stamp.day
    df['weekday'] = stamp.weekday
    df['weekend'] = (stamp.weekday >= 5).astype(int)
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    df['afternoon'] = (stamp.hour >= 12).astype(int)

    # three 20-minute slots per hour
    df['moment'] = stamp.hour * 3 + stamp.minute // 20

In [None]:
# Add the datetime-derived columns to both frames (modified in place).
for frame in (train_df, test_df):
    add_datetime_features(frame)

In [None]:
# Morning rush hour: congestion summaries per (month, day, road) using only
# the rows whose hour is in [6, 12).
morning_keys = ['month', 'day', 'road']
mornings_df = train_df[(train_df.hour >= 6) & (train_df.hour < 12)]
morning_congestion = mornings_df.groupby(morning_keys)['congestion']

# Fix: the "avgs" feature was previously computed with median(), which made it
# identical to the median feature. Use the mean, rounded so the column stays
# an integer as before.
morning_avgs = (
    morning_congestion.mean().round().astype(int)
    .reset_index()
    .rename(columns={'congestion': 'morning_avgs'})
)

# Fix: this frame was previously produced by renaming morning_avgs, so it was a
# copy of that frame still carrying the 'morning_avgs' column name. Build it
# from the median aggregate so it has its own 'morning_medians' column.
morning_medians = (
    morning_congestion.median().astype(int)
    .reset_index()
    .rename(columns={'congestion': 'morning_medians'})
)

In [None]:
# Attach the morning rush-hour feature to both frames.
# A left join keeps every row of the frame being extended.
morning_keys = ['month', 'day', 'road']
train_df = train_df.merge(morning_avgs, how='left', on=morning_keys)
test_df = test_df.merge(morning_avgs, how='left', on=morning_keys)

In [None]:
# Training-set congestion statistics (min, max, median) for every
# (road, weekday, hour, minute) group. Each result is a flat frame with the
# four key columns plus one statistic column.
slot_congestion = train_df.groupby(['road', 'weekday', 'hour', 'minute'])['congestion']

mins = slot_congestion.min().astype(int).reset_index().rename(columns={'congestion': 'mins'})
maxs = slot_congestion.max().astype(int).reset_index().rename(columns={'congestion': 'maxs'})
medians = slot_congestion.median().astype(int).reset_index().rename(columns={'congestion': 'medians'})

In [None]:
# Left-join the per-slot statistics onto both frames, in the order
# mins -> maxs -> medians so the resulting column order is unchanged.
slot_keys = ['road', 'weekday', 'hour', 'minute']
for stats in (mins, maxs, medians):
    train_df = train_df.merge(stats, on=slot_keys, how='left')
for stats in (mins, maxs, medians):
    test_df = test_df.merge(stats, on=slot_keys, how='left')

In [None]:
# Preview the training data after the engineered columns were merged in.
train_df.head()

In [None]:
# Separate the target from the model features.
# Columns are excluded by name ('time' and the target) rather than by position
# (previously iloc[:, 2:]), so the selection does not depend on the column
# order of train_df.
target = 'congestion'
features = [col for col in train_df.columns if col not in ('time', target)]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object/category feature column.
# Fix: the encoder used to be re-fit on the test column, so the same category
# could get different codes in train and test. It is now fit once per column on
# the union of train and test values (so transform never meets an unseen label)
# and the same mapping is applied to both frames.
encoder = LabelEncoder()
categorical_cols = train_df[features].select_dtypes(include=['object', 'category']).columns.tolist()
for col in categorical_cols:
    encoder.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [None]:
# Table of each feature's correlation with the target, strongest positive first.
target_corr = train_df[features].corrwith(train_df[target])
corr_df = pd.DataFrame({'feature': features, 'correlation': target_corr.values})
corr_df = corr_df.sort_values(by='correlation', ascending=False)

corr_df

In [None]:
# Hand-picked feature subset that is fed to the PCA step.
importance_features = ['medians', 'mins', 'morning_avgs', 'maxs', 'afternoon']

In [None]:
# Work on copies of just the columns PCA needs (avoids copying whole frames).
y_pca = train_df['congestion'].copy()
X_tr = train_df[importance_features].copy()
X_te = test_df[importance_features].copy()

# Principal component analysis on the selected features.
# Fix: PCA used to be re-fit on the test set (fit_transform), which put the
# test components in a different basis from the train components. The
# projection is now learned on train only and re-used for test.
pca = PCA(n_components=len(importance_features))
X_pca = pca.fit_transform(X_tr)
X_te_pca = pca.transform(X_te)

# Convert to dataframes. The source frame's index is reused so the axis=1
# concat below lines rows up explicitly instead of relying on both sides
# happening to share a default RangeIndex.
component_names = [f"PCA{i+1}" for i in range(X_pca.shape[1])]

X_pca = pd.DataFrame(X_pca, columns=component_names, index=train_df.index)
X_te_pca = pd.DataFrame(X_te_pca, columns=component_names, index=test_df.index)

# Add the component columns to the train and test sets.
train_df = pd.concat([train_df, X_pca], axis=1)
test_df = pd.concat([test_df, X_te_pca], axis=1)

In [None]:
# Correlation of each principal component with the target.
X_pca.corrwith(train_df[target])

In [None]:
# Candidate feature list: the existing features followed by the PCA components.
new_features = features + component_names

In [None]:
# Rebuild the correlation table for the extended feature list, this time using
# the absolute correlation with the target, sorted strongest first.
abs_corr = train_df[new_features].corrwith(train_df[target]).abs()
corr_df = pd.DataFrame({'feature': new_features, 'correlation': abs_corr.values})
corr_df = corr_df.sort_values(by='correlation', ascending=False)

corr_df

In [None]:
# Keep only the features whose absolute correlation with the target exceeds 0.008
features = corr_df[corr_df['correlation']>0.008]['feature'].values.tolist()

In [None]:
# Reduce the memory
def reduce_memory(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Signed-integer columns become the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max. Float columns become float16 or
    float32 when their min and max lie within that type's finite range, and
    float64 otherwise. Columns of any other dtype are left untouched.

    Fixes over the previous version:
    - floats were checked against the float32 range but cast to float16, which
      turned values beyond +/-65504 into inf; each candidate type is now
      checked against its own limits;
    - bounds are inclusive, so a value equal to a type's min/max still fits;
    - the reduction percentage no longer divides by zero when the frame
      reports zero memory usage.

    Note: float16 keeps only about three significant decimal digits, so
    in-range float columns still lose precision when downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum()/1024**2

    for col in df.columns:
        col_dtype = df[col].dtypes
        if col_dtype not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_dtype)[:3] == 'int':
            # Narrowest integer type first. If min/max are NaN (empty column)
            # no candidate matches and the column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: use full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum()/1024**2
    if verbose:
        reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
        print("Mem decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}%) reduction".format(
            start_mem, end_mem, reduction))

    return df

In [None]:
# Downcast numeric columns of both frames (train first, then test).
train_df, test_df = (reduce_memory(frame) for frame in (train_df, test_df))

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error

# Scale the selected features to [0, 1] based on the training data.
# Fix: the scaler used to be re-fit on the test set, so the same raw value
# could map to different scaled values in train and test. The min/max are now
# learned from train only and applied unchanged to test.
scaler = MinMaxScaler()
train_df[features] = scaler.fit_transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

In [None]:
# Preview the selected feature columns after scaling.
train_df[features].head()

In [None]:
# Parameter dictionary for XGBoost.
# NOTE(review): the cross-validation cell builds XGBRegressor(random_state=1223)
# without passing this dictionary, so these settings do not appear to take
# effect anywhere in the visible notebook — confirm before relying on them.
xgb_params = {
    'eta': 1e-3,
    'objective': 'reg:pseudohubererror',
    'eval_metric': 'mae',
    'seed': 1223,
    'nthread': 4
}

In [None]:
# Stratified k-fold cross-validation of an XGBoost regressor (XGBRegressor).
# Each fold's model also predicts the test set; those predictions are
# collected in xgb_preds so they can be averaged afterwards.
N_SPLITS = 10
N_REPEATS = 3
X, y = train_df[features], train_df[target]
X_test = test_df[features]  # loop-invariant, so sliced once instead of per fold

# Only `strfolds` drives the loop below; `kfold` and `repeats` are alternative
# splitters that are defined but not used in this cell.
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=1223)
strfolds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=1223)
repeats = RepeatedStratifiedKFold(n_repeats=N_REPEATS, n_splits=N_SPLITS, random_state=1223)
xgb_preds = []
for fold, (tr_idx, val_idx) in enumerate(tqdm(strfolds.split(X, y), total=N_SPLITS)):
    x_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    x_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    xgb_model = XGBRegressor(random_state=1223).fit(x_train, y_train)

    preds_0 = xgb_model.predict(x_train)
    preds_1 = xgb_model.predict(x_valid)

    # scikit-learn metrics take (y_true, y_pred); the arguments used to be
    # swapped. MSE is symmetric so the printed numbers are unchanged, but the
    # order now stays correct if the metric is replaced.
    train_mse = mean_squared_error(y_train, preds_0)
    valid_mse = mean_squared_error(y_valid, preds_1)
    print(f"{fold+1}:Training mse: {train_mse:.4f}\tvalidation mse: {valid_mse:.4f}")
    xgb_preds.append(xgb_model.predict(X_test))

In [None]:
# Average the per-fold test predictions and round to whole numbers.
# Dividing by len(xgb_preds) rather than N_SPLITS keeps the mean correct even
# if the number of fitted models differs from N_SPLITS (e.g. repeated CV).
predictions = np.round(sum(xgb_preds) / len(xgb_preds), 0)

In [None]:
# Load the sample submission file to use as the template for the output.
sample = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv")

In [None]:
# Overwrite the template's congestion column with the averaged predictions.
# NOTE(review): assigned positionally — assumes test rows are in the same order as the sample file; confirm.
sample['congestion'] = predictions

In [None]:
# Write the submission file, then read it back to display its first rows.
sample.to_csv('submission.csv', index=False)
pd.read_csv('submission.csv').head()