# Tabular Playground JULY 2021

The goal of these competitions is to provide a fun tabular dataset that is approachable for anyone.

## Evaluation Scheme
The RMSLE for a single column is calculated as:

$$\sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 }$$

where:

$n$ is the total number of observations

$p_i$ is your prediction

$a_i$ is the actual value

$\log(x)$ is the natural logarithm of $x$


# Importing Dependencies

- pandas : for csv reading and data analysis
- numpy :  for array manipulation
- matplotlib.pyplot : for plotting graphs
- os : os level commands
- seaborn : for better looking plots

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [None]:
# path to dataset folder
dataset_path = "../input/tabular-playground-series-jul-2021"

In [None]:
os.listdir(dataset_path)

# Basic Data Analysis

In [None]:
train_csv = pd.read_csv(os.path.join(dataset_path, "train.csv"))
train_csv.head()

In [None]:
test_csv = pd.read_csv(os.path.join(dataset_path, "test.csv"))
test_csv.head()

In [None]:
submission_csv = pd.read_csv(os.path.join(dataset_path, "sample_submission.csv"))
submission_csv.head()

In [None]:
# train data shape
train_csv.shape

In [None]:
# check if any value in dataframe is null
train_csv.isnull().sum()

In [None]:
train_csv.describe()

In [None]:
train_csv.columns

In [None]:
print("Range of datetime in training data")
print(train_csv["date_time"].min())
print("to")
print(train_csv["date_time"].max())

In [None]:
print("Range of datetime in test data")
print(test_csv["date_time"].min())
print("to")
print(test_csv["date_time"].max())

# Data Visualization
- plot time series of target values
- plot time series sensor data

### Plotting targets

In [None]:
train_csv.plot(x="date_time", y="target_carbon_monoxide", rot=50)

In [None]:
train_csv.plot(x="date_time", y="target_benzene", rot=50)

In [None]:
train_csv.plot(x="date_time", y="target_nitrogen_oxides", rot=50)

### Plotting sensor data

In [None]:
train_csv.plot(x="date_time", y="sensor_1", rot=50)

In [None]:
train_csv.plot(x="date_time", y="sensor_2", rot=50)

In [None]:
train_csv.plot(x="date_time", y="sensor_3", rot=50)

In [None]:
train_csv.plot(x="date_time", y="sensor_4", rot=50)

In [None]:
train_csv.plot(x="date_time", y="sensor_5", rot=50)

# Feature Engineering

- changing datatypes of date column to pd datetime
- convert datetime to features
- extracting phase of day with respect to time
- extracting season of the year using month
- ratio of relative humidity and temperature

In [None]:
# convert column dtypes in place
def change_dtypes(df):
    """Parse the ``date_time`` column of ``df`` into pandas datetimes.

    The frame is modified in place and nothing is returned.
    """
    column = "date_time"
    parsed = pd.to_datetime(df[column])
    df[column] = parsed

In [None]:
change_dtypes(train_csv)

In [None]:
# extracting features using datetime
def datetime2features(df):
    """Add calendar features derived from ``date_time`` to ``df`` in place.

    ``df["date_time"]`` must already hold datetimes, because the ``.dt``
    accessor is used. The columns written are ``year``, ``month``, ``day``,
    ``hour``, ``dayofweek`` (pandas convention: Monday=0 ... Sunday=6) and
    ``weekend`` (1 when ``dayofweek`` is greater than 4, otherwise 0).
    Nothing is returned.
    """
    stamps = df["date_time"].dt
    df["year"] = stamps.year
    df["month"] = stamps.month
    df["day"] = stamps.day
    df["hour"] = stamps.hour
    df["dayofweek"] = stamps.dayofweek
    # Vectorised comparison instead of a per-row lambda. The explicit int64
    # cast keeps the dtype the element-wise version produced.
    df["weekend"] = (stamps.dayofweek > 4).astype("int64")

In [None]:
def time_phase(df):
    """Add a ``phase`` column encoding which quarter of the day a row is in.

    The hour of ``df["date_time"]`` is mapped to one of four six-hour phases:
    1 for hours 0-5, 2 for 6-11, 3 for 12-17 and 4 for 18-23. Any other value
    (for example the NaN hour of a missing timestamp) maps to NaN.
    ``df`` is modified in place and nothing is returned.
    """
    def which_phase(hour):
        if 0 <= hour <= 5:
            return 1
        if 6 <= hour <= 11:
            return 2
        if 12 <= hour <= 17:
            return 3
        if 18 <= hour <= 23:
            return 4
        # The bare name ``NaN`` is not defined anywhere; use numpy's NaN so
        # this fallback yields a missing value instead of raising NameError.
        return np.nan

    df["phase"] = df["date_time"].dt.hour.apply(which_phase)

In [None]:
datetime2features(train_csv)
time_phase(train_csv)
train_csv.head()

In [None]:
train_csv.describe()

In [None]:
def season(df):
    """Add a ``season`` column derived from the month of ``date_time``.

    Encoding: 1 for March-June, 2 for July-September, 3 for October-February
    (the three codes stand for summer, rainy and winter respectively). A
    month that matches none of the ranges (for example the NaN month of a
    missing timestamp) maps to NaN. ``df`` is modified in place and nothing
    is returned.
    """
    def which_season(month):
        if 3 <= month <= 6:
            return 1
        if 7 <= month <= 9:
            return 2
        if 10 <= month <= 12 or month < 3:
            return 3
        # The bare name ``NaN`` is not defined anywhere; use numpy's NaN so
        # this fallback yields a missing value instead of raising NameError.
        return np.nan

    df["season"] = df["date_time"].dt.month.apply(which_season)

In [None]:
season(train_csv)
train_csv.head()

In [None]:
train_csv.describe()

In [None]:
def ratio_rh_temp(df):
    """Add ``r_rh_temp``: relative humidity divided by temperature.

    A small constant (1e-9) is added to ``deg_C`` before dividing so that a
    temperature of exactly zero does not divide by zero. ``df`` is modified
    in place and nothing is returned.
    """
    epsilon = 1e-9
    shifted_temp = df["deg_C"] + epsilon
    df["r_rh_temp"] = df["relative_humidity"] / shifted_temp

In [None]:
ratio_rh_temp(train_csv)
train_csv.head()

In [None]:
train_csv.describe()

### Plotting correlations between columns

In [None]:
plt.figure(figsize=(10,8))
# Correlate numeric columns only: `date_time` has been converted to a datetime
# column by this point, and pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr(). select_dtypes works on old and new pandas.
sns.heatmap(train_csv.select_dtypes(include="number").corr())

In [None]:
model_save_folder = "models"
csv_folder = "csv"

In [None]:
os.makedirs(model_save_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)

In [None]:
train_csv.to_csv(os.path.join(csv_folder, "train_edit.csv"), index=False)

In [None]:
test_csv.describe()

In [None]:
change_dtypes(test_csv)
datetime2features(test_csv)
time_phase(test_csv)
season(test_csv)
ratio_rh_temp(test_csv)

In [None]:
test_csv.describe()

In [None]:
test_csv.to_csv(os.path.join(csv_folder, "test_edit.csv"), index=False)

# Model Training

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, LogisticRegression
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import sklearn.metrics as metrics
import sklearn.model_selection as ms
import pickle

In [None]:
# Cross validation utility
class CrossValidation:
    """Hold-out and k-fold splitting helpers around a single DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    shuffle : bool
        When truthy, the rows are shuffled once up front (with the index
        reset) and the k-fold splitters shuffle as well.
    random_state : int or None
        Seed for shuffling. It is discarded (set to None) when ``shuffle``
        is falsy, because the sklearn splitters reject a seed without
        shuffling.
    """

    def __init__(self, df, shuffle, random_state=None):
        self.df = df
        self.shuffle = shuffle
        # A single truthiness test keeps the two settings consistent: either
        # we shuffle with the seed, or we neither shuffle nor keep a seed.
        self.random_state = random_state if shuffle else None
        if shuffle:
            self.df = df.sample(
                frac=1, random_state=self.random_state
            ).reset_index(drop=True)

    def hold_out_split(self, percent, stratify=None):
        """Split into ``(train, val)`` with ``percent`` % of rows in ``val``.

        With ``stratify`` (a column name) the split is delegated to
        ``train_test_split`` stratified on that column. Otherwise the last
        ``percent`` % of rows, by position, become the validation part.
        """
        if stratify is not None:
            y = self.df[stratify]
            train, val = ms.train_test_split(
                self.df, test_size=percent / 100,
                stratify=y, random_state=self.random_state)
            return train, val
        size = len(self.df) - int(len(self.df) * (percent / 100))
        train = self.df.iloc[:size, :]
        val = self.df.iloc[size:, :]
        return train, val

    def kfold_split(self, splits, stratify=None):
        """Yield ``(train, val)`` DataFrame pairs for ``splits`` folds.

        With ``stratify`` (a column name) ``StratifiedKFold`` is used on that
        column; otherwise a plain ``KFold`` is used.
        """
        shuffle = bool(self.shuffle)
        if stratify is not None:
            # ``shuffle`` must be passed together with ``random_state``:
            # StratifiedKFold raises ValueError when a seed is given while
            # shuffle is left at its default of False.
            kf = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle,
                                    random_state=self.random_state)
            index_pairs = kf.split(X=self.df, y=self.df[stratify])
        else:
            kf = ms.KFold(n_splits=splits, shuffle=shuffle,
                          random_state=self.random_state)
            index_pairs = kf.split(X=self.df)
        for train_idx, val_idx in index_pairs:
            yield self.df.iloc[train_idx, :], self.df.iloc[val_idx, :]

In [None]:
# mean squared error of predicted data
def mse(y_true, y_pred):
    """Return ``metrics.mean_squared_error`` of ``y_pred`` against ``y_true``."""
    error = metrics.mean_squared_error(y_true, y_pred)
    return error

In [None]:
folds = 5
seed = 48

In [None]:
features_exclude = ["date_time"]
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

In [None]:
features = [col for col in train_csv.columns if col not in features_exclude+targets]
print(features)

### Splitting data into folds for cross validation

In [None]:
cv = CrossValidation(train_csv, shuffle=True, random_state=seed)

In [None]:
fold_models = {tar:[] for tar in targets}
print(fold_models)

# Training

In [None]:
# Seed every base learner so repeated runs give the same models; previously
# the random forest and gradient boosting learners were left unseeded.
m_rfg = RandomForestRegressor(n_estimators=100, random_state=seed)
m_gb = GradientBoostingRegressor(n_estimators=100, random_state=seed)
m_lgb = lgb.LGBMRegressor(seed=seed)
m_ctb = cbt.CatBoostRegressor(random_seed=seed, verbose=False)
m_xgb = xgb.XGBRegressor(random_state=seed)

learners = (m_rfg, m_gb ,m_lgb, m_ctb, m_xgb)

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. On newer versions this argument has to be dropped and the inputs
# scaled explicitly; confirm the pinned scikit-learn version.
meta_model = BayesianRidge(normalize=True)

In [None]:
def train_step(X, Y, evalX, evalY, learners, meta_model, verbose=True):
    """Fit one stacked regressor and score it on training and evaluation data.

    A ``StackingCVRegressor`` is built from ``learners`` (base regressors)
    and ``meta_model`` (meta regressor) with ``n_jobs=-1``, then fitted on
    the ``.values`` of ``X`` and ``Y``. MSE and R2 are computed on the
    training data and on ``evalX`` / ``evalY``, and printed when ``verbose``
    is truthy.

    Returns a dict with the fitted ``"model"`` plus ``"train_scores"`` and
    ``"val_scores"``, each a dict with keys ``"r2"`` and ``"mse"``.
    """
    stack = StackingCVRegressor(
        regressors=learners,
        meta_regressor=meta_model,
        n_jobs=-1,
        verbose=int(verbose),
    )
    fit_features = X.values
    fit_targets = Y.values

    model = stack.fit(fit_features, fit_targets)

    # Scores on the data the model was fitted on.
    fit_pred = model.predict(fit_features)
    train_mse = mse(fit_targets, fit_pred)
    train_r2 = metrics.r2_score(fit_targets, fit_pred)
    if verbose:
        print("Training mse: ", train_mse)
        print("Training r2: ", train_r2)

    # Scores on the held-out evaluation data.
    eval_features = evalX.values
    eval_targets = evalY.values
    eval_pred = model.predict(eval_features)
    val_mse = mse(eval_targets, eval_pred)
    val_r2 = metrics.r2_score(eval_targets, eval_pred)
    if verbose:
        print("Validation mse: ", val_mse)
        print("Validation r2: ", val_r2)

    train_scores = {"r2": train_r2, "mse": train_mse}
    val_scores = {"r2": val_r2, "mse": val_mse}
    return {"model": model,
            "train_scores": train_scores,
            "val_scores": val_scores}

In [None]:
def train_folds(cv, feature_cols, target_col, num_folds, learners, meta_model,
                verbose=True):
    """Run ``train_step`` on every fold from ``cv`` and average the scores.

    ``cv.kfold_split(splits=num_folds)`` supplies the ``(train, val)`` frames.
    ``feature_cols`` and ``target_col`` select the model inputs and target
    from each frame.

    Returns a dict with ``"models"`` (one fitted model per fold, in fold
    order) and ``"train_scores"`` / ``"val_scores"``, each holding the mean
    ``"r2"`` and ``"mse"`` over the folds.
    """
    per_fold = [
        train_step(X=fit_part[feature_cols],
                   Y=fit_part[target_col],
                   evalX=val_part[feature_cols],
                   evalY=val_part[target_col],
                   learners=learners,
                   meta_model=meta_model,
                   verbose=verbose)
        for fit_part, val_part in cv.kfold_split(splits=num_folds)
    ]

    def fold_mean(split, metric):
        # Average one metric of one split over all folds.
        return np.mean([outcome[split][metric] for outcome in per_fold])

    return {"models": [outcome["model"] for outcome in per_fold],
            "train_scores": {"r2": fold_mean("train_scores", "r2"),
                             "mse": fold_mean("train_scores", "mse")},
            "val_scores": {"r2": fold_mean("val_scores", "r2"),
                           "mse": fold_mean("val_scores", "mse")}
           }

### Carbon Monoxide

In [None]:
# Train one stacked model per fold for the carbon monoxide target and keep
# the fitted fold models for later blending.
target = "target_carbon_monoxide"
results = train_folds(cv, features, target, folds, learners, meta_model)
fold_models[target] = results["models"]

# The scores printed below are means over the folds.
print("="*50)
print("Training MSE: ", results["train_scores"]["mse"])
print("Training R2: ", results["train_scores"]["r2"])
print("Validation MSE: ", results["val_scores"]["mse"])
print("Validation R2: ", results["val_scores"]["r2"])

### Benzene

In [None]:
# Train one stacked model per fold for the benzene target and keep the
# fitted fold models for later blending.
target = "target_benzene"
results = train_folds(cv, features, target, folds, learners, meta_model)
fold_models[target] = results["models"]
# The scores printed below are means over the folds.
print("="*50)
print("Training MSE: ", results["train_scores"]["mse"])
print("Training R2: ", results["train_scores"]["r2"])
print("Validation MSE: ", results["val_scores"]["mse"])
print("Validation R2: ", results["val_scores"]["r2"])

### Nitrogen Oxides

In [None]:
# Train one stacked model per fold for the nitrogen oxides target and keep
# the fitted fold models for later blending.
target = "target_nitrogen_oxides"
results = train_folds(cv, features, target, folds, learners, meta_model)
fold_models[target] = results["models"]
# The scores printed below are means over the folds.
print("="*50)
print("Training MSE: ", results["train_scores"]["mse"])
print("Training R2: ", results["train_scores"]["r2"])
print("Validation MSE: ", results["val_scores"]["mse"])
print("Validation R2: ", results["val_scores"]["r2"])

### Prediction blending from folds

In [None]:
def get_weights(predictions, targets, apply_softmax=True):
    """Derive blending weights for a set of per-model predictions.

    A ``LinearRegression`` of ``targets`` on ``predictions`` is fitted and its
    coefficients (one per column of ``predictions``) are returned. Only
    ``coef_`` is used; the fitted intercept is discarded.

    When ``apply_softmax`` is true the coefficients are passed through a
    softmax, so the returned weights are positive and sum to 1. Otherwise the
    raw coefficients are returned.
    """
    def softmax(x):
        # Subtracting the maximum before exponentiating leaves the result
        # mathematically unchanged but prevents overflow (inf / nan weights)
        # when a coefficient is large.
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    lnr_model = LinearRegression().fit(predictions, targets)
    if apply_softmax:
        return softmax(lnr_model.coef_)
    return lnr_model.coef_

In [None]:
def weighted_sum(predictions, weights):
    """Return the dot product of ``predictions`` and ``weights``.

    With ``predictions`` of shape (rows, models) and one weight per model,
    this is the weighted blend of the model predictions for every row.
    """
    blended = np.dot(predictions, weights)
    return blended

In [None]:
trainX = train_csv[features].values
trainY = train_csv[targets]

In [None]:
predictions = []

In [None]:
# Predict the full training matrix with every fold model of the first target.
preds = []
for model in fold_models[targets[0]]:
    preds.append(model.predict(trainX))

# Stack the fold predictions; transposed below so each column holds one fold
# model's predictions (assumes each predict() returns a 1-D array).
preds = np.array(preds)

# Blending weights are fitted against the training targets. Note that each
# fold model was itself trained on part of these same rows.
weights_0 = get_weights(preds.transpose(), trainY[targets[0]].values)
print("Fold Predictions Weightings")
print(weights_0)

# Collapse the fold predictions into one blended prediction per row.
preds = weighted_sum(preds.transpose(), weights_0)
predictions.append(preds)

In [None]:
# Predict the full training matrix with every fold model of the second target.
preds = []
for model in fold_models[targets[1]]:
    preds.append(model.predict(trainX))

# Stack the fold predictions; transposed below so each column holds one fold
# model's predictions (assumes each predict() returns a 1-D array).
preds = np.array(preds)
# Blending weights are fitted against the training targets. Note that each
# fold model was itself trained on part of these same rows.
weights_1 = get_weights(preds.transpose(), trainY[targets[1]].values)
print("Fold Predictions Weightings")
print(weights_1)

# Collapse the fold predictions into one blended prediction per row.
preds = weighted_sum(preds.transpose(), weights_1)
predictions.append(preds)

In [None]:
# Predict the full training matrix with every fold model of the third target.
preds = []
for model in fold_models[targets[2]]:
    preds.append(model.predict(trainX))

# Stack the fold predictions; transposed below so each column holds one fold
# model's predictions (assumes each predict() returns a 1-D array).
preds = np.array(preds)
# Blending weights are fitted against the training targets. Note that each
# fold model was itself trained on part of these same rows.
weights_2 = get_weights(preds.transpose(), trainY[targets[2]].values)
print("Fold Predictions Weightings")
print(weights_2)

# Collapse the fold predictions into one blended prediction per row.
preds = weighted_sum(preds.transpose(), weights_2)
predictions.append(preds)

In [None]:
predictions = np.array(predictions).transpose()
print(predictions.shape)
print(trainY.shape)

In [None]:
predictions = np.where(predictions<0, 0, predictions)

In [None]:
print("R2 score: ", metrics.r2_score(trainY, predictions))
# RMSLE is defined per target column, so take the square root of each
# column's MSLE first and then average. The previous form took the square
# root of the column-averaged MSLE, which is a different quantity.
# NOTE(review): confirm that the competition score is the mean of the
# column-wise RMSLE values.
print("RMSLE score: ", np.mean(np.sqrt(
    metrics.mean_squared_log_error(trainY, predictions, multioutput="raw_values"))))

# Inference on test dataset

In [None]:
testX = test_csv[features].values

In [None]:
predictions = []

In [None]:
# Predict the test matrix with every fold model of the first target.
preds = []
for model in fold_models[targets[0]]:
    preds.append(model.predict(testX))

preds = np.array(preds)

# Blend the fold predictions with the weights fitted on the training data.
preds = weighted_sum(preds.transpose(), weights_0)
predictions.append(preds)

In [None]:
# Predict the test matrix with every fold model of the second target.
preds = []
for model in fold_models[targets[1]]:
    preds.append(model.predict(testX))

preds = np.array(preds)

# Blend the fold predictions with the weights fitted on the training data.
preds = weighted_sum(preds.transpose(), weights_1)
predictions.append(preds)

In [None]:
# Predict the test matrix with every fold model of the third target.
preds = []
for model in fold_models[targets[2]]:
    preds.append(model.predict(testX))

preds = np.array(preds)

# Blend the fold predictions with the weights fitted on the training data.
preds = weighted_sum(preds.transpose(), weights_2)
predictions.append(preds)

In [None]:
predictions = np.array(predictions).transpose()

In [None]:
predictions.shape

In [None]:
predictions = np.where(predictions<0, 0, predictions)

In [None]:
submission_csv[targets] = predictions

In [None]:
submission_csv.to_csv("submission.csv", index=False)

In [None]:
submission_csv.head()

In [None]:
# Pickle every fold model to `model_save_folder`, one file per target and
# fold, named "<target>_fold_<n>.pkl" with n starting at 1.
for key, models in fold_models.items():
    for fold, model in enumerate(models):
        with open(os.path.join(model_save_folder, f"{key}_fold_{fold+1}.pkl"), 'wb') as pckl:
            pickle.dump(model, pckl)

In [None]:
!ls models