In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn import svm
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, random_state=0):
    """Estimate the mutual information between every column of ``X`` and ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Target values, forwarded unchanged to ``mutual_info_regression``.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression``. The estimator is
        stochastic, so a fixed seed makes repeated calls on the same data
        return the same scores. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by feature name and sorted in
        descending order.
    """
    mi_scores = np.abs(mutual_info_regression(X, y, random_state=random_state))
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` via the pyplot interface.

    ``scores`` is sorted ascending before plotting, one bar is drawn per
    entry, and the y-axis ticks are labelled with the entries' index values.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

# A little bit of Feature Engineering.

In [None]:
# Cross-validation settings and the names of the three target columns.
N_FOLDS, N_REPEATS = 3, 5
TARGET_VARS = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

In [None]:
# All three competition files live in the same input directory.
INPUT_DIR = "../input/tabular-playground-series-jul-2021"
train_data = pd.read_csv(INPUT_DIR + "/train.csv")
test_data = pd.read_csv(INPUT_DIR + "/test.csv")
submission = pd.read_csv(INPUT_DIR + "/sample_submission.csv")

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# All targets together, plus one Series per target for the per-target models.
y_train = train_data[TARGET_VARS]
y_carbon, y_benzene, y_nitrogen = (train_data[col] for col in TARGET_VARS)

In [None]:
target_features = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
features_to_check = ['deg_C', 'relative_humidity', 'absolute_humidity']
# Model inputs: everything except the raw timestamp and the three targets.
X_train = train_data.drop(columns=['date_time'] + target_features)

In [None]:
# Carbon monoxide target against each weather feature, one facet per feature.
carbon_weather_long = train_data.melt(
    id_vars="target_carbon_monoxide", value_vars=features_to_check
)
sns.relplot(
    data=carbon_weather_long,
    x="value",
    y="target_carbon_monoxide",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
# Benzene target against each weather feature, one facet per feature.
benzene_weather_long = train_data.melt(
    id_vars="target_benzene", value_vars=features_to_check
)
sns.relplot(
    data=benzene_weather_long,
    x="value",
    y="target_benzene",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
# Nitrogen oxides target against each weather feature, one facet per feature.
nitrogen_weather_long = train_data.melt(
    id_vars="target_nitrogen_oxides", value_vars=features_to_check
)
sns.relplot(
    data=nitrogen_weather_long,
    x="value",
    y="target_nitrogen_oxides",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
features_sensors = [f'sensor_{idx}' for idx in range(1, 6)]

# Carbon monoxide target against each sensor reading, one facet per sensor.
carbon_sensors_long = train_data.melt(
    id_vars="target_carbon_monoxide", value_vars=features_sensors
)
sns.relplot(
    data=carbon_sensors_long,
    x="value",
    y="target_carbon_monoxide",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
# Benzene target against each sensor reading, one facet per sensor.
benzene_sensors_long = train_data.melt(
    id_vars="target_benzene", value_vars=features_sensors
)
sns.relplot(
    data=benzene_sensors_long,
    x="value",
    y="target_benzene",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
# Nitrogen oxides target against each sensor reading, one facet per sensor.
nitrogen_sensors_long = train_data.melt(
    id_vars="target_nitrogen_oxides", value_vars=features_sensors
)
sns.relplot(
    data=nitrogen_sensors_long,
    x="value",
    y="target_nitrogen_oxides",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
# Pairwise correlation between the three targets.
train_data[target_features].corr()

In [None]:
# Display the feature frame (targets and date_time already dropped).
X_train

In [None]:
# Correlation matrix over the numeric columns only. train_data still holds
# the raw `date_time` strings, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently dropped them), so select
# the numeric columns explicitly to get the same table on every version.
train_data.select_dtypes(include="number").corr()

We see a high level of correlation between the target features and the sensors.

In [None]:
# MI scores for the carbon monoxide target, shown as a one-column table.
make_mi_scores(X_train, y_carbon).to_frame()

In [None]:
# MI scores for the benzene target, shown as a one-column table.
make_mi_scores(X_train, y_benzene).to_frame()

In [None]:
# MI scores for the nitrogen oxides target, shown as a one-column table.
make_mi_scores(X_train, y_nitrogen).to_frame()

In [None]:
# Bar chart of MI scores for the carbon monoxide target.
carbon_mi = make_mi_scores(X_train, y_carbon)
plot_mi_scores(carbon_mi)

In [None]:
# Bar chart of MI scores for the benzene target.
benzene_mi = make_mi_scores(X_train, y_benzene)
plot_mi_scores(benzene_mi)

In [None]:
# Bar chart of MI scores for the nitrogen oxides target.
nitrogen_mi = make_mi_scores(X_train, y_nitrogen)
plot_mi_scores(nitrogen_mi)

It looks like sensor_2 is the most informative feature. What does its distribution look like?

In [None]:
# Distribution of the sensor_2 readings in the training data.
sensor_2_values = train_data['sensor_2']
sns.histplot(sensor_2_values)

In [None]:
# Split the training timestamps into one-column frames.
# Note that `date` holds the month number, not a full date.
date_time = pd.to_datetime(train_data['date_time'])
date = date_time.dt.month.to_frame()
day = date_time.dt.day.to_frame()
time = date_time.dt.hour.to_frame()

In [None]:
def date_time_encoding(f_name, f_itself, max_val, key):
    """Add sin/cos/tan encodings of a date-time component to a global frame.

    The component is scaled to the angle ``2 * pi * f_itself / max_val`` and
    three columns named ``sin_<f_name>``, ``cos_<f_name>`` and ``tan_<f_name>``
    are written in place.

    Parameters
    ----------
    f_name : str
        Suffix used in the new column names.
    f_itself : array-like
        Values of the component (the notebook passes one-column DataFrames).
    max_val : number
        Divisor used to scale the component before multiplying by ``2 * pi``.
    key : str
        ``'test'`` writes to the global ``test_data``, ``'train'`` writes to
        the global ``X_train``; any other value leaves both untouched.

    Returns
    -------
    int
        Always 0.
    """
    if key == 'test':
        frame = test_data
    elif key == 'train':
        frame = X_train
    else:
        return 0

    angle = 2 * np.pi * (f_itself / max_val)
    for prefix, func in (('sin_', np.sin), ('cos_', np.cos), ('tan_', np.tan)):
        frame[prefix + f_name] = func(angle)
    return 0

In [None]:
# Encode hour, month and day-of-month for the training features.
date_time_encoding(f_name='time', f_itself=time, max_val=24, key='train')
date_time_encoding(f_name='date', f_itself=date, max_val=12, key='train')
date_time_encoding(f_name='day', f_itself=day, max_val=31, key='train')

In [None]:
# Display the feature frame again, now including the sin/cos/tan columns.
X_train

In [None]:
# Correlation of every feature (including the new encodings) with the targets.
features_with_targets = X_train.join(train_data[target_features])
features_with_targets.corr()

In [None]:
# MI scores for carbon monoxide, recomputed with the date-time encodings.
carbon_mi_encoded = make_mi_scores(X_train, y_carbon)
plot_mi_scores(carbon_mi_encoded)

In [None]:
# MI scores for benzene, recomputed with the date-time encodings.
benzene_mi_encoded = make_mi_scores(X_train, y_benzene)
plot_mi_scores(benzene_mi_encoded)

In [None]:
# MI scores for nitrogen oxides, recomputed with the date-time encodings.
nitrogen_mi_encoded = make_mi_scores(X_train, y_nitrogen)
plot_mi_scores(nitrogen_mi_encoded)

# OK. Let's train the simplest model (CatBoost).

In [None]:
# One independent CatBoost regressor per target, each with logging silenced.
catb_carbon, catb_benzene, catb_nitrogen = (
    CatBoostRegressor(silent=True) for _ in range(3)
)

In [None]:
#pars = {#'estimator__learning_rate': [.01, .05, .1],
        #'estimator__max_depth': [3, 5],
#        'estimator__subsample': [.5, .75, 1.]
#        'estimator__n_estimators': [500]
#}
#cv_pars = RepeatedKFold(n_splits=N_FOLDS, n_repeats=N_REPEATS)

#Build and initialize CV
#cv_model = MultiOutputRegressor(catb_est)
#crossval = GridSearchCV(cv_model, pars, scoring='neg_mean_squared_error', cv=cv_pars)

In [None]:
%%time
#crossval.fit(X_train, y_train)
#cv_model.fit(X_train, y_train)

In [None]:
%%time
# Fit each regressor on the shared feature frame and its own target.
catb_carbon.fit(X=X_train, y=y_carbon)
catb_benzene.fit(X=X_train, y=y_benzene)
catb_nitrogen.fit(X=X_train, y=y_nitrogen)

In [None]:
# Same timestamp split as for training, now on the test set. This overwrites
# the earlier `date_time`, `date`, `day` and `time` variables.
date_time = pd.to_datetime(test_data['date_time'])
date = date_time.dt.month.to_frame()
day = date_time.dt.day.to_frame()
time = date_time.dt.hour.to_frame()

In [None]:
# Apply the same hour, month and day-of-month encodings to the test features.
date_time_encoding(f_name='time', f_itself=time, max_val=24, key='test')
date_time_encoding(f_name='date', f_itself=date, max_val=12, key='test')
date_time_encoding(f_name='day', f_itself=day, max_val=31, key='test')

In [None]:
# Drop the raw timestamp once and reuse the result for all three models.
test_features = test_data.drop(columns="date_time")
submission['target_carbon_monoxide'] = catb_carbon.predict(test_features)
submission['target_benzene'] = catb_benzene.predict(test_features)
submission['target_nitrogen_oxides'] = catb_nitrogen.predict(test_features)

In [None]:
# Write the predictions to disk without the index column.
submission.to_csv(path_or_buf="catboost_subm.csv", index=False)

In [None]:
# Display the submission frame with the predicted target columns filled in.
submission

# Later I will try some stacking.