In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import optuna
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Relative paths to the train, test and sample-submission CSV files.
train_file_name, test_file_name, sub_file_name = (
    '../input/tabular-playground-series-jul-2021/train.csv',
    '../input/tabular-playground-series-jul-2021/test.csv',
    '../input/tabular-playground-series-jul-2021/sample_submission.csv',
)

In [None]:
# Read both CSVs, using the first column of each file as the row index.
base_train, base_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_file_name, test_file_name)
)

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
X_train_full, X_test_full = base_train.copy(), base_test.copy()

# Data visualisation

In [None]:
# Pairwise correlations between all columns of the training frame.
corr = X_train_full.corr()
# True on and above the diagonal; the heatmap leaves masked cells blank,
# so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, mask=mask, cmap=cmap, ax=ax);

# Pairplot

In [None]:
# Grid of pairwise plots over every column of the training frame.
sns.pairplot(data=X_train_full);

Let's try to make the dependencies more linear:
* Square the 'sensor_2' feature
* Take the logarithm of 'sensor_3'

In [None]:
# Derive the transformed columns from the untouched `base_*` frames rather
# than from the working copies themselves, so re-running this cell does not
# square / log a column that has already been transformed.
for working, raw in ((X_train_full, base_train), (X_test_full, base_test)):
    working['sensor_2'] = np.power(raw['sensor_2'], 2)
    # NOTE(review): np.log gives -inf/NaN for non-positive readings; this
    # assumes every sensor_3 value is > 0 - confirm against the data.
    working['sensor_3'] = np.log(raw['sensor_3'])

Well, that's better. Or not?..

In [None]:
# The two transformed sensors plotted against the benzene target, side by side.
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(10,5))
for panel, sensor in zip(axes, ('sensor_2', 'sensor_3')):
    sns.scatterplot(data=X_train_full, x='target_benzene', y=sensor, ax=panel);

It looks like we have outliers there.

In [None]:
# sensor_3 on the x-axis against temperature and sensor_1, one panel each.
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(10,5))
for panel, y_column in zip(axes, ('deg_C', 'sensor_1')):
    sns.scatterplot(data=X_train_full, x='sensor_3', y=y_column, ax=panel);

In [None]:
# Names of the three columns to predict, in the order used for `targets`.
target_name = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Pull each target out as its own Series before the columns are removed.
y_carbon_monoxide, y_benzene, y_nitrogen_oxides = (
    X_train_full[column] for column in target_name
)
targets = [y_carbon_monoxide, y_benzene, y_nitrogen_oxides]

# Keep only the feature columns in the training frame.
X_train_full = X_train_full.drop(columns=target_name)


In [None]:
# Standardise the features with statistics learned on the training frame only,
# then rebuild DataFrames (default integer index) with the original column names.
columns = X_train_full.columns
scaler = StandardScaler()
scaler.fit(X_train_full)

X_train_full_scaled, X_test_full_scaled = (
    pd.DataFrame(scaler.transform(frame), columns=columns)
    for frame in (X_train_full, X_test_full)
)

# Using t-SNE and agglomerative clustering to detect outliers

In [None]:
from sklearn.manifold import TSNE

# Embed the scaled training features with t-SNE; the seed is fixed so the
# embedding is repeatable between runs.
tsne = TSNE(random_state=17)
X_train_tsne = tsne.fit_transform(X=X_train_full_scaled)

In [None]:
# Scatter of the first two t-SNE coordinates of the training rows.
# No per-point colour values are passed here, so no colormap or colorbar is
# requested: without `c` the colormap is ignored and a colorbar would not
# describe anything in the data.
plt.figure(figsize=(12, 10))
plt.scatter(
    X_train_tsne[:, 0],
    X_train_tsne[:, 1],
    edgecolor="none",
    alpha=0.7,
    s=40,
)
plt.title("t-SNE projection of the scaled training features");

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Split the training embedding into two groups with single-linkage
# agglomerative clustering; `train_anomaly` holds one cluster id per row.
model = AgglomerativeClustering(linkage='single', n_clusters=2)
train_anomaly = model.fit_predict(X=X_train_tsne)

In [None]:
# Same embedding as before, now coloured by the cluster id of each row.
plt.figure(figsize=(12, 10))
plt.scatter(
    X_train_tsne[:, 0],
    X_train_tsne[:, 1],
    c=train_anomaly,
    edgecolor="none",
    alpha=0.7,
    s=40,
    # pyplot.get_cmap replaces the removed matplotlib.cm.get_cmap accessor.
    cmap=plt.get_cmap("nipy_spectral", 10),
)
plt.colorbar()
plt.title("t-SNE projection of the scaled training features, coloured by cluster");

In [None]:
# Cluster ids returned by the clustering step are arbitrary labels, so the
# outlier group is taken to be the smaller of the two clusters instead of
# assuming it was assigned id 1.
train_outlier_label = np.bincount(train_anomaly).argmin()
X_train_full_scaled['anomaly'] = train_anomaly == train_outlier_label

# Pairwise plots of the scaled features, split by the anomaly flag.
sns.set(style="ticks", color_codes=True)
sns.pairplot(X_train_full_scaled, hue = "anomaly");

In [None]:
# Re-fit the same TSNE estimator on the scaled test features; this produces
# an embedding that is independent of the training one.
X_test_tsne = tsne.fit_transform(X=X_test_full_scaled)

In [None]:
# Scatter of the first two t-SNE coordinates of the test rows.
# No per-point colour values are passed here, so no colormap or colorbar is
# requested: without `c` the colormap is ignored and a colorbar would not
# describe anything in the data.
plt.figure(figsize=(12, 10))
plt.scatter(
    X_test_tsne[:, 0],
    X_test_tsne[:, 1],
    edgecolor="none",
    alpha=0.7,
    s=40,
)
plt.title("t-SNE projection of the scaled test features");

In [None]:
# Split the test embedding into two groups with single-linkage
# agglomerative clustering; `test_anomaly` holds one cluster id per row.
model = AgglomerativeClustering(linkage='single', n_clusters=2)
test_anomaly = model.fit_predict(X=X_test_tsne)

In [None]:
# Test embedding coloured by the cluster id of each row.
plt.figure(figsize=(12, 10))
plt.scatter(
    X_test_tsne[:, 0],
    X_test_tsne[:, 1],
    edgecolor="none",
    c=test_anomaly,
    alpha=0.7,
    s=40,
    # pyplot.get_cmap replaces the removed matplotlib.cm.get_cmap accessor.
    cmap=plt.get_cmap("nipy_spectral", 10),
)
plt.colorbar()
plt.title("t-SNE projection of the scaled test features, coloured by cluster");

In [None]:
# Cluster ids returned by the clustering step are arbitrary labels, so the
# outlier group is taken to be the smaller of the two clusters instead of
# assuming it was assigned id 1.
test_outlier_label = np.bincount(test_anomaly).argmin()
X_test_full_scaled['anomaly'] = test_anomaly == test_outlier_label

# Let's add some date features

In [None]:
# Parse the row labels of the unscaled frames as timestamps.
date_train = pd.to_datetime(pd.Series(X_train_full.index))
date_test = pd.to_datetime(pd.Series(X_test_full.index))

# Add one column per calendar component. Plain lists are assigned so the
# values are placed by position rather than aligned on an index.
for part in ('year', 'month', 'day', 'hour'):
    X_train_full_scaled[part] = getattr(date_train.dt, part).to_list()
    X_test_full_scaled[part] = getattr(date_test.dt, part).to_list()

### Optuna

In [None]:
def objective(trial,X,y):
    """Optuna objective: hold-out RMSLE of an XGBoost regressor.

    Splits ``X``/``y`` 80/20 with a fixed seed, fits an ``XGBRegressor`` with
    hyperparameters sampled from ``trial`` (early stopping is monitored on
    the held-out split), clips negative predictions to zero and scores them.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error`` on the held-out split.
    """
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 101)

    param = {
        "n_estimators": trial.suggest_int('n_estimators', 400, 2000),
        # Lower bound is 1, not 0: XGBoost documents max_depth=0 as "no limit
        # on depth", which is not a meaningful point in this search range.
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        # suggest_float(log=True) / suggest_float(step=...) are the current
        # equivalents of suggest_loguniform / suggest_discrete_uniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 10, log=True),
        'gamma': trial.suggest_float('gamma', 0.01, 15, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.003, 0.01, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1, step=0.01),
        'nthread': -1,
    }
    model = XGBRegressor()
    model.set_params(**param)

    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator rather than on fit(); left as-is - confirm against the
    # installed version.
    model.fit(train_X,train_y, early_stopping_rounds=5,
             eval_set=[(test_X, test_y)], verbose=0)

    # The log-error metric is undefined for negative values, so floor at zero.
    y_preds = np.clip(model.predict(test_X), 0, None)
    return np.sqrt(mean_squared_log_error(test_y, y_preds))

In [None]:
%time
best_params = []
for y_temp in targets:
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial : objective(trial, X_train_full_scaled, y_temp), n_trials = 5)
    best_params.append(study.best_params)

### Predictions

In [None]:
# Fit one model per target on the full training frame with the tuned
# hyperparameters and predict the test frame.
predictions = []
for params, y_temp in zip(best_params, targets):
    xgb_model = XGBRegressor(n_jobs=-1)
    xgb_model.set_params(**params)
    xgb_model.fit(X_train_full_scaled, y_temp)
    preds = xgb_model.predict(X_test_full_scaled)
    # Floor at zero, matching the clipping applied when the hyperparameters
    # were scored; negative values are invalid for a log-error metric.
    predictions.append(np.clip(preds, 0, None))

In [None]:
# Load the sample submission and overwrite each target column with the
# predictions stored at the matching position.
sub = pd.read_csv(sub_file_name)

for position, column in enumerate(
    ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
):
    sub[column] = predictions[position]

In [None]:
# Write the submission without the DataFrame index column.
sub.to_csv(path_or_buf='prediction.csv', index=False)