In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedKFold
import warnings
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
# Matplotlib 3.6 renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# later releases dropped the old name, so prefer the new name when it is
# available and fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
csv_paths = {
    'train': '../input/tabular-playground-series-jul-2021/train.csv',
    'test': '../input/tabular-playground-series-jul-2021/test.csv',
}
train_data = pd.read_csv(csv_paths['train'])
test_data = pd.read_csv(csv_paths['test'])

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Preview the first rows of the test table.
test_data.head()

In [None]:
# Report the row and column counts of both tables. The column count is taken
# straight from .shape, so it covers every column present in the frame at this
# point, not only the model inputs.
for split_name, frame in (('train', train_data), ('test', test_data)):
    n_rows, n_cols = frame.shape
    print('Number of %s samples %d' % (split_name, n_rows))
    print('Number of %s features %d' % (split_name, n_cols))

In [None]:
# Parse the 'date_time' column of each table into datetimes and promote it to
# the index, so every remaining column is a measurement (or a target).
train_data = (
    train_data
    .assign(date_time=pd.to_datetime(train_data['date_time']))
    .set_index('date_time')
)
test_data = (
    test_data
    .assign(date_time=pd.to_datetime(test_data['date_time']))
    .set_index('date_time')
)

Let's visualize a few samples of the data

In [None]:
# Plot every column over the first 30 rows, then anchor the legend at axes
# coordinates (1.0, 1.0).
first_rows = train_data.iloc[:30, :]
ax = first_rows.plot(figsize=(30, 30), title='Time_Series_Plot')
ax.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Features are the columns the test table provides; targets are the training
# columns that the test table lacks.
feature_columns = list(test_data.columns)
train_features = train_data[feature_columns]

# Subtract the test columns from the *train* columns (subtracting the test
# columns from themselves always yields an empty selection). A list
# comprehension is used instead of a set difference so the original column
# order is preserved and the result is deterministic.
target_columns = [col for col in train_data.columns if col not in feature_columns]
train_targets = train_data[target_columns]

 # Count of NA Values

In [None]:
# Tally the missing values in each feature column and print the result.
nan_counts_by_column = train_features.isnull().sum()
print(" \nCount total NaN at each column in a DataFrame : \n\n", nan_counts_by_column)

# Correlation HeatMap Between Features

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns
# (features and targets alike).
corr_matrix = train_data.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, cmap='terrain', annot=True)

* The absolute correlation between sensor_2 and sensor_3 is high (> 0.8)
* The absolute correlations between sensor_3 and sensor_4, and between sensor_3 and sensor_5, are moderately high (0.71 and 0.74 respectively)
* target_carbon_monoxide is highly correlated with sensor_5, sensor_2, sensor_1
* target_benzene is highly correlated with sensor_5, sensor_2, sensor_1; moderately correlated with sensor_3, sensor_4
* target_nitrogen_oxides is moderately correlated with sensor_5





# Skewness Barplot

In [None]:
# Compute the skewness of every training column (kept as a one-row frame) and
# draw one bar per column.
skewness_data = train_data.skew(axis=0).to_frame().T
column_labels = skewness_data.columns
skew_values = skewness_data.to_numpy().ravel()
plt.figure(figsize=(10, 10))
sns.barplot(x=skew_values, y=column_labels)


* sensor_3 has a skewness value > 1 among features
* all targets have skewness values > 1

In [None]:
# Distribution of sensor_3. `distplot` is deprecated in seaborn; a density-scaled
# `histplot` with a KDE overlay is its replacement (cut=3 lets the KDE curve
# extend past the data range, as distplot's did).
sns.histplot(x=train_features['sensor_3'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of target_carbon_monoxide. `distplot` is deprecated in seaborn; a
# density-scaled `histplot` with a KDE overlay is its replacement (cut=3 lets
# the KDE curve extend past the data range, as distplot's did).
sns.histplot(x=train_data['target_carbon_monoxide'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of target_benzene. `distplot` is deprecated in seaborn; a
# density-scaled `histplot` with a KDE overlay is its replacement (cut=3 lets
# the KDE curve extend past the data range, as distplot's did).
sns.histplot(x=train_data['target_benzene'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of target_nitrogen_oxides. `distplot` is deprecated in seaborn; a
# density-scaled `histplot` with a KDE overlay is its replacement (cut=3 lets
# the KDE curve extend past the data range, as distplot's did).
sns.histplot(x=train_data['target_nitrogen_oxides'], kde=True, stat='density', kde_kws={'cut': 3})

# Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor of each feature, computed against all the others.
# NOTE(review): the design matrix is the raw feature table with no intercept
# column; VIFs computed this way can be inflated for uncentered features —
# confirm whether a constant should be included.
train_x = train_features
design_matrix = train_x.values
vif_scores = []
for col_idx in tqdm_notebook(range(len(train_x.columns))):
    vif_scores.append(variance_inflation_factor(design_matrix, col_idx))

vif_data = pd.DataFrame({"feature": train_x.columns, "VIF": vif_scores})

In [None]:
# Display the per-feature VIF table.
vif_data

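* Surprisingly, all the VIF values are very high, suggesting strong multicollinearity. Caveat: the VIFs above are computed on the raw feature matrix without an intercept column, which can inflate them, so the magnitudes should be read with caution.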

# Train and Test Boxplots

In [None]:
# Box plot of every feature column in the training table.
train_feature_names = list(train_features.columns)
train_data.boxplot(figsize=(15, 15), column=train_feature_names)

In [None]:
# Box plot of the same feature columns in the test table, for comparison with
# the training distributions.
test_feature_names = list(train_features.columns)
test_data.boxplot(figsize=(15, 15), column=test_feature_names)

# XGBoost

In [None]:

from xgboost import XGBRegressor

from sklearn.metrics import accuracy_score
import pickle

# train_features.reset_index(inplace = True)
# train_data.reset_index(inplace = True)
# Pull the three target columns out of the training table.
target_names = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
target = train_data[target_names]


In [None]:

def RLMSE(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        The error over all elements.

    Notes
    -----
    Inputs are converted to plain float arrays and compared position by
    position. Any pandas index is ignored, so two Series with different
    indexes are not aligned label-by-label into an all-NaN result.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    log_diff = np.log1p(predicted) - np.log1p(true_values)
    return np.sqrt(np.mean(np.square(log_diff)))

In [None]:
from math import floor
from sklearn.metrics import mean_squared_error

# Chronological 80/20 split: the first 80% of rows are used for fitting and the
# remaining 20% for validation (no shuffling). Features and targets come from
# the same table, so one split position serves both.
split_at = floor(0.8 * len(train_features))
x_train, x_val = train_features[:split_at], train_features[split_at:]
y_train, y_val = target[:split_at], target[split_at:]

# Fit one regressor per target column, score it on the validation rows and
# persist it to disk for the inference cell.
for tar in y_train.columns:
    print(f'Target = {tar}')
    # A fresh estimator per target keeps each fit independent of the previous one.
    model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
    # constructor rather than on fit() — confirm against the installed version.
    model.fit(x_train, y_train[tar], eval_set=[(x_val, y_val[tar])], early_stopping_rounds = 10, verbose=10)
    y_pred = model.predict(x_val)
    mse = mean_squared_error(y_val[tar], y_pred)
    rlmse = RLMSE(y_val[tar], y_pred)

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open(f"xg_model_{tar}.pickle.dat", "wb") as model_file:
        pickle.dump(model, model_file)
    print(f'MSE={mse}, RLMSE={rlmse}')


In [None]:
# Collect the XGBoost test predictions, one column per target.
xg_df = pd.DataFrame(columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])

for tar in xg_df.columns:
    # Reload the model saved for this target. The context manager closes the
    # file handle promptly. NOTE: unpickling can execute arbitrary code, so only
    # load files that this notebook itself produced.
    with open(f"xg_model_{tar}.pickle.dat", "rb") as model_file:
        loaded_model = pickle.load(model_file)
    pr = loaded_model.predict(test_data)

    # Bar chart of the model's feature importances for this target.
    feat_imp = loaded_model.feature_importances_
    plt.figure(figsize = (5,5))
    ax = sns.barplot(y = test_data.columns, x = feat_imp)
    ax.set_title(f'Feature Importance for Target = {tar}')

    xg_df[tar] = pr



* sensor_2 is the most important feature for the first two targets (target_carbon_monoxide and target_benzene), while sensor_3 is the most important feature for the last one (target_nitrogen_oxides)

# Prophet

In [None]:
!pip install fbprophet

In [None]:
from fbprophet import Prophet
from copy import deepcopy
import fbprophet
import json
from fbprophet.serialize import model_to_json, model_from_json

In [None]:
# Fit one Prophet model per target, with every feature column registered as an
# additive extra regressor, and save each fitted model as JSON.
regressor_names = list(train_features.columns)

for tar in target.columns:
    # Prophet's training frame: the features plus the current target as 'y',
    # with the timestamp index moved into a 'ds' column.
    train_prophet = (
        deepcopy(train_features)
        .assign(y=target[tar])
        .reset_index()
        .rename(columns={'date_time': 'ds'})
    )

    model = Prophet(seasonality_mode='additive')
    for regressor in regressor_names:
        model.add_regressor(regressor, mode='additive')
    model.fit(train_prophet)

    # Save model
    with open(f'model_{tar}.json', 'w') as fout:
        json.dump(model_to_json(model), fout)

    

In [None]:
# Collect the Prophet test forecasts, one column per target.
prophet_df = pd.DataFrame(columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])

for tar in prophet_df.columns:
    # Restore the model that was saved for this target.
    with open(f'model_{tar}.json', 'r') as fin:
        loaded_model = model_from_json(json.load(fin))

    # Same layout as the training frame: timestamp index moved into a 'ds' column.
    test_prophet = (
        deepcopy(test_data)
        .reset_index()
        .rename(columns={'date_time': 'ds'})
    )

    forecast = loaded_model.predict(test_prophet)
    prophet_df[tar] = forecast['yhat']


    
    
    



In [None]:
# Preview the first rows of the Prophet predictions.
prophet_df.head()

# CatBoost

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Chronological split: the first 80% of rows for fitting, the rest for validation.
n_fit_features = floor(0.8 * len(train_features))
n_fit_targets = floor(0.8 * len(target))
x_train, x_val = train_features.iloc[:n_fit_features], train_features.iloc[n_fit_features:]
y_train, y_val = target.iloc[:n_fit_targets], target.iloc[n_fit_targets:]


In [None]:
# Collect the CatBoost test predictions, one column per target.
cat_df = pd.DataFrame(columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])

for tar in y_train.columns:
    print(f'Target = {tar}')

    # Wrap the split data in CatBoost pools labelled with the current target.
    train_pool = Pool(x_train, label=y_train[tar])
    val_pool = Pool(x_val, label=y_val[tar])

    # RMSE is both the training objective and the evaluation metric; the
    # validation pool is passed as the eval set.
    catb_model = CatBoostRegressor(
        objective='RMSE',
        eval_metric='RMSE',
        iterations=2000,
        early_stopping_rounds=20,
    )
    catb_model.fit(train_pool, eval_set=val_pool, verbose=500, plot=True)

    # Score on the validation rows.
    y_pred = catb_model.predict(val_pool)
    mse = mean_squared_error(y_val[tar], y_pred)
    rlmse = RLMSE(y_val[tar].values, y_pred)

    catb_model.save_model(f"catb_model_{tar}.bin")
    print(f'Val MSE={mse}, Val RLMSE={rlmse}')

    test_pool = Pool(test_data)

    # Loss-change feature importances as a two-column array:
    # column 0 is plotted on the y axis, column 1 on the x axis.
    importance_table = catb_model.get_feature_importance(
        data=train_pool, type='LossFunctionChange', prettified=True
    )
    feat_imp = np.array(importance_table)
    pr = catb_model.predict(test_pool)

    plt.figure(figsize=(7, 7))
    importance_labels, importance_scores = feat_imp[:, 0], feat_imp[:, 1]
    ax = sns.barplot(y=importance_labels, x=importance_scores)
    ax.set_title(f'Feature Importance for Target : {tar}')

    cat_df[tar] = pr
    


In [None]:
# Preview the first rows of the CatBoost predictions.
cat_df.head()

# Ensemble

In [None]:
# Blend the three models with an equal-weight average of their predictions.
# The frames are added element-wise (aligned on row position and column name),
# so no empty placeholder frame is needed beforehand.
submission_df = (xg_df + cat_df + prophet_df) / 3

# Label each prediction row with the corresponding test timestamp.
submission_df = submission_df.set_index(test_data.index)

In [None]:
# Preview the first rows of the blended predictions.
submission_df.head()

In [None]:
# Write the blended predictions to 'submission.csv'; the frame's index is
# written as the first column.
submission_df.to_csv('submission.csv')