# **TPS - July 2021**

## **[Check Here](https://www.kaggle.com/junhyeok99/eda-leaked-data) For Leaked Data EDA !!**

## **Library Import**

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## **DATA LOAD**

In [None]:
# Load the competition training set (it carries the three target columns used later) and display it.
train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
train

In [None]:
# Load the competition test set and display it.
test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
test

In [None]:
# Load the UCI Air Quality workbook and reshape it into the same column layout
# as the competition data so it can later be concatenated with the training set.
l_data = pd.read_excel('../input/air-quality-time-series-data-uci/AirQualityUCI.xlsx')
l_data = l_data.drop(columns=['NMHC(GT)', 'NO2(GT)'])

# Preprocessing Time Column
# Extract the hour for the whole column at once. The previous element-wise
# `l_data['hour'][i] = ...` was a chained assignment (SettingWithCopyWarning,
# and a silent no-op under pandas copy-on-write).
l_data['hour'] = l_data['Time'].map(lambda t: t.hour)

# Whole days elapsed since the first date in the file (as a timedelta Series).
leak_day = l_data['Date'].dt.normalize()
time_se = leak_day - leak_day.min()

# Making DataFrame to concat with train data!
leak = pd.DataFrame({
    'deg_C' : l_data['T'],
    'relative_humidity' : l_data['RH'],
    'absolute_humidity' : l_data['AH'],
    'sensor_1' : l_data['PT08.S1(CO)'],
    'sensor_2' : l_data['PT08.S2(NMHC)'],
    'sensor_3' : l_data['PT08.S3(NOx)'],
    'sensor_4' : l_data['PT08.S4(NO2)'],
    'sensor_5' : l_data['PT08.S5(O3)'],
    'target_carbon_monoxide' : l_data['CO(GT)'],
    'target_benzene' : l_data['C6H6(GT)'],
    'target_nitrogen_oxides' : l_data['NOx(GT)'],
    'year' : l_data['Date'].dt.year,
    'month' : l_data['Date'].dt.month,
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
    # replacement. It returns UInt32, so cast back to a plain integer dtype.
    'week' : l_data['Date'].dt.isocalendar().week.astype('int64'),
    'day' : l_data['Date'].dt.day,
    'dayofweek' : l_data['Date'].dt.dayofweek,
    'time' : time_se.dt.days,
    'hour' : l_data['hour'],
    # 1 for hours 8..20 inclusive, else 0.
    'working_hours' : l_data['hour'].isin(np.arange(8, 21, 1)).astype("int"),
    # 1 for Saturday/Sunday (dayofweek 5 or 6), else 0.
    'is_weekend' : (l_data["Date"].dt.dayofweek >= 5).astype("int")
})
leak

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# Rows keep their original per-file index labels here; the index is reset in a later cell.
all_data = pd.concat([train, test])
all_data

## **Data Preprocessing**

*   All columns are numeric.
*   A regression model (e.g. linear regression) may be a good fit.



### **Datetime Preprocessing**

In [None]:
# Inspect column dtypes and non-null counts before deriving the datetime features.
all_data.info()

In [None]:
# Expand the timestamp into calendar features, then drop the raw column.
all_data['date_time'] = pd.to_datetime(all_data['date_time'])
stamps = all_data['date_time']
calendar_day = stamps.dt.normalize()  # timestamps truncated to midnight

all_data['year'] = stamps.dt.year
all_data['month'] = stamps.dt.month
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the
# replacement. It returns UInt32, so cast back to a plain integer dtype.
all_data['week'] = stamps.dt.isocalendar().week.astype('int64')
all_data['day'] = stamps.dt.day
all_data['dayofweek'] = stamps.dt.dayofweek
# Whole days elapsed since the earliest date in the combined data.
all_data['time'] = (calendar_day - calendar_day.min()).dt.days
all_data['hour'] = stamps.dt.hour
# all_data["is_winter"] = all_data["month"].isin([1, 2, 12])
# all_data["is_spring"] = all_data["month"].isin([3, 4, 5])
# all_data["is_summer"] = all_data["month"].isin([6, 7, 8])
# all_data["is_autumn"] = all_data["month"].isin([9, 10, 11])
# 1 for hours 8..20 inclusive, else 0.
all_data["working_hours"] = all_data["hour"].isin(np.arange(8, 21, 1)).astype("int")
# 1 for Saturday/Sunday (dayofweek 5 or 6), else 0.
all_data["is_weekend"] = (stamps.dt.dayofweek >= 5).astype("int")
all_data = all_data.drop(columns='date_time')
all_data

In [None]:
# Outliers preprocessing: in the first eight columns of `leak`, every cell
# holding the sentinel value -200 is overwritten with the value found in
# `all_data` at the same row label and column.
# NOTE(review): this relies on row i of `leak` and row i of `all_data`
# describing the same observation — confirm the two frames are aligned.
leak = leak.reset_index(drop=True)
all_data = all_data.reset_index(drop=True)

sentinel_cols = leak.columns[:8]
out_index = [leak.index[leak[name] == -200] for name in sentinel_cols]

for name, rows in zip(sentinel_cols, out_index):
    leak.loc[rows, name] = all_data.loc[rows, name]

print('done!')

In [None]:
# Store dayofweek as object dtype in both frames so it is treated as a
# categorical label rather than a number by the encoding step.
# (month_c / hour_c variants were tried and are currently disabled:)
# all_data['month_c'] = all_data['month'].astype(object)
# all_data['hour_c'] = all_data['hour'].astype(object)
for frame in (all_data, leak):
    frame['dayofweek'] = frame['dayofweek'].astype(object)

In [None]:
# One-hot encode the non-numeric columns of each frame (dayofweek was cast to object above).
# NOTE(review): the two frames are encoded independently, so their dummy columns
# only line up if both contain the same set of dayofweek values — confirm.
all_data = pd.get_dummies(all_data)
leak = pd.get_dummies(leak)

In [None]:
def add_humidity_features(frame):
    """Add the 'SMC' and 'Dew_Point' columns to `frame` in place and return it.

    SMC is absolute_humidity * 100 / relative_humidity.
    Dew_Point is 243.12 * g / (17.62 - g), where
    g = ln(relative_humidity * 0.01) + 17.62 * deg_C / (243.12 + deg_C).
    """
    rel_hum = frame['relative_humidity']
    temp_c = frame['deg_C']
    # Shared term that appears in both numerator and denominator of the dew point.
    gamma = np.log(rel_hum * 0.01) + (17.62 * temp_c) / (243.12 + temp_c)

    frame['SMC'] = (frame['absolute_humidity'] * 100) / rel_hum
    frame['Dew_Point'] = 243.12 * gamma / (17.62 - gamma)
    return frame


all_data = add_humidity_features(all_data)
leak = add_humidity_features(leak)

In [None]:
# Undo the earlier train/test stacking: the first len(train) rows are the
# training rows, the remainder are the test rows (re-indexed from 0).
n_train_rows = len(train)
test2 = all_data.iloc[n_train_rows:].reset_index(drop=True)

# Extend the training rows with the leak rows under a fresh 0..n-1 index.
train2 = pd.concat([all_data.iloc[:n_train_rows], leak], ignore_index=True)

### **Scaling**

#### **Log Scaling - Target values are skewed**

#### **Scaler**

In [None]:
def log_scaling(col):
    """Return log(1 + col), computed element-wise with numpy's log1p."""
    return np.log1p(col)

In [None]:
cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
# Replace every target column of train2 with its log1p transform.
# NOTE(review): the outlier cell only cleans the first eight columns of `leak`,
# so any -200 sentinel still present in these target columns turns into NaN
# here — confirm that is intended.
train2[cols] = train2[cols].apply(log_scaling)

#### **Compare with Visualization**

In [None]:
# One row per target: left panel is the raw target from `train`,
# right panel is the log-scaled target from `train2` (train rows plus leak rows).
fig, ax = plt.subplots(len(cols), 2, figsize=(12,12))
for row, target_name in enumerate(cols):
    sns.histplot(train[target_name], ax=ax[row, 0]);
    sns.histplot(train2[target_name], ax=ax[row, 1]);

fig.tight_layout()
plt.show()

### **Split DataSets**

In [None]:
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']


def _without_other_targets(keep):
    """Return train2 with every target column dropped except `keep`."""
    return train2.drop(columns=[name for name in target_cols if name != keep])


# Feature-only views of the train and test rows.
train_3 = train2.drop(columns=target_cols)
test_3 = test2.drop(columns=target_cols)

# One training frame per target: all features plus that single target column.
train_co = _without_other_targets('target_carbon_monoxide')
train_be = _without_other_targets('target_benzene')
train_no = _without_other_targets('target_nitrogen_oxides')

## **Modeling**

### **Pycaret**

In [None]:
!pip install pycaret

In [None]:
from pycaret.regression import setup, compare_models, blend_models, finalize_model, predict_model, ensemble_model, create_model

#### **Model**

In [None]:
def pycaret_model(train, target, test, n_select, fold, opt, exclude, session_id=None):
    """Fit a blended PyCaret regressor for one target and predict on `test`.

    Steps: set up the experiment (normalised features, mean imputation),
    compare candidate models, blend the top `n_select`, finalize the blend
    and predict.

    Parameters
    ----------
    train : DataFrame holding the features and the `target` column.
    target : name of the column in `train` to predict.
    test : DataFrame of features to predict on.
    n_select : number of top models returned by compare_models and blended.
    fold : number of cross-validation folds for comparing and blending.
    opt : metric name used to sort the compared models and optimise the blend.
    exclude : list of model ids left out of the comparison.
    session_id : optional seed forwarded to setup() so a run can be reproduced;
        None (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    The 'Label' column of the prediction frame produced for `test`.
    """
    print('Setup Your Data....')
    setup(data=train,
          target=target,
          normalize=True,
          numeric_imputation='mean',
          silent=True,
          session_id=session_id)

    print('Comparing Models....')
    best = compare_models(sort=opt, n_select=n_select, fold=fold, exclude=exclude)

    print('Blending Models....')
    blended = blend_models(estimator_list=best, fold=fold, optimize=opt)
    # Called without data so the blend is evaluated before finalizing;
    # the returned frame itself is not needed.
    predict_model(blended)

    print('Finallizing Models....')
    final_model = finalize_model(blended)

    print('Done...!!!')

    pred_esb = predict_model(final_model, test)
    return pred_esb['Label']

#### **Predict Result**

In [None]:
sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

# (target column, training frame for that target, model ids excluded from comparison)
prediction_jobs = [
    ('target_carbon_monoxide', train_co, ['knn', 'xgboost']),
    ('target_benzene', train_be, ['knn', 'xgboost']),
    ('target_nitrogen_oxides', train_no, ['xgboost']),
]

for target_name, target_frame, excluded in prediction_jobs:
    log_pred = pycaret_model(target_frame, target_name, test_3, 3, 3, 'RMSLE', excluded)
    # The models were trained on log1p(target); expm1 is its exact inverse
    # (and is more accurate than exp(x) - 1 for values near zero).
    sub[target_name] = np.expm1(log_pred)

In [None]:
# Display the model-only predictions (before the leak values are merged in below).
sub

In [None]:
# Re-read the raw UCI workbook and keep the rows from position 7110 onward, re-indexed from 0.
# NOTE(review): 7110 is presumably the offset at which the test period starts in
# the UCI file, so that row i here matches row i of `sub` — verify against the data.
leak_sub = pd.read_excel('../input/air-quality-time-series-data-uci/AirQualityUCI.xlsx')[7110:].reset_index(drop = True)
leak_sub

In [None]:
# Column in the UCI sheet -> matching submission column.
leak_to_target = {
    'CO(GT)': 'target_carbon_monoxide',
    'C6H6(GT)': 'target_benzene',
    'NOx(GT)': 'target_nitrogen_oxides',
}

for leak_col, target_col in leak_to_target.items():
    # Rows where the sheet holds the -200 sentinel get the model prediction instead.
    missing_rows = leak_sub.index[leak_sub[leak_col] == -200]
    leak_sub.loc[missing_rows, leak_col] = sub.loc[missing_rows, target_col]
    # The submission then takes the (patched) sheet values for every row.
    sub[target_col] = leak_sub[leak_col]

sub

In [None]:
# Write the submission file without the index column.
sub.to_csv('sub.csv', index=False)