In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# Default figure size (in inches) for every plot that does not set its own.
sns.set(rc={"figure.figsize": (18, 16)})

import matplotlib.pyplot as plt

In [None]:
# When True, the feature cells add the boolean season flags
# (is_winter / is_sprint / is_summer / is_autumn) derived from the month.
include_seasons = False
# When True, the original Air Quality rows (new_df) are appended to the
# competition training rows before the targets are log-scaled.
Use_orig_data = True

## Let's first load the dataset provided to us in the TPS July 2021 competition

In [None]:
# Competition training data, indexed by the parsed `date_time` column.
df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jul-2021/train.csv",
    index_col="date_time",
    parse_dates=True,
)
df.head()

# The Original dataset - Air Quality  -  [Link to dataset](https://www.kaggle.com/amritpal333/tps-july-2021-original-data)



The dataset contains 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located in the field in a significantly polluted area, at road level, within an Italian city. Data were recorded from March 2004 to February 2005 (one year), representing the longest freely available recordings of on-field deployed air quality chemical sensor device responses. Ground-truth hourly averaged concentrations for CO, Non-Methane Hydrocarbons (NMHC), Benzene, Total Nitrogen Oxides (NOx) and Nitrogen Dioxide (NO2) were provided by a co-located reference certified analyzer. Evidence of cross-sensitivities as well as both concept and sensor drifts is present, as described in De Vito et al., Sens. And Act. B, Vol. 129, 2, 2008 (citation required), eventually affecting the sensors' concentration estimation capabilities.

**Missing values are tagged with -200 value**
This dataset can be used exclusively for research purposes. Commercial purposes are fully excluded.

### Attribute Information:

- 0 Date (DD/MM/YYYY)
- 1 Time (HH.MM.SS)
- 2 True hourly averaged concentration CO in mg/m^3 (reference analyzer)
- 3 PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
- 4 True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 (reference analyzer)
- 5 True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
- 6 PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
- 7 True hourly averaged NOx concentration in ppb (reference analyzer)
- 8 PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
- 9 True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
- 10 PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
- 11 PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
- 12 Temperature in °C
- 13 Relative Humidity (%)
- 14 AH Absolute Humidity 

In [None]:
# Original Air Quality data, reshaped below into the competition's column layout.
mdf = pd.read_csv('../input/air-quality-tps-july-data/AirQualityUCI_with_missing_data.csv')

# Original column name -> competition column name.
# Date, Time, NMHC(GT) and NO2(GT) have no competition counterpart and are not copied.
orig_to_tps = {
    'CO(GT)': 'target_carbon_monoxide',
    'C6H6(GT)': 'target_benzene',
    'NOx(GT)': 'target_nitrogen_oxides',
    'PT08.S1(CO)': 'sensor_1',      # Carbon monoxide
    'PT08.S2(NMHC)': 'sensor_2',    # Benzene
    'PT08.S3(NOx)': 'sensor_3',
    'PT08.S4(NO2)': 'sensor_4',
    'PT08.S5(O3)': 'sensor_5',
    'T': 'deg_C',
    'RH': 'relative_humidity',
    'AH': 'absolute_humidity',
}

# Start from the competition's column order so both frames line up side by side.
new_df = pd.DataFrame(columns = df.columns)
for source_col, tps_col in orig_to_tps.items():
    new_df[tps_col] = mdf[source_col]
new_df['deg_C'] = new_df['deg_C'].astype("float")

# The dataset tags missing values with -200; turn the sentinel into NaN so it
# does not distort the statistics, plots and derived features computed later.
new_df = new_df.replace(-200, np.nan)

new_df.head()

# Distribution of both datasets

In [None]:
# Palette reused by the styled summary tables.
my_colors = [
    '#DC143C', '#FF1493', '#FF7F50', '#FFD700', '#32CD32',
    '#4ddbff', '#1E90FF', '#663399', '#708090',
]

# Re-read the competition data without a datetime index and discard the
# first column (presumably date_time) so only the remaining columns are summarised.
df = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/train.csv", parse_dates=True)
df = df.iloc[:, 1:]

# Transposed summary: gradient on the counts, bars on mean / median / std.
(
    df.describe().T.style
    .background_gradient(subset=['count'], cmap='viridis')
    .bar(subset=['mean', '50%'], color=my_colors[6])
    .bar(subset=['std'], color=my_colors[0])
)

In [None]:
# Same styled summary for the original Air Quality data.
(
    new_df.describe().T.style
    .background_gradient(subset=['count'], cmap='viridis')
    .bar(subset=['mean', '50%'], color=my_colors[6])
    .bar(subset=['std'], color=my_colors[0])
)

### As we can see, both of them have similar features!

# Histogram plots

In [None]:
# One histogram per column of the competition data on a 4-wide grid.
n_grid_cols = 4
# np.ceil returns a float, but add_subplot requires integer grid dimensions.
n_grid_rows = int(np.ceil(len(df.columns) / n_grid_cols))

fig = plt.figure(figsize = (20, 15))
fig.suptitle('TPS July 2021 - Provided data', size = 25, weight = 'bold')
for idx, i in enumerate(df.columns):
    fig.add_subplot(n_grid_rows, n_grid_cols, idx+1)
    df.iloc[:, idx].hist(bins = 20)
    plt.title(i)
plt.show()

In [None]:
# One histogram per column of the original data on a 4-wide grid.
n_grid_cols = 4
# np.ceil returns a float, but add_subplot requires integer grid dimensions.
n_grid_rows = int(np.ceil(len(new_df.columns) / n_grid_cols))

fig = plt.figure(figsize = (20, 15))
fig.suptitle('Air quality dataset - Original data', size = 25, weight = 'bold')
for idx, i in enumerate(new_df.columns):
    fig.add_subplot(n_grid_rows, n_grid_cols, idx+1)
    new_df.iloc[:, idx].hist(bins = 20)
    plt.title(i)
plt.show()

### Yet again we find that both the datasets have exactly the same distribution.

# Heatmaps

In [None]:
# Annotated correlation matrix of the competition data.
sns.heatmap(data=df.corr(), annot=True)

In [None]:
# Annotated correlation matrix of the original data.
sns.heatmap(data=new_df.corr(), annot=True)

# Feature engineering

In [None]:
# Switch seaborn to its "dark" style for the plots that follow.
sns.set_theme(style="dark")

In [None]:
# Reload the competition data with a DatetimeIndex, then derive calendar and
# humidity features from the index and the weather columns.
df = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jul-2021/train.csv",
    index_col="date_time",
    parse_dates=True,
)

df['dayoftheweek'] = df.index.dayofweek  # weekly seasonality
df['hourofday'] = df.index.hour          # daily seasonality

# Whole days elapsed since the earliest date in the index.
df['time'] = df.index.date - df.index.date.min()
df['time'] = df['time'].apply(lambda delta: delta.days)

# Month is only used here to derive the optional season flags.
df['month'] = df.index.month.astype("int")

df["is_weekend"] = (df.index.dayofweek >= 5).astype("int")

if include_seasons:
    season_months = {
        "is_winter": [1, 2, 12],
        "is_sprint": [3, 4, 5],
        "is_summer": [6, 7, 8],
        "is_autumn": [9, 10, 11],
    }
    for season_flag, months in season_months.items():
        df[season_flag] = df["month"].isin(months)

# Hour-of-day windows (both bounds inclusive).
df["working_hours"] = df["hourofday"].between(8, 20).astype("int")
df["morning_peak_hour"] = df["hourofday"].between(7, 9).astype("int")
df["evening_peak_hour"] = df["hourofday"].between(17, 20).astype("int")

df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']

# Dew point from temperature and relative humidity; the shared log/ratio term
# is computed once and used in both numerator and denominator.
_gamma = np.log(df['relative_humidity'] * 0.01) + (17.62 * df['deg_C']) / (243.12 + df['deg_C'])
df['Dew_Point'] = 243.12 * _gamma / (17.62 - _gamma)

df.to_csv('modefied_train.csv')

df.tail()

In [None]:
#New features

# NOTE(review): the dataset description lists Date as DD/MM/YYYY — confirm that
# pd.to_datetime reads this file's Date/Time strings as intended.
new_df['date_time'] = pd.to_datetime(mdf['Date'] + ' ' + mdf['Time'])

new_df['dayoftheweek'] = new_df['date_time'].dt.dayofweek  # weekly seasonality
new_df['hourofday'] = new_df['date_time'].dt.hour          # daily seasonality

# Whole days elapsed since the earliest date in the data.
new_df['time'] = new_df['date_time'].dt.date - new_df['date_time'].dt.date.min()
new_df['time'] = new_df['time'].apply(lambda delta: delta.days)

# Month is only used here to derive the optional season flags.
new_df['month'] = new_df['date_time'].dt.month.astype("int")

new_df["is_weekend"] = (new_df['date_time'].dt.dayofweek >= 5).astype("int")
if include_seasons:
    new_df["is_winter"] = new_df["month"].isin([1, 2, 12])
    new_df["is_sprint"] = new_df["month"].isin([3, 4, 5])
    new_df["is_summer"] = new_df["month"].isin([6, 7, 8])
    new_df["is_autumn"] = new_df["month"].isin([9, 10, 11])

new_df["morning_peak_hour"] = new_df["hourofday"].isin(np.arange(7, 10, 1)).astype("int")
new_df["evening_peak_hour"] = new_df["hourofday"].isin(np.arange(17, 21, 1)).astype("int")

new_df['SMC'] = (new_df['absolute_humidity'] * 100) / new_df['relative_humidity']

# Cast to int like the competition and test frames do, so the column keeps a
# single dtype once the frames are concatenated.
new_df["working_hours"] = new_df["hourofday"].isin(np.arange(8, 21, 1)).astype("int")

# Dew point from temperature and relative humidity.
_gamma = (np.log(new_df['relative_humidity'] * 0.01)
          + (17.62 * new_df['deg_C']) / (243.12 + new_df['deg_C']))
new_df['Dew_Point'] = 243.12 * _gamma / (17.62 - _gamma)

new_df.to_csv('Orignal_data_new_features_added.csv')
new_df.head()

In [None]:
def Plot_diff_log_versions( y_data , hue_data=None):
    """Scatter each sensor column against several transforms of a target.

    For every sensor column of the notebook-level ``df`` one figure with five
    panels is drawn: the raw target, log, log1p, sqrt and log(sqrt).

    Parameters
    ----------
    y_data : values plotted on the y axis (transformed per panel).
    hue_data : optional grouping passed to ``sns.scatterplot`` as ``hue``.
        Defaults to ``None`` (no grouping) so the function can be called with
        the target alone.
    """
    sensor_data = ['sensor_1' , 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']

    # (title suffix, transform) per panel; suffixes are kept verbatim.
    panels = [
        (' Target Normal', lambda values: values),
        ('  Target Log', np.log),
        (' Target Log1p', np.log1p),
        (' Target sqrt', np.sqrt),
        (' Target sqrt + Log', lambda values: np.log(np.sqrt(values))),
    ]
    total_plots = len(panels)

    for x_col in sensor_data:
        plt.figure(figsize = (20,4))
        x = df[x_col]

        for position, (suffix, transform) in enumerate(panels, start=1):
            plt.subplot(1, total_plots, position)
            sns.scatterplot(x = x , y = transform(y_data) , hue = hue_data)
            plt.title(x_col + suffix)
    

In [None]:
y_data = df['target_benzene']
# No hue grouping for this overview; hue_data is passed explicitly.
Plot_diff_log_versions(y_data, hue_data=None)

In [None]:
def Plot_diff_versions_and_sub_versions( y_data1 , y_data2 , hue_data):
    """Scatter each sensor column against transforms of two target subsets.

    For every sensor column of the notebook-level ``df`` one figure with ten
    panels is drawn: panels 1-5 show ``y_data1`` (raw, log, log1p, sqrt,
    log(sqrt)) and panels 6-10 show the same transforms of ``y_data2``.
    ``hue_data`` is forwarded to ``sns.scatterplot`` as ``hue``.
    """
    sensor_data = ['sensor_1' , 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']

    # (title suffix, transform) per panel; suffixes are kept verbatim.
    panels = [
        (' Target Normal', lambda values: values),
        ('  Target Log', np.log),
        (' Target Log1p', np.log1p),
        (' Target sqrt', np.sqrt),
        (' Target sqrt + Log', lambda values: np.log(np.sqrt(values))),
    ]
    total_plots = 10

    for x_col in sensor_data:
        plt.figure(figsize = (40,4))
        x = df[x_col]

        position = 0
        for y_data in (y_data1, y_data2):
            for suffix, transform in panels:
                position += 1
                plt.subplot(1, total_plots, position)
                sns.scatterplot(x = x , y = transform(y_data) , hue = hue_data)
                plt.title(x_col + suffix)
    

In [None]:
# Benzene target split by the working-hours flag, which also colours the points.
hue_data = df['working_hours']
y_data1 = df.loc[hue_data == 0, 'target_benzene']
y_data2 = df.loc[hue_data == 1, 'target_benzene']

Plot_diff_versions_and_sub_versions(y_data1, y_data2, hue_data)

In [None]:
y_data = df['target_carbon_monoxide']
# No hue grouping for this overview; hue_data is passed explicitly.
Plot_diff_log_versions(y_data, hue_data=None)

In [None]:
# Show the column labels currently present in df.
df.columns

In [None]:
# Reload the competition data and rebuild every engineered feature.
# All features named in the training feature list are created here: `month`
# is read by the optional season flags, and the rest must exist on the
# competition rows so they are not missing when training starts.
df = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/train.csv", index_col=  'date_time' , parse_dates=True)

df['dayoftheweek'] = df.index.dayofweek #Weekly Seasonality
df['hourofday'] = df.index.hour #Daily Seasonality

# Whole days elapsed since the earliest date in the index.
df['time'] = df.index.date - df.index.date.min()
df['time'] = df['time'].apply(lambda x : x.days)

#Features that are useless but needed to find other features
df['month'] = df.index.month.astype("int")

df["is_weekend"] = (df.index.dayofweek >= 5).astype("int")

if include_seasons == True:
    df["is_winter"] = df["month"].isin([1, 2, 12])
    df["is_sprint"] = df["month"].isin([3, 4, 5])
    df["is_summer"] = df["month"].isin([6, 7, 8])
    df["is_autumn"] = df["month"].isin([9, 10, 11])


df["working_hours"] =  df["hourofday"].isin(np.arange(8, 21, 1)).astype("int")

df["morning_peak_hour"] =  df["hourofday"].isin(np.arange(7, 10, 1)).astype("int")
df["evening_peak_hour"] =  df["hourofday"].isin(np.arange(17, 21, 1)).astype("int")

df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']
df['Dew_Point'] = 243.12*(np.log(df['relative_humidity'] * 0.01) + 
                          (17.62 * df['deg_C'])/(243.12+df['deg_C']))/(17.62-(np.log(df['relative_humidity'] * 0.01)+17.62*df['deg_C']/(243.12+df['deg_C'])))

df.head()


In [None]:
# The target list is needed by the pair plots, so define it before first use.
target = ['target_carbon_monoxide','target_benzene', 'target_nitrogen_oxides']

sns.pairplot(df ,  y_vars = target , hue='hourofday')

In [None]:

# Targets against every other numeric column, coloured by the working-hours flag.
sns.pairplot(data=df, y_vars=target, hue='working_hours')

In [None]:
# Targets against every other numeric column, coloured by the weekend flag.
sns.pairplot(data=df, y_vars=target, hue='is_weekend')

In [None]:
target = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Model inputs. An earlier variant of this list left out 'time', 'month' and
# the two peak-hour flags; the targets and date_time are deliberately excluded.
train_features = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'dayoftheweek', 'hourofday', 'time', 'month', 'is_weekend',
    'working_hours', 'morning_peak_hour', 'evening_peak_hour',
    'SMC', 'Dew_Point',
]

# Keep only the model inputs followed by the three targets.
selected_columns = train_features + target
new_df = new_df[selected_columns]
new_df

# Log scaling data

In [None]:
# Optionally append the original Air Quality rows to the competition rows.
if Use_orig_data:
    df = pd.concat([df, new_df])

target = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

def log_scaling(col):
    """Return the element-wise natural logarithm of ``col``."""
    return np.log(col)

def log_scaling_1p(col):
    """Return the element-wise ``log(1 + x)`` of ``col``."""
    return np.log1p(col)

# Replace each target column by its natural log; predictions are mapped back
# with np.exp before the submission is written.
for target_col in target[:3]:
    df[target_col] = log_scaling(df[target_col])

df[target].describe()

# Loading test files

In [None]:
# Competition test data with the same engineered features as the training frame.
test = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/test.csv',
    index_col='date_time',
    parse_dates=True,
)
test['dayoftheweek'] = test.index.dayofweek
test['hourofday'] = test.index.hour
test['month'] = test.index.month.astype("int")

# Whole days elapsed since the earliest date in the test index.
# NOTE(review): this restarts at 0 for the test set, while the training frames
# count from their own first date — confirm that is the intended meaning of `time`.
test['time'] = test.index.date - test.index.date.min()
test['time'] = test['time'].apply(lambda delta: delta.days)

# Hour-of-day windows (both bounds inclusive).
test["morning_peak_hour"] = test["hourofday"].between(7, 9).astype("int")
test["evening_peak_hour"] = test["hourofday"].between(17, 20).astype("int")

test["is_weekend"] = (test.index.dayofweek >= 5).astype("int")
if include_seasons:
    season_months = {
        "is_winter": [1, 2, 12],
        "is_sprint": [3, 4, 5],
        "is_summer": [6, 7, 8],
        "is_autumn": [9, 10, 11],
    }
    for season_flag, months in season_months.items():
        test[season_flag] = test["month"].isin(months)

test["working_hours"] = test["hourofday"].between(8, 20).astype("int")
test['SMC'] = (test['absolute_humidity'] * 100) / test['relative_humidity']

# Dew point from temperature and relative humidity; the shared log/ratio term
# is computed once and used in both numerator and denominator.
_gamma = np.log(test['relative_humidity'] * 0.01) + (17.62 * test['deg_C']) / (243.12 + test['deg_C'])
test['Dew_Point'] = 243.12 * _gamma / (17.62 - _gamma)

#t = TabularDataset(test)
#t.head()

In [None]:
# Plain aliases (no copy): train_df / test_df refer to the same objects as df / test.
train_df = df
test_df = test

In [None]:
# Display the training frame that is fed to the predictors.
train_df

# Model Training - AutoGluon

We can now begin the AutoML process.
Note that in this kaggle notebook I severely reduced the time limit on the autoML process, so please increase it if you intend to follow it

In [None]:
!pip -q install "mxnet<2.0.0"
!pip -q install autogluon

In [None]:
from autogluon.tabular import TabularDataset , TabularPredictor

In [None]:
# Column labels available for training.
train_df.columns

In [None]:
# Quick preview of the training frame (the bare `tr` left here was an
# undefined name and raised NameError).
train_df.head()

In [None]:
# Passed as `time_limit` to every predictor's fit() call.
TIME_LIMIT = 300

# Model inputs; the three targets are deliberately left out.
feature = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'dayoftheweek', 'hourofday', 'time', 'month', 'is_weekend',
    'working_hours', 'morning_peak_hour', 'evening_peak_hour',
    'SMC', 'Dew_Point',
]
print(len(feature))

In [None]:
# Model inputs; the three targets are deliberately left out.
feature = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'dayoftheweek', 'hourofday', 'time', 'month', 'is_weekend',
    'working_hours', 'morning_peak_hour', 'evening_peak_hour',
    'SMC', 'Dew_Point',
]

# First target: fit a predictor on the inputs plus this single label column.
label = target[0]
train_data = train_df[feature + [label]]

print('Starting training for' , label)
save_path = './predictor0/'
predictor0 = TabularPredictor(label=label, path=save_path, verbosity=2).fit(
    train_data,
    presets='best_quality',
    num_stack_levels=3,
    num_bag_folds=5,
    num_bag_sets=3,
    time_limit=TIME_LIMIT,
)

print()





In [None]:
# Model inputs; the three targets are deliberately left out.
feature = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'dayoftheweek', 'hourofday', 'time', 'month', 'is_weekend',
    'working_hours', 'morning_peak_hour', 'evening_peak_hour',
    'SMC', 'Dew_Point',
]

# Second target: fit a predictor on the inputs plus this single label column.
label = target[1]
train_data = train_df[feature + [label]]
save_path = './predictor1/'
print('Starting training for' , label)
predictor1 = TabularPredictor(label=label, path=save_path, verbosity=2).fit(
    train_data,
    presets='best_quality',
    num_stack_levels=3,
    num_bag_folds=5,
    num_bag_sets=3,
    time_limit=TIME_LIMIT,
)
print()





In [None]:
# Model inputs; the three targets are deliberately left out.
feature = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'dayoftheweek', 'hourofday', 'time', 'month', 'is_weekend',
    'working_hours', 'morning_peak_hour', 'evening_peak_hour',
    'SMC', 'Dew_Point',
]

# Third target: fit a predictor on the inputs plus this single label column.
label = target[2]
train_data = train_df[feature + [label]]
save_path = './predictor2/'
print('Starting training for' ,  label)

predictor2 = TabularPredictor(label=label, path=save_path, verbosity=2).fit(
    train_data,
    presets='best_quality',
    num_stack_levels=3,
    num_bag_folds=5,
    num_bag_sets=3,
    time_limit=TIME_LIMIT,
)
print()

In [None]:
# The predictors were fit on log-scaled targets, so np.exp maps each
# prediction back to the original scale.
predictions0 = np.exp(predictor0.predict(test_df))
predictions1 = np.exp(predictor1.predict(test_df))
predictions2 = np.exp(predictor2.predict(test_df))

submission = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

# Write each prediction into its target column as a single-column array.
for target_col, predicted in zip(target[:3], (predictions0, predictions1, predictions2)):
    submission[target_col] = np.vstack([predicted]).T

submission.to_csv('./1_submission.csv', index=False)
submission

# Let's delete some files to save space on Kaggle

In [None]:
fitted_predictors = (predictor0, predictor1, predictor2)

# First drop every model except the best one from each predictor ...
for fitted in fitted_predictors:
    fitted.delete_models(models_to_keep='best', dry_run=False)

# ... then let each predictor free whatever extra disk space it can.
for fitted in fitted_predictors:
    fitted.save_space()


In [None]:
# Leaderboard of each predictor. Only the last expression is the cell's
# displayed value; presumably leaderboard() also prints its table — confirm.
predictor0.leaderboard()
predictor1.leaderboard()
predictor2.leaderboard()

In [None]:
# Inspect the inputs plus the third target column.
label = target[2]
train_data = train_df[feature + [target[2]]]
train_data

In [None]:
label = target[0]
# Inputs plus all three targets, so the same sample can be scored by every predictor.
train_data = train_df[feature + target]

# Rows whose log-scaled targets are NaN (e.g. from missing original values)
# cannot be scored, so drop them first. The fixed seed keeps the importance
# tables reproducible between runs.
scorable_rows = train_data.dropna(subset=target)
train_sample = scorable_rows.sample(n=min(500, len(scorable_rows)), random_state=0)

In [None]:
# Feature importance of the first-target predictor on the sampled rows.
predictor0.feature_importance(train_sample)

In [None]:
# Feature importance of the second-target predictor on the sampled rows.
predictor1.feature_importance(train_sample)

In [None]:
# Feature importance of the third-target predictor on the sampled rows.
predictor2.feature_importance(train_sample)