In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# The QLattice 

In this notebook I'll show how the QLattice can be used to predict air pollution with almost no knowledge of ML or data science - in fact, your grandmother could probably learn to operate the QLattice quite easily :-)

The QLattice can also be used for much more advanced analysis, but you will have to look for examples of that in other notebooks - with this one, I want to show how easy it can be to learn something meaningful from a dataset.

The QLattice is a supervised machine learning tool for symbolic regression developed by Abzu. It is inspired by Richard Feynman's path integral formulation. That's why the Python module used to access it is called Feyn, and the Q in QLattice stands for Quantum.

Abzu provides free QLattices for non-commercial use to anyone. These free community QLattices get allocated for you automatically if you use Feyn without an active subscription, as we will do in this notebook. Read more about how it works here: https://docs.abzu.ai/docs/guides/getting_started/community.html

In [None]:
# install the feyn module
!pip install feyn

In [None]:
import feyn
import sklearn.model_selection
import seaborn as sbn

In [None]:
# Load the competition training data; the path is relative to the notebook's
# working directory.
df_polution = pd.read_csv("../input/tabular-playground-series-jul-2021/train.csv")
# A bare expression on the last line of a cell is rendered as the cell's output.
df_polution

In [None]:
# Drop the date_time column so it is not part of the training data.
# errors="ignore" keeps this cell safe to re-run: df_polution is rebound here,
# so on a second execution the column is already gone and a plain drop would
# raise KeyError.
df_polution = df_polution.drop(columns=["date_time"], errors="ignore")

In [None]:
# Count the NaN values in each column.
df_polution.isna().sum()

In [None]:
# Inspect the column dtypes to spot categorical data. Categorical columns do
# not have to be encoded, but information about them must be passed to the
# QLattice.
# See this page for more help: https://docs.abzu.ai/docs/guides/essentials/stypes.html
df_polution.dtypes

In [None]:
# A quick visual impression of the data: seaborn's pairplot draws the pairwise
# relationships between the numeric columns of the frame.
sbn.pairplot(df_polution)

In [None]:
# One frame per target variable: each is df_polution with the two *other*
# target columns removed, so exactly one target column remains in each frame.
df_target_carbon_monoxide = df_polution.drop(
    columns=["target_benzene", "target_nitrogen_oxides"]
)

df_target_benzene = df_polution.drop(
    columns=["target_carbon_monoxide", "target_nitrogen_oxides"]
)

df_target_nitrogen_oxides = df_polution.drop(
    columns=["target_carbon_monoxide", "target_benzene"]
)

In [None]:
# Split every per-target frame into 80% train / 20% test.
# A fixed random_state makes the split repeatable across runs.
split_options = {"train_size": .80, "random_state": 1}

(df_train_target_carbon_monoxide,
 df_test_target_carbon_monoxide) = sklearn.model_selection.train_test_split(
    df_target_carbon_monoxide, **split_options)

(df_train_target_benzene,
 df_test_target_benzene) = sklearn.model_selection.train_test_split(
    df_target_benzene, **split_options)

(df_train_target_nitrogen_oxides,
 df_test_target_nitrogen_oxides) = sklearn.model_selection.train_test_split(
    df_target_nitrogen_oxides, **split_options)



In [None]:
# Connect to a QLattice. No arguments are passed, so this relies on the free
# community QLattice mentioned in the introduction of this notebook.
ql = feyn.connect_qlattice()

In [None]:
# Train models for predicting carbon monoxide.
# output_name names the column to predict; max_complexity=5 presumably caps
# how complex the candidate models may become - confirm in the feyn docs.
models = ql.auto_run(df_train_target_carbon_monoxide,
                     output_name="target_carbon_monoxide",
                     max_complexity=5)

In [None]:
# Plot the best model the QLattice found, passing both the train and the test
# split. models[0] is treated as the best model (presumably the returned list
# is ordered best-first - confirm in the feyn docs).
best_model = models[0]
best_model.plot(df_train_target_carbon_monoxide, df_test_target_carbon_monoxide)

In [None]:
# Load the competition test data; its date_time column is used later when the
# submission file is built.
df_polution_test = pd.read_csv("../input/tabular-playground-series-jul-2021/test.csv")

In [None]:
# Preparing for building the submission df: predict carbon monoxide on the
# test data now, because best_model is rebound when the next target is trained.
pred_target_carbon_monoxide = best_model.predict(df_polution_test)

In [None]:
# Train models for predicting benzene.
# The QLattice is reset first (presumably to discard what it learned while
# fitting the previous target - confirm in the feyn docs).
ql.reset()
models = ql.auto_run(df_train_target_benzene, 
                     output_name="target_benzene",
                     max_complexity=5)

In [None]:
# Plot the best benzene model, passing both the train and the test split.
# Note: this rebinds best_model, replacing the carbon monoxide model.
best_model = models[0]
best_model.plot(df_train_target_benzene, df_test_target_benzene)

In [None]:
# Preparing for building the submission df: predict benzene on the test data
# before best_model is rebound by the next target.
pred_target_benzene = best_model.predict(df_polution_test)

In [None]:
# Train models for predicting nitrogen oxides.
# The QLattice is reset first (presumably to discard what it learned while
# fitting the previous target - confirm in the feyn docs).
ql.reset()
models = ql.auto_run(df_train_target_nitrogen_oxides, 
                     output_name="target_nitrogen_oxides",
                     max_complexity=5)

In [None]:
# Plot the best nitrogen oxides model, passing both the train and the test split.
# Note: this rebinds best_model, replacing the benzene model.
best_model = models[0]
best_model.plot(df_train_target_nitrogen_oxides, df_test_target_nitrogen_oxides)

In [None]:
# Preparing for building the submission df: predict nitrogen oxides on the
# test data with the model trained for that target.
pred_target_nitrogen_oxides = best_model.predict(df_polution_test)

In [None]:
# Building the submission csv.
# The frame is assembled column-wise in a single step rather than appended to
# row by row: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and appending inside a loop copies the whole frame on every iteration.
# date_time is taken positionally (to_numpy drops the index), matching the order
# of the prediction arrays; the predictions are stored as floats.
# A length mismatch between the columns raises ValueError instead of going unnoticed.
df_my_submission = pd.DataFrame({
    "date_time": df_polution_test["date_time"].to_numpy(),
    "target_carbon_monoxide": np.asarray(pred_target_carbon_monoxide, dtype=float),
    "target_benzene": np.asarray(pred_target_benzene, dtype=float),
    "target_nitrogen_oxides": np.asarray(pred_target_nitrogen_oxides, dtype=float),
})

# index=False keeps the row index out of the written file.
df_my_submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
df_my_submission