<div>
    <img src='https://storage.googleapis.com/kaggle-datasets-images/340483/674955/c74e20b41b2cdacf6915881df995a5d8/dataset-cover.png'/>
</div>

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from IPython.display import clear_output

from sklearn.model_selection import train_test_split

<h1 id="dataset" style="color:#549f98; background:#e3ebeb; border:0.5px dotted;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Read the sensor CSV and keep only the pressure, temperature and humidity columns.
path = '../input/sofia-air-quality-dataset/2017-07_bme280sof.csv'
df = pd.read_csv(path).loc[:, ['pressure', 'temperature', 'humidity']]
df.head()

In [None]:
# Attach a running row number as an explicit `idx` column; the later cells use
# it to track which rows have already been selected.
df_idx = pd.Series(np.arange(len(df)), name='idx').to_frame()
df = pd.concat([df, df_idx], axis='columns')
df.head()

In [None]:
# Humidity is used as the label, bucketed into three classes:
#   low  : humidity <= 30
#   mid  : 30 < humidity < 70
#   high : humidity >= 70
# Only the row labels of each bucket are collected here; the column itself is
# rewritten in the next cell.

in_low = df['humidity'].le(30)
in_mid = df['humidity'].gt(30) & df['humidity'].lt(70)
in_high = df['humidity'].ge(70)

low_index = df.loc[in_low].index
mid_index = df.loc[in_mid].index
high_index = df.loc[in_high].index

In [None]:
# Overwrite the raw humidity readings with class ids: 0 = low, 1 = mid, 2 = high.
for class_id, rows in enumerate((low_index, mid_index, high_index)):
    df.loc[rows, 'humidity'] = class_id
df['humidity'] = df['humidity'].astype(int)

# Keep an untouched copy of the full labelled frame and remember its size;
# the training loop samples from `df_orig` and stops relative to `max_len`.
df_orig = df.copy()
max_len = len(df_orig)

# Mutable state shared with the training loop below.
indexes, results = [], []
first = True

# Start from a random 5% subset of the data, split 70/30 into train and test.
df = df.sample(frac=0.05)
df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=42)

<h1 id="implementation" style="color:#549f98; background:#e3ebeb; border:0.5px dotted;"> 
    <center>Implementation
        <a class="anchor-link" href="#implementation" target="_self">¶</a>
    </center>
</h1>

In [None]:
def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Build an in-memory ``input_fn`` for the estimator.

    Args:
        X: DataFrame of inputs; every column is passed to TensorFlow as a
            feature tensor keyed by column name (the estimator only consumes
            the columns named in its feature columns).
        y: Labels aligned with ``X``.
        n_epochs: How many times to repeat the data; ``None`` repeats forever
            (used for training, where the step count ends the run).
        shuffle: Whether to shuffle the examples before batching.

    Returns:
        A zero-argument callable that returns a ``tf.data.Dataset``.

    The shuffle buffer and the batch size both read the module-level
    ``NUM_EXAMPLES`` when ``input_fn`` is *called*, so the value set for the
    current round is the one that is used.
    """
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training: the whole training set goes into a single batch.
        dataset = dataset.batch(NUM_EXAMPLES)
        return dataset
    return input_fn


# Iterative training loop. Each round:
#   1. grows the working frame `df` with the previous round's confidently
#      predicted test rows plus a fresh 10% sample of the rows of `df_orig`
#      whose `idx` is not yet in `indexes`,
#   2. re-splits `df` 90/10 into train and test,
#   3. continues training the boosted-trees classifier on the train split,
#   4. evaluates on the test split and records every test row whose highest
#      class probability exceeds 0.95.
# The loop ends once `indexes` holds at least `max_len` entries.
#
# NOTE(review): `indexes` is a plain list and a row can be appended more than
# once (confident test rows are concatenated back into `df` and can land in a
# later test split again), so the stop condition counts confident predictions,
# not distinct rows. De-duplicating would change how/if the loop terminates,
# so it is left as is -- confirm the intended stopping rule.
while len(indexes) < max_len:

    # From the second round on, rebuild the working frame from earlier results.
    if not first:
        if indexes:
            # Re-add the previous test rows that have been confidently predicted.
            df_temp = df_test[df_test['idx'].isin(indexes)]
            df = pd.concat([df, df_temp])
        else:
            # Nothing was predicted confidently yet: restart from a small random sample.
            df = df_orig.sample(frac=0.01)

    # Add 10% of the rows that have not been confidently predicted so far.
    df_temp_test = df_orig[~df_orig.idx.isin(indexes)].sample(frac=0.1)
    df = pd.concat([df, df_temp_test])

    df_train, df_test = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)

    # Read by `make_input_fn` as shuffle buffer size and batch size.
    NUM_EXAMPLES = len(df_train)

    # Training and evaluation input functions.
    train_input_fn = make_input_fn(df_train, df_train['humidity'])
    eval_input_fn = make_input_fn(df_test, df_test['humidity'], shuffle=False, n_epochs=1)

    # The estimator is created once and reused, so later rounds keep training
    # the same model.
    if first:
        NUMERIC_COLUMNS = ['pressure', 'temperature']

        feature_columns = [
            tf.feature_column.numeric_column(feature_name, dtype=tf.float32)
            for feature_name in NUMERIC_COLUMNS
        ]

        est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                                  n_classes=3,
                                                  n_batches_per_layer=8)

    # Train the boosted trees for up to 300 more steps this round.
    # `steps` is used instead of `max_steps`: `max_steps` is a cap on the
    # estimator's *total* step count, so once the first round had reached 300
    # every later `train` call returned without updating the model.
    est.train(train_input_fn, steps=300)

    # Evaluation metrics and per-row predictions on the test split.
    result = est.evaluate(eval_input_fn)
    predictor = est.predict(eval_input_fn)

    # Row labels of the test split, in the (unshuffled) order predictions arrive.
    index_arr = list(df_test.index)

    # Keep the rows whose top class probability exceeds 0.95.
    indexes.extend(
        row_label
        for row_label, pred in zip(index_arr, predictor)
        if pred['probabilities'].max() > 0.95
    )

    clear_output(wait=True)
    print('nr_index:{:}'
          .format(len(indexes)))
    print(pd.Series(result))
    results.append(result)

    first = False

<h1 id="analysis" style="color:#549f98; background:#e3ebeb; border:0.5px dotted;"> 
    <center>Analysis
        <a class="anchor-link" href="#analysis" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Pull the accuracy and the average loss out of every recorded evaluation result.
accs = [round_metrics['accuracy'] for round_metrics in results]
losses = [round_metrics['average_loss'] for round_metrics in results]

In [None]:
# Accuracy of each recorded evaluation, in the order the results were appended.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title('Accuracies')
ax.plot(accs)

In [None]:
# Average loss of each recorded evaluation, in the order the results were appended.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title('Losses')
ax.plot(losses)