# Chemical gas sensor drift compensation

## prediction

Purpose: prediction of VOC.

dataset source: https://archive.ics.uci.edu/dataset/270/gas+sensor+array+drift+dataset+at+different+concentrations


In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

from matplotlib import pyplot as plt # import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


import os
import re

path = 'Dataset/'

## Структура данных

Каждый отклик описан 8 признаками:  
* Δ𝑅, ‖Δ𝑅‖, 𝑒𝑚𝑎0.001𝐼, 𝑒𝑚𝑎0.01𝐼, 𝑒𝑚𝑎0.1𝐼, 𝑒𝑚𝑎0.001𝐷, 𝑒𝑚𝑎0.01𝐷, and 𝑒𝑚𝑎0.1𝐷. Регистрируется 16 сенсорами, итого каждый отклик описан 128 признаками.

Используемые аналиты: 
* 1: Ethanol; 2: Ethylene; 3: Ammonia; 4: Acetaldehyde; 5: Acetone; 6: Toluene

Строка состоит из индикатора аналита (1-6), концентрации, и 128 признаков:
* 1;10.000000 1:15596.162100 2:1.868245 3:2.371604 ... 127:-0.902241 128:-2.654529 

In [2]:
# Предыдущий вариант импорта был хорош для рисования, хотя можно было изменить трехмерное обращение к данным на добавление индекса +2, чтобы не затрагивать первые колонки

gas_data = []

voc = {1: 'Ethanol', 2: 'Ethylene', 3: 'Ammonia', 4: 'Acetaldehyd', 5: 'Acetone', 6: 'Toluene'}

with open(path + 'batch1.dat', 'r') as file_data:
    for i, line in enumerate(file_data.readlines()):
#        gas_type = int(line[0])
#        rest = line[1:]
        data = line.strip().split(' ') 
        gas_type, conc = data[0].strip().split(';')
        gas_type = voc[int(gas_type)]
#        print(data)
        # split in pairs by spaces and then split pairs by colons    
        x, y = zip(*[(int(i.strip().split(':')[0]), float(i.strip().split(':')[1])) for i in data[1:]])
        gas_data.append([gas_type] + [conc] + list(y))
#        gas_data.append((gas_type, float(conc), (x, y)))

In [3]:
features = ('R', '‖ΔR‖', 'ema0.001l', 'ema0.01l', 'ema0.1l', 'ema0.001D', 'ema0.01D', 'ema0.1D')
sensors = [str(i) for i in range(16)]

df_columns = ['VOC', 'conc']
for i in sensors:
    for j in features:
        df_columns.append('s' + i + '_' + j)


df = pd.DataFrame(gas_data, columns=df_columns)
print(df)

         VOC       conc         s0_R    s0_‖ΔR‖  s0_ema0.001l  s0_ema0.01l  \
0    Ethanol  10.000000   15596.1621   1.868245      2.371604     2.803678   
1    Ethanol  20.000000   26402.0704   2.532401      5.411209     6.509906   
2    Ethanol  30.000000   42103.5820   3.454189      8.198175    10.508439   
3    Ethanol  40.000000   42825.9883   3.451192     12.113940    16.266853   
4    Ethanol  50.000000   58151.1757   4.194839     11.455096    15.715298   
..       ...        ...          ...        ...           ...          ...   
440  Toluene  10.000000   74805.0518   6.707129     15.446750    19.415134   
441  Toluene  15.000000   92035.5156   7.775487     21.173590    27.620422   
442  Toluene  20.000000  107898.2334   8.994761     25.131079    33.771374   
443  Toluene  25.000000  119795.0352   9.582606     28.944716    39.290350   
444  Toluene  35.000000  140782.2978  10.975342     35.524802    59.584134   

     s0_ema0.1l  s0_ema0.001D  s0_ema0.01D  s0_ema0.1D  ...  s1

## Prediction

Сперва датафрейм где только основные признаки, в комплект к полному. А то если раньше можно было по прежнему индексу обращаться к датасету, то теперь-то он разделиться на две части и индексация потеряется.

И  далее подставлять тот или иной набор данных.

In [4]:
#p = list(range(len(df_columns)))
#main_comp = [v for i, v in enumerate(p) if ((i - 2) % 8 in {0, 1} or i in {0, 1})]

main_comp = [v for i, v in enumerate(df_columns) if ((i - 2) % 8 in {0, 1} or i in {0, 1})]
df2 = df.loc[:,main_comp]
print(df2)

         VOC       conc         s0_R    s0_‖ΔR‖         s1_R   s1_‖ΔR‖  \
0    Ethanol  10.000000   15596.1621   1.868245   15326.6914  1.768526   
1    Ethanol  20.000000   26402.0704   2.532401   23855.7812  2.164706   
2    Ethanol  30.000000   42103.5820   3.454189   37562.3008  2.840403   
3    Ethanol  40.000000   42825.9883   3.451192   38379.0664  2.851173   
4    Ethanol  50.000000   58151.1757   4.194839   51975.5899  3.480866   
..       ...        ...          ...        ...          ...       ...   
440  Toluene  10.000000   74805.0518   6.707129   71893.4726  5.953645   
441  Toluene  15.000000   92035.5156   7.775487   88619.7480  6.945339   
442  Toluene  20.000000  107898.2334   8.994761  103904.5332  7.910543   
443  Toluene  25.000000  119795.0352   9.582606  115551.0313  8.425875   
444  Toluene  35.000000  140782.2978  10.975342  135883.9053  9.623297   

          s2_R   s2_‖ΔR‖       s3_R   s3_‖ΔR‖  ...      s11_R  s11_‖ΔR‖  \
0    2789.3831  2.754759  2581.5686 

### Random search

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from sklearn.metrics import classification_report


# Grid Search

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler # "pip install scikit-learn", not "pip install sklearn"
from sklearn.decomposition import PCA as sk_pca



from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000)), # increased from default (100) https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
    ]
)




In [11]:
# select columns with main features
skpca = sk_pca(n_components=3)

df_train, df_test = train_test_split(df, test_size=0.2) # или подставлять dfdf2  для другого набора

sc = preprocessing.MinMaxScaler() 

X_train = skpca.fit_transform(sc.fit_transform(df_train.values[:, 2:]))
#X_test = sc.transform(df_test.values[:, 2:])
Y_train = df_train.values[:, 0]
X_test = skpca.fit_transform(sc.transform(df_test.values[:, 2:]))
y_test = df_test.values[:, 0]

#print(df_test)

In [14]:
model = pipeline.fit(X_train, Y_train)
y_pred_simple = model.predict(X_test)

print(classification_report(y_test, y_pred_simple, zero_division=0))

              precision    recall  f1-score   support

 Acetaldehyd       0.00      0.00      0.00         5
     Acetone       0.40      0.62      0.48        13
     Ammonia       0.11      0.14      0.12        22
     Ethanol       0.64      0.56      0.60        16
    Ethylene       0.00      0.00      0.00        20
     Toluene       0.36      0.31      0.33        13

    accuracy                           0.27        89
   macro avg       0.25      0.27      0.26        89
weighted avg       0.25      0.27      0.26        89



In [20]:
# Оптимизация гиперпараметров

parameters = {
    'scaler__with_mean': [True, False],
    'clf__C': np.linspace(0.01, 1, 10),
    'clf__penalty': ['l2'], #  было 'clf__penalty': ['l2', None] - надоели предупреждения "Setting penalty=None will ignore the C and l1_ratio parameters" 
    'clf__random_state': [2023],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=2,
    verbose=1,
)


grid_search.fit(X_train, Y_train)


best_parameters = grid_search.best_estimator_.get_params()

y_pred_optimized = grid_search.best_estimator_.predict(X_test)
print('grid classification_report: \n', classification_report(y_test, y_pred_optimized, zero_division=0))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
grid classification_report: 
               precision    recall  f1-score   support

 Acetaldehyd       0.00      0.00      0.00         5
     Acetone       0.40      0.62      0.48        13
     Ammonia       0.11      0.14      0.12        22
     Ethanol       0.64      0.56      0.60        16
    Ethylene       0.00      0.00      0.00        20
     Toluene       0.36      0.31      0.33        13

    accuracy                           0.27        89
   macro avg       0.25      0.27      0.26        89
weighted avg       0.25      0.27      0.26        89



In [21]:
# Randomized Search

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameters,
    n_iter=10,
    random_state=2023,
    n_jobs=4,
    verbose=1,
)

random_search.fit(X_train, Y_train)

best_parameters = random_search.best_estimator_.get_params()

y_pred_optimized = random_search.best_estimator_.predict(X_test)
report = classification_report(y_test, y_pred_optimized, zero_division=0) # 
print('random classification_report: \n', report)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
random classification_report: 
               precision    recall  f1-score   support

 Acetaldehyd       0.00      0.00      0.00         5
     Acetone       0.40      0.62      0.48        13
     Ammonia       0.11      0.14      0.12        22
     Ethanol       0.64      0.56      0.60        16
    Ethylene       0.00      0.00      0.00        20
     Toluene       0.36      0.31      0.33        13

    accuracy                           0.27        89
   macro avg       0.25      0.27      0.26        89
weighted avg       0.25      0.27      0.26        89

