#1. Regresión lineal múltiple para calidad del aire
#Air Quality Data Set
Descripción: Contiene las respuestas de un dispositivo multisensor de gas desplegado sobre el terreno en una ciudad italiana. Los promedios de respuestas por hora se registran junto con referencias de concentraciones de gas de un analizador certificado.

Instancias: 9357

Atributos: 15

0. Fecha (DD / MM / AAAA)
1. Hora (HH.MM.SS)
2. Concentración real promedio por hora de CO en mg / m ^ 3 
3. PT08.S1 (óxido de estaño) 
4. Concentración de hidrocarburos no metánicos total promediada por hora real en microg / m ^ 3 
5. Concentración de benceno promediada por hora real en microg / m ^ 3 
6. PT08.S2 (titania) respuesta del sensor promediada por hora
7. Concentración de NOx promediada por hora real en ppb
8. PT08.S3 (óxido de tungsteno) respuesta del sensor promediada por hora 
9. Concentración verdadera de NO2 promediada por hora en microg / m ^ 3 
10. PT08.S4 (óxido de tungsteno) respuesta del sensor promediada por hora 
11. PT08.S5 (óxido de indio) respuesta del sensor promediada por hora 
12. Temperatura en ° C
13. Humedad relativa (%)
14. Humedad absoluta (AH)

#2.Autenticación Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset stored on the shared
# drive can be read by the cells below. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#3. Importando módulos

In [None]:
import pandas as pd
import os
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

#4. Lectura de archivo de calidad de aire

In [None]:
# Folder on the mounted shared drive that holds the dataset, and the name of
# the Excel workbook inside it.
path = r'/content/drive/Shareddrives/Data Science para Geociencias/6. Métodos de ML/6.1 Regresión lineal múltiple'
name = 'AirQualityUCI.xlsx'

In [None]:
# Full path to the Excel workbook (folder + file name).
path_file = os.path.join(path,name)

In [None]:
# Load the workbook into a DataFrame and preview the first rows.
air_quality = pd.read_excel(path_file)
air_quality.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [None]:
# Report how many rows (instances) and columns (attributes) were loaded.
n_instances, n_attributes = air_quality.shape
print('Número de instancias: %s' % n_instances)
print('Número de atributos: %s' % n_attributes)

Número de instancias: 9357
Número de atributos: 15


#5. Limpieza de datos

Modificando la columna de fecha

In [None]:
# Merge the separate 'Date' and 'Time' columns into a single datetime column
# named 'Fecha', then keep only the columns of interest with the target 'T'
# placed last (later cells rely on 'Fecha' being first and 'T' being last).
timestamp_text = air_quality['Date'].astype(str) + ' ' + air_quality['Time'].astype(str)
column_order = [
    'Fecha',
    'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
    'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'RH', 'AH', 'T',
]
air_quality = (
    air_quality
    .assign(Fecha=pd.to_datetime(timestamp_text))
    .drop(columns=['Date', 'Time'])
    [column_order]
)
air_quality.head()

Unnamed: 0,Fecha,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,T
0,2004-03-10 18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,48.875001,0.757754,13.6
1,2004-03-10 19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,47.7,0.725487,13.3
2,2004-03-10 20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,53.975,0.750239,11.9
3,2004-03-10 21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,60.0,0.786713,11.0
4,2004-03-10 22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,59.575001,0.788794,11.15


In [None]:
# Line plot of the temperature column against time, before any cleaning.
fig = go.Figure()
fig.update_layout(template="ggplot2")
fig.add_trace(
    go.Scatter(x=air_quality['Fecha'], y=air_quality['T'], mode='lines', name='T')
)
# Marker size is applied only to traces drawn in 'markers' mode.
fig.update_traces(marker={'size': 3}, selector={'mode': 'markers'})
pio.show(fig)

Eliminando filas con valores NaN

In [None]:
# Discard every row that contains at least one NaN.
air_quality = air_quality.dropna()

Identificación y reemplazo de valores faltantes (codificados como -200) por la media de cada columna

In [None]:
# Missing readings are encoded in this dataset as the sentinel value -200.
# Replace each one with the mean of the valid readings of the same column.
#
# Valid readings are selected with "!= sentinel" instead of "> 0": the "> 0"
# filter also discarded legitimate zero/negative measurements (the temperature
# column 'T' contains sub-zero values), which biased the imputed mean upwards.
MISSING_SENTINEL = -200
value_cols = air_quality.columns[1:]  # every column except 'Fecha'
# Turn sentinel cells into NaN so that .mean() skips them.
valid_values = air_quality[value_cols].mask(air_quality[value_cols] == MISSING_SENTINEL)
# Work on an independent copy so the assignment never writes into a slice.
air_quality = air_quality.copy()
air_quality[value_cols] = valid_values.fillna(valid_values.mean())

In [None]:
# Same temperature-vs-time line plot, now drawn from the cleaned data.
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=air_quality['Fecha'], y=air_quality['T'], mode='lines', name='T')
)
# Marker size is applied only to traces drawn in 'markers' mode.
fig.update_traces(marker={'size': 3}, selector={'mode': 'markers'})
pio.show(fig)

In [None]:
# Summary statistics of every numeric column after cleaning.
air_quality.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,T
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,2.15275,1099.707856,218.811816,10.082993,939.029205,246.881252,835.370973,113.075515,1456.143486,1022.780725,49.23236,1.02553,18.317216
std,1.316068,212.796116,63.870229,7.302474,261.557856,193.419417,251.741784,43.911095,339.365351,390.609,16.974308,0.395836,8.658398
min,0.1,647.25,7.0,0.149048,383.25,2.0,322.0,2.0,551.0,221.0,9.175,0.184679,-1.9
25%,1.2,941.25,218.811816,4.591495,742.5,112.0,665.5,85.9,1241.5,741.75,36.55,0.746115,12.025
50%,2.15275,1074.5,218.811816,8.593367,923.25,229.0,817.5,113.075515,1456.143486,982.5,49.23236,1.015441,18.275
75%,2.6,1221.25,218.811816,13.636091,1104.75,284.2,960.25,133.0,1662.0,1255.25,61.875,1.296223,24.075
max,11.9,2039.75,1189.0,63.741476,2214.0,1479.0,2682.75,339.7,2775.0,2522.75,88.725,2.231036,44.6


Escalamiento: estandarización

In [None]:
# Standardise every measurement column (all but 'Fecha') to zero mean and
# unit variance.
#
# The scaler statistics are estimated ONLY on the chronologically first 80% of
# the rows -- the same rows the hold-out split uses for training -- and then
# applied to the whole frame. Fitting on all rows would leak information from
# the test period into the training features.
scale = StandardScaler()
n_fit_rows = int(0.8*air_quality.shape[0])  # must match the training fraction of the hold-out split
scale.fit(air_quality.iloc[:n_fit_rows,1:])
air_quality.iloc[:,1:] = scale.transform(air_quality.iloc[:,1:])

In [None]:
# Preview the standardised values.
air_quality.head()

Unnamed: 0,Fecha,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,T
0,2004-03-10 18:00:00,0.339856,1.223265,-1.077427,0.246331,0.407086,-0.418187,0.87745,-0.00172,0.69503,0.62654,-0.021054,-0.67652,-0.544843
1,2004-03-10 19:00:00,-0.116071,0.904868,-1.672415,-0.093922,0.060108,-0.743922,1.344223,-0.479984,0.302364,-0.129371,-0.09028,-0.758039,-0.579493
2,2004-03-10 20:00:00,0.035905,1.420648,-2.048197,-0.148612,0.000844,-0.599151,1.21015,0.021055,0.28984,0.131134,0.279416,-0.695505,-0.741195
3,2004-03-10 21:00:00,0.035905,1.296109,-2.173458,-0.11698,0.035255,-0.387165,1.019468,0.203251,0.376035,0.462045,0.634383,-0.603357,-0.845146
4,2004-03-10 22:00:00,-0.420023,0.810876,-2.627528,-0.488185,-0.395839,-0.599151,1.468365,0.066604,0.09977,0.223302,0.609344,-0.598098,-0.82782


Validación

#6. Hold out validation para series de tiempo


In [None]:
# Sizes of the hold-out split: the first 80% of the rows are used for
# training, the remaining rows for testing.
n_train = int(len(air_quality) * 0.8)
n_test = len(air_quality) - n_train

In [None]:
# Chronological hold-out: the earliest n_train rows form the training set and
# everything after them forms the test set (no shuffling).
train_df = air_quality.iloc[:n_train]
test_df = air_quality.iloc[n_train:]
print(f'Dimension de los datos de entrenamiento {train_df.shape}')
print(f'Dimension de los datos de prueba {test_df.shape}')

Dimension de los datos de entrenamiento (7485, 14)
Dimension de los datos de prueba (1872, 14)


In [None]:
# Features are every column between the first ('Fecha') and the last ('T');
# the target is the last column. Both are converted to NumPy arrays.
Xtrain, Ytrain = (
    train_df.iloc[:, 1:-1].to_numpy(),
    train_df.iloc[:, -1].to_numpy(),
)
Xtest, Ytest = (
    test_df.iloc[:, 1:-1].to_numpy(),
    test_df.iloc[:, -1].to_numpy(),
)

#7. Modelo de inferencia: Regresión lineal múltiple

Instanciando regresor

In [None]:
# Ordinary least-squares linear regressor.
# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current versions. The default
# constructor behaves like the former normalize=False.
linreg = LinearRegression()

Ajuste con datos de entrenamiento

In [None]:
# Fit the regressor on the training features/target (fit returns the estimator).
linreg = linreg.fit(Xtrain, Ytrain)

Coeficiente de determinación (R²) de entrenamiento

In [None]:
# Coefficient of determination (R^2) on the training data.
linreg.score(Xtrain, Ytrain)

0.933711238084812

#8. Infiriendo temperaturas de prueba

In [None]:
# Predict the (standardised) temperature for the held-out rows.
preds = linreg.predict(Xtest)

# Coefficient of determination (R^2) on the test data.
linreg.score(Xtest, Ytest)

0.6471226761963146

Transformación inversa de las temperaturas

In [None]:
# Copy of the test measurements (without 'Fecha') whose 'T' column holds the
# model predictions instead of the observed values, so that the fitted scaler
# can later be inverted on a frame with the same column layout.
test_cpy = test_df.iloc[:, 1:].copy()
test_cpy['T'] = preds
test_cpy.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,T
7485,-1.103915,-1.255766,-8.900306e-16,-1.107588,-1.358392,-0.724791,1.225047,-0.753278,-1.873136,-1.432534,-0.984326,-1.495856,-0.351519
7486,-0.799963,-1.04546,-8.900306e-16,-0.992189,-1.136632,-0.368035,0.800979,-0.019939,-1.761156,-1.173309,-0.957814,-1.496819,-0.340322
7487,-0.799963,-0.974966,-8.900306e-16,-0.934935,-1.037223,-0.344251,0.719542,0.185031,-1.721374,-1.001773,-0.83851,-1.485052,-0.415201
7488,-0.723975,-0.944419,-8.900306e-16,-0.921822,-1.015238,-0.18035,0.659954,0.437828,-1.660964,-0.848799,-0.487961,-1.443459,-0.636228
7489,-0.116071,-0.432164,-8.900306e-16,-0.410636,-0.300253,0.433375,0.01541,1.057295,-1.41122,-0.204898,-0.632305,-1.522781,-0.554088


In [None]:
# Undo the standardisation to return to the original units:
# predicted_test uses the frame whose 'T' column holds the predictions,
# groundtruth_test uses the untouched test measurements.
predicted_test = scale.inverse_transform(test_cpy)
groundtruth_test = scale.inverse_transform(test_df.iloc[:,1:])

In [None]:
# Wrap the de-standardised arrays back into DataFrames so columns can be
# accessed by name. Both frames share the same column layout.
measurement_columns = [
    'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
    'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'RH', 'AH', 'T',
]
test_predicted_new, test_groundtruth_new = (
    pd.DataFrame(values, columns=measurement_columns)
    for values in (predicted_test, groundtruth_test)
)

In [None]:
# Overlay observed and predicted temperature for the test period.
fig = go.Figure()
for series, label in (
    (test_groundtruth_new['T'], 'Ground Truth'),
    (test_predicted_new['T'], 'Predicted'),
):
    fig.add_trace(go.Scatter(x=test_df['Fecha'], y=series, mode='lines', name=label))
# Marker size is applied only to traces drawn in 'markers' mode.
fig.update_traces(marker={'size': 3}, selector={'mode': 'markers'})
pio.show(fig)