In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data.csv')

In [6]:
# Transformei a coluna 'date' em index
df.set_index('date', inplace=True)

In [7]:
df

Unnamed: 0_level_0,city,co,no2,so2,o3,pm2.5,pm10,aqi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-01 00:00:00+00:00,Brasilia,0.482248,0.369558,-1.588771,-0.884593,-0.189310,-0.287442,-0.841451
2023-01-01 01:00:00+00:00,Brasilia,0.476619,0.321611,-1.540631,-0.848075,-0.077862,-0.167640,-0.865581
2023-01-01 02:00:00+00:00,Brasilia,0.464007,0.243255,-1.449682,-0.816150,-0.033880,-0.123259,-0.915668
2023-01-01 03:00:00+00:00,Brasilia,0.384375,0.077862,-1.401177,-0.723343,-0.392492,-0.478025,-0.967265
2023-01-01 04:00:00+00:00,Brasilia,0.188033,-0.135911,-1.449682,-0.596935,-0.741388,-0.833781,-1.059299
...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00+00:00,Sydney,-2.241016,-0.945616,-0.365531,-0.816150,-0.198258,-0.050204,-0.412918
2023-12-31 20:00:00+00:00,Sydney,-2.196894,-0.955471,-0.271787,-0.816150,-0.110627,0.045180,-0.419730
2023-12-31 21:00:00+00:00,Sydney,-2.169694,-0.987652,-0.218773,-0.786745,0.069056,0.234219,-0.435724
2023-12-31 22:00:00+00:00,Sydney,-2.144011,-1.046576,-0.182928,-0.723343,0.006273,0.160010,-0.436369


In [12]:
# Dropei as colunas que não são numéricas
df = df.select_dtypes(include=np.number)

In [13]:
# Dividi o dataframe em X e y
X = df.drop('aqi', axis=1)
y = df['aqi']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [16]:
model.fit(X_train, y_train)

In [17]:
# Fazendo a previsão
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

MSE: 0.35511969225259093


In [19]:
# Impotei outros modelos para comparar
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [20]:
# Treinando os modelos
dt_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()
svr_model = SVR()

In [21]:
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
svr_model.fit(X_train, y_train)

In [22]:
# Fazendo as previsões
dt_y_pred = dt_model.predict(X_test)
rf_y_pred = rf_model.predict(X_test)
svr_y_pred = svr_model.predict(X_test)

In [23]:
# Calculando as MSEs
dt_mse = mean_squared_error(y_test, dt_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
svr_mse = mean_squared_error(y_test, svr_y_pred)

In [24]:
print('MSE Decision Tree:', dt_mse)
print('MSE Random Forest:', rf_mse)
print('MSE Support Vector Regression:', svr_mse)

MSE Decision Tree: 0.23682175711800335
MSE Random Forest: 0.11699630841041407
MSE Support Vector Regression: 0.13936115780668262


In [25]:
# Testei modelo de boost
from sklearn.ensemble import GradientBoostingRegressor
gb_model = GradientBoostingRegressor()

In [26]:
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_mse = mean_squared_error(y_test, gb_y_pred)
print('MSE Gradient Boosting:', gb_mse)

MSE Gradient Boosting: 0.1431641127666303


In [27]:
# Testei modelo de XGBoost
import xgboost as xgb
xgb_model = xgb.XGBRegressor()

In [28]:
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_y_pred)
print('MSE XGBoost:', xgb_mse)

MSE XGBoost: 0.127953995887974


In [29]:
# Verifiquei qual modelo tem menor mse
min_mse = min(dt_mse, rf_mse, svr_mse, gb_mse, xgb_mse)
print('Modelo com menor MSE:', min_mse)

Modelo com menor MSE: 0.11699630841041407


In [30]:
# Extrapi o modelo de Random Forest em joblib
import joblib
joblib.dump(rf_model, 'rf_model.joblib')

['rf_model.joblib']