## Bibliotecas

In [None]:
# pip install lightning

In [26]:
# Importações constantes

import pandas as pd
import re
import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset

In [27]:
# Coisas definidas previamente

TAMANHO_TESTE = 0.1
TAMANHO_VALIDACAO = 0.1

SEMENTE_ALEATORIA = 13

NUM_EPOCAS = 20

## Dataset

O dataset a seguir contém todos os dados fornecidos pelo Computational 2D Materials Database (C2DB) (https://c2db.fysik.dtu.dk/):

In [28]:
df = pd.read_csv("C2DB_full.csv")
df

Unnamed: 0,Formula ✕,Band gap ✕,2D plasma frequency (x) ✕,2D plasma frequency (y) ✕,Band gap (G₀W₀) ✕,Band gap (HSE06) ✕,"Conduction band effective mass, direction 1 ✕","Conduction band effective mass, direction 2 ✕",Cond. band minimum ✕,Conduction band minimum (G₀W₀) ✕,...,"Stiffness tensor, 32-component ✕","Stiffness tensor, 33-component ✕",Stoichiometry ✕,Mass ✕,Age ✕,ID ✕,Unique identifier ✕,Username ✕,Vacuum level difference ✕,Volume ✕
0,Be4,0.0,-,-,-,-,-,-,-,-,...,-,-,A,36.049,6w,1,Be4-09dd42ad034e,cmr,-0.0,256.436
1,As4O8,3.232,-,-,-,-,1.117,1.155,-3.938,-,...,0.000,-69.689,AB2,427.678,6w,2,As4O8-5242a449d950,cmr,0.0,1411.694
2,Ca4As4,0.998,-,-,-,1.583,1.141,3.465,-1.024,-,...,0.003,26.732,AB,459.998,6w,3,As4Ca4-bf7bbbdbefe0,cmr,-0.0,1426.071
3,Fe4S8,0.0,-,-,-,-,-,-,-,-,...,3.617,55.290,AB2,479.860,6w,4,Fe4S8-897195c26aff,cmr,0.0,1080.629
4,In2Se2,1.63,-,-,-,2.254,0.207,1.067,-2.271,-,...,-0.000,13.382,AB,387.578,6w,5,In2Se2-0a48e35c06ea,cmr,-0.0,761.622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4029,Rh2P2S6,0.0,4.043,4.045,-,0.000,-,-,-,-,...,0.021,36.035,ABC3,460.119,6w,4036,P2Rh2S6-b6a27022f56f,cmr,-0.001,580.987
4030,Ta2P2Se6,0.294,0.000,0.000,-,0.813,0.235,1.179,-0.799,-,...,-1.189,22.64,ABC3,897.669,6w,4037,P2Ta2Se6-e2c90519357b,cmr,-0.0,677.616
4031,Zr2P2Se6,0.394,0.000,0.000,-,1.074,0.741,1.006,-1.142,-,...,0.937,24.638,ABC3,718.222,6w,4038,P2Zr2Se6-2486f04ec8ea,cmr,0.0,755.942
4032,Mo2W2Se8,1.27,0.000,0.000,-,1.749,0.479,0.513,0.853,-,...,-0.002,90.448,ABC4,1191.348,6w,4039,Mo2W2Se8-a1d716aad84d,cmr,-0.0,700.669


## Tratamento do dataset

Para realizar a previsão de band gap, precisaremos apenas de alguns parâmetros específicos. Dessa forma, retiraremos todas as colunas que não serão utilizadas:

In [29]:
df.drop(columns=['2D plasma frequency (x)              ✕',
       '2D plasma frequency (y)              ✕',
       'Band gap (G₀W₀)              ✕', 'Band gap (HSE06)              ✕',
       'Conduction band effective mass, direction 1              ✕',
       'Conduction band effective mass, direction 2              ✕',
       'Cond. band minimum              ✕',
       'Conduction band minimum (G₀W₀)              ✕',
       'Conduction band minimum (HSE06)              ✕',
       'Dir. band gap              ✕', 'Direct band gap (G₀W₀)              ✕',
       'Direct band gap (HSE06)              ✕',
       'Dir. gap wo. soc.              ✕', 'Exc. bind. energy              ✕',
       'Gap wo. soc.              ✕',
       'Phonon dynamic stability (low/high)              ✕', 'Valence band effective mass, direction 1              ✕',
       'Valence band effective mass, direction 2              ✕',
       'Val. band maximum              ✕',
       'Valence band maximum (G₀W₀)              ✕',
       'Valence band maximum (HSE06)              ✕',
       'First class material              ✕', 'Calculator              ✕',
       'Anisotropic exchange (out-of-plane)              ✕',
       'Area of unit-cell              ✕', 'Topology              ✕',
       'Crystal type              ✕', 'DOS at ef              ✕',
       'DOS at ef no soc.              ✕',
       'Energy above convex hull              ✕', 'Fermi level              ✕', 'Magnetic anisotropy (E<sub>z</sub> - E<sub>x</sub>)              ✕',
       'Magnetic anisotropy (E<sub>z</sub> - E<sub>y</sub>)              ✕',
       'Magnetic easy axis              ✕', 'Magnetic state              ✕', 'Material class              ✕',
       'Material has inversion symmetry              ✕',
       'Magnetic              ✕', 'Material unique ID              ✕',
       'Maximum force              ✕', 'Maximum stress              ✕',
       'Maximum value of S_z at magnetic sites              ✕',
       'Minimum eigenvalue of Hessian              ✕',
       'Monolayer reported DOI              ✕',
       'Nearest neighbor exchange coupling              ✕',
       'Charge              ✕', 'Number of atoms              ✕',
       'Number of nearest neighbors              ✕', 'n-spins              ✕',
       'Out-of-plane dipole along +z axis              ✕',
       'Path to collection folder              ✕', 'PBC              ✕',
       'Point group              ✕', 'Unique ID              ✕',
       'Related COD id              ✕', 'Related ICSD id              ✕',
       'Single-ion anisotropy (out-of-plane)              ✕',
       'Soc. total energy, x-direction              ✕',
       'Soc. total energy, y-direction              ✕',
       'Soc. total energy, z-direction              ✕',
       'Space group              ✕', 'Speed of sound (x)              ✕',
       'Speed of sound (y)              ✕',
       'Static interband polarizability (x)              ✕',
       'Static interband polarizability (y)              ✕',
       'Static interband polarizability (z)              ✕',
       'Static lattice polarizability (x)              ✕',
       'Static lattice polarizability (y)              ✕',
       'Static lattice polarizability (z)              ✕',
       'Static total polarizability (x)              ✕',
       'Static total polarizability (y)              ✕',
       'Static total polarizability (z)              ✕',
       'Stiffness dynamic stability (low/high)              ✕',
       'Stiffness tensor, 11-component              ✕',
       'Stiffness tensor, 12-component              ✕',
       'Stiffness tensor, 13-component              ✕',
       'Stiffness tensor, 21-component              ✕',
       'Stiffness tensor, 22-component              ✕',
       'Stiffness tensor, 23-component              ✕',
       'Stiffness tensor, 31-component              ✕',
       'Stiffness tensor, 32-component              ✕',
       'Stiffness tensor, 33-component              ✕',
       'Stoichiometry              ✕', 'Mass              ✕',
       'Age              ✕', 'ID              ✕',
       'Unique identifier              ✕', 'Username              ✕',
       'Vacuum level difference              ✕', 'Vacuum level              ✕', 'Magnetic moment              ✕'], inplace=True)

Renomeando as colunas do dataset:

In [30]:
df.rename(columns={
'Formula              ✕': 'Formula', 
'Band gap              ✕': 'Band gap',
'Thermodynamic stability level              ✕': 'Thermodynamic stability level', 
'Energy              ✕': 'Energy',
'Work function (avg. if finite dipole)              ✕': 'Work function', 
'Heat of formation              ✕': 'Heat of formation',
'Space group number              ✕': 'Space group number',
'Volume              ✕': 'Volume of unit cell',
}, inplace=True)

## Retirando as linhas com dados faltantes

Já que temos uma quantidade considerável de dados e poucas linhas com dados faltantes, vamos simplesmente dropar as linhas que apresentem algum item faltante, não vamos usar nehuma técnica de preenchimento artificial ou algo do gênero.

In [31]:
# Dropping das linhas que contêm "-"
df_sem_hifen = df.drop(df[df.eq("-").any(axis=1)].index)
df = df_sem_hifen

# remove linhas com células vazia
df = df.dropna()  

In [32]:
df

Unnamed: 0,Formula,Band gap,Thermodynamic stability level,Energy,Work function,Heat of formation,Space group number,Volume of unit cell
0,Be4,0.0,1,-13.110,5.102,0.425,67,256.436
1,As4O8,3.232,3,-72.425,6.94,-1.065,31,1411.694
2,Ca4As4,0.998,2,-32.647,2.781,-0.743,14,1426.071
3,Fe4S8,0.0,2,-70.802,5.0,-0.168,7,1080.629
4,In2Se2,1.63,3,-14.491,4.59,-0.500,12,761.622
...,...,...,...,...,...,...,...,...
4029,Rh2P2S6,0.0,3,-52.474,4.675,-0.298,162,580.987
4030,Ta2P2Se6,0.294,2,-54.430,4.853,-0.317,2,677.616
4031,Zr2P2Se6,0.394,3,-53.151,4.916,-0.672,2,755.942
4032,Mo2W2Se8,1.27,3,-81.079,4.417,-0.653,25,700.669


## Particionando as fórmulas moleculares

Como os dados apresentam fórmulas químicas, é interessante particioná-las para oferecer uma maior capacidade de capturar informações específicas de cada parte da fórmula, podendo melhorar a qualidade e a interpretabilidade do modelo.

In [33]:
def extract_elements(formula):
    elements = re.findall(r'([A-Z][a-z]*)(\d*)', formula)
    return dict((el, int(num) if num else 1) for el, num in elements)

df['Elementos'] = df['Formula'].apply(extract_elements)
df_elementos = pd.DataFrame(df['Elementos'].tolist()).fillna(0)

df = df.drop(columns=['Elementos'])

df_elementos

Unnamed: 0,Be,As,O,Ca,Fe,S,In,Se,Sc,V,...,Os,Hg,Ir,Mo,Re,Rh,Ru,Y,Cs,K
0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


Um parâmetro que pode influenciar de forma considerável na previsão e que não constava no dataset é a eletronegatividade. Assim, optou-se por incluir manualmente essa coluna ao conjunto de dados. Contudo, a eletronegatividade é um parâmetro atômico, portanto, para incluí-la utilizou-se uma expressão que calcula a eletronegatividade média da molécula considerando a contribuição de cada elemento com base em seu número de átomos e sua eletronegatividade:

$$\chi_m = \frac{\sum_i n_i \cdot \chi_i}{\sum_i n_i} $$

em que

$\chi_m$ e $\chi_i$ são as eletronegatividades da molécula e do átomo, respectivamente;

e $n_i$ é o número de átomos.

## Eletronegatividades

Ao perceber que para o escobo do nosso problema a eletronegatividade pode ser um fator de grande influência, resolvemos add a nossa rede.

In [34]:
eletronegatividades = {
 'H': 2.2,
 'Li': 0.98,
 'Be': 1.57,
 'B': 2.04,
 'C': 2.55,
 'N': 3.04,
 'O': 3.44,
 'F': 3.98,
 'Na': 0.93,
 'Mg': 1.31,
 'Al': 1.61,
 'Si': 1.9,
 'P': 2.19,
 'S': 2.58,
 'Cl': 3.16,
 'K': 0.82,
 'Ca': 1.0,
 'Sc': 1.36,
 'Ti': 1.54,
 'V': 1.63,
 'Cr': 1.66,
 'Mn': 1.55,
 'Fe': 1.83,
 'Co': 1.88,
 'Ni': 1.91,
 'Cu': 1.9,
 'Zn': 1.65,
 'Ga': 1.81,
 'Ge': 2.01,
 'As': 2.18,
 'Se': 2.55,
 'Br': 2.96,
 'Rb': 0.82,
 'Sr': 0.95,
 'Y': 1.22,
 'Zr': 1.33,
 'Nb': 1.6,
 'Mo': 2.16,
 'Ru': 2.2,
 'Rh': 2.28,
 'Pd': 2.2,
 'Ag': 1.93,
 'Cd': 1.69,
 'In': 1.78,
 'Sn': 1.96,
 'Sb': 2.05,
 'Te': 2.1,
 'I': 2.66,
 'Cs': 0.79,
 'Ba': 0.89,
 'Hf': 1.3,
 'Ta': 1.5,
 'W': 2.36,
 'Re': 1.9,
 'Os': 2.2,
 'Ir': 2.2,
 'Pt': 2.28,
 'Au': 2.54,
 'Hg': 2.0,
 'Tl': 1.62,
 'Pb': 2.33,
 'Bi': 2.02,
}

{{{{{{{{Explicar aqui como, matematicamente e humanamente, como o cálculo da eletronegatividade foi feito}}}}}}}}}}}

In [35]:
eletronegatividade_molecula = []

for indice, linha in df_elementos.iterrows():
    eletronegatividade = 0
    soma_elementos = linha.sum()

    for elemento, quantidade in linha.items():
        valor_eletronegatividade = eletronegatividades[elemento]
        contribuicao = (quantidade * valor_eletronegatividade) / soma_elementos
        eletronegatividade += contribuicao

    eletronegatividade_molecula.append(eletronegatividade)

In [36]:
# add a eletronegatividade respectiva de cada molécula

df['Electronegativity'] = eletronegatividade_molecula

Juntando os dados particionados ao dataset:

In [37]:
df = df.join(df_elementos, how='right')

Retirando a coluna "Formula", já que agora está escrita com base nos elementos.

In [38]:
df_sem_formula = df.drop('Formula', axis=1)
df = df_sem_formula

## Salvando em arquivo do dataset, no formato .csv <br>(processamento inicial finalizado)

In [39]:
df.to_csv('dataset_tratado.csv', index=False)

In [40]:
df

Unnamed: 0,Band gap,Thermodynamic stability level,Energy,Work function,Heat of formation,Space group number,Volume of unit cell,Electronegativity,Be,As,...,Os,Hg,Ir,Mo,Re,Rh,Ru,Y,Cs,K
0,0.0,1.0,-13.110,5.102,0.425,67.0,256.436,1.570,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.232,3.0,-72.425,6.94,-1.065,31.0,1411.694,3.020,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.998,2.0,-32.647,2.781,-0.743,14.0,1426.071,1.590,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,2.0,-70.802,5.0,-0.168,7.0,1080.629,2.330,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.63,3.0,-14.491,4.59,-0.500,12.0,761.622,2.165,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,1.02,3.0,-38.916,5.497,-0.195,162.0,647.735,2.386,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4028,0.028,3.0,-51.958,5.223,-0.227,162.0,544.497,2.368,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4029,0.0,3.0,-52.474,4.675,-0.298,162.0,580.987,2.442,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4030,0.294,2.0,-54.430,4.853,-0.317,2.0,677.616,2.268,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


## Separando features e targets

In [43]:
features = ["Thermodynamic stability level", "Energy", "Work function", "Heat of formation", "Space group number", "Volume of unit cell", "Electronegativity", "Be", "As", "O", "Ca", "Fe", "S", "In","Se","Sc","V","Zr","B","H","Te","Al","Mg","Ba","Pb","Mn","Si","Cr","Br","Ga","Hf","Ge","Ti","C","I","Li","Cl","Sr","Na","Nb","Ni","Ta","Pd","Pt","Tl","W","Sb","N","Cd","Cu","Sn","F","P","Ag","Au","Bi","Co","Zn","Rb","Os","Hg","Ir","Mo","Re","Rh","Ru","Y","Cs","K"]
target = ["Band gap"]

df = df.reindex(FEATURES + TARGET, axis=1)

## Separando conjunto de teste

In [44]:
indices = df.index
indices_treino_val, indices_teste = train_test_split(
    indices, test_size=TAMANHO_TESTE, random_state=SEMENTE_ALEATORIA
)

df_treino_val = df.loc[indices_treino_val]
df_teste = df.loc[indices_teste]

X_teste = df_teste.reindex(features, axis=1).values
y_teste = df_teste.reindex(target, axis=1).values

In [51]:
y_teste

array(['0.0', '1.167', '0.0', '0.0', '0.000', '0.0', '0.1', '2.278',
       '0.337', '4.777', '0.0', '0.0', '0.043', '0.0', '0.000', '0.0',
       '0.0', '2.063', '0.0', '0.0', '0.0', '4.454', '0.0', '0.0',
       '0.863', '0.000', '0.000', '0.0', '0.0', '0.0', '0.0', '2.385',
       '0.0', '1.037', '0.0', '0.0', '0.000', '0.0', '0.182', '0.0',
       '0.0', '0.461', '1.094', '2.667', '0.0', '0.0', '0.0', '1.271',
       '2.007', '0.226', '0.0', '0.0', '1.522', '1.865', '0.0', '0.0',
       '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0',
       '0.195', '0.199', '1.837', '1.02', '2.033', '0.711', '0.478',
       '0.000', '0.0', '3.389', '0.0', '0.000', '0.0', '1.338', '0.0',
       '1.202', '0.134', '0.000', '3.465', '1.302', '0.0', '0.0', '0.75',
       '1.393', '0.0', '0.0', '0.000', '0.0', '0.0', '0.506', '0.0',
       '0.0', '0.0', '0.0', '0.0', '0.0', '2.18', '0.000', '0.000',
       '1.308', '0.0', '0.062', '1.313', '0.0', '0.0', '0.0', '0.0',
       '0.0', '0.000'

## Separando conjunto de treino e validação

In [49]:
indices = df_treino_val.index
indices_treino, indices_val = train_test_split(
    indices, test_size=TAMANHO_TESTE, random_state=SEMENTE_ALEATORIA
)

df_treino = df.loc[indices_treino]
df_val = df.loc[indices_val]

X_treino = df_treino.reindex(features, axis=1).values
y_treino = df_treino.reindex(target, axis=1).values

X_val = df_val.reindex(features, axis=1).values
y_val = df_val.reindex(target, axis=1).values

## Normalização de dados:

Usamos o normalizador pelo máximo absoluto p/ não alterar a esparcidade de dados. (p/ gente a questão de 0 é importante ter. Pq os 0 das quant. de elementos é bem interessante que seja realmente 0.)<br>
`A normalização é feita apenas no conjunto de treino!`

In [52]:
x_scaler = MaxAbsScaler()
x_scaler.fit(X_treino)

y_scaler = MaxAbsScaler()
y_scaler.fit(y_treino)

X_treino = x_scaler.transform(X_treino)
y_treino = y_scaler.transform(y_treino)

X_val = x_scaler.transform(X_val)
y_val = y_scaler.transform(y_val)

X_teste = x_scaler.transform(X_teste)

# apontou erro de shape, tive que reshapar
y_teste = y_teste.reshape(-1, 1)
y_teste = y_scaler.transform(y_teste)

O PyTorch precisa que os dados estejam em formato de tensores. Então:

In [53]:
X_treino = torch.tensor(X_treino, dtype=torch.float32)
y_treino = torch.tensor(y_treino, dtype=torch.float32)

X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)

X_teste = torch.tensor(X_teste, dtype=torch.float32)
y_teste = torch.tensor(y_teste, dtype=torch.float32)

## DataModule (simplificado)

Criandouma instância da classe L.LightningDataModule que será utilizada durante o treinamento da rede neural pelo PyTorch Lightning.

In [54]:
class DataModule(L.LightningDataModule):
    def __init__(
        self,
        X_treino,
        y_treino,
        X_val,
        y_val,
        X_teste,
        y_teste,
        tamanho_lote = 256,
        num_trabalhadores = 2,
    ):
        super().__init__()

        self.tamanho_lote = tamanho_lote
        self.num_trabalhadores = num_trabalhadores

        self.X_treino = X_treino
        self.y_treino = y_treino
        self.X_val = X_val
        self.y_val = y_val
        self.X_teste = X_teste
        self.y_teste = y_teste

    def train_dataloader(self):
        return DataLoader(
            TensorDataset(self.X_treino, self.y_treino),
            batch_size=self.tamanho_lote,
            num_workers=self.num_trabalhadores,
        )

    def val_dataloader(self):
        return DataLoader(
            TensorDataset(self.X_val, self.y_val),
            batch_size=self.tamanho_lote,
            num_workers=self.num_trabalhadores,
        )

    def test_dataloader(self):
        return DataLoader(
            TensorDataset(self.X_teste, self.y_teste),
            batch_size=self.tamanho_lote,
            num_workers=self.num_trabalhadores,
        )

## DataModule (completo)

In [55]:
class DataModule(L.LightningDataModule):
    def __init__(
        self,
        tamanho_lote=256,
        num_trabalhadores=2,
    ):
        super().__init__()

        self.tamanho_lote = tamanho_lote
        self.num_trabalhadores = num_trabalhadores

    def prepare_data(self):
        """Local onde se faz o download do dataset. No caso do `seaborn`, ele
        deve baixar o dataset quando se usa pela primeira vez e por isso
        incluímos este código aqui.

        Não altere o estado da sua classe neste método!

        Se você não precisa fazer download de um dataset, pode eliminar este
        método.

        """
        sns.load_dataset("penguins")

    def setup(self, stage):
        """Ocorre após o `prepare_data`. Aqui devemos alterar o estado da classe
        para adicionar as informações referentes aos conjuntos de treino, teste
        e validação. O argumento `stage` deve existir e ele indica em qual
        estágio o processo de treino está (pode ser `fit` para
        treinamento/validação e `test` para teste).

        É nesta etapa onde aplicamos transformações aos dados caso necessário.

        """
        features = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm"]
        target = ["body_mass_g"]

        df = sns.load_dataset("penguins")

        df = df.reindex(features + target, axis=1)
        df = df.dropna()

        indices = df.index
        indices_treino_val, indices_teste = train_test_split(
            indices, test_size=TAMANHO_TESTE, random_state=SEMENTE_ALEATORIA
        )

        df_treino_val = df.loc[indices_treino_val]
        df_teste = df.loc[indices_teste]

        indices = df_treino_val.index
        indices_treino, indices_val = train_test_split(
            indices,
            test_size=TAMANHO_TESTE,
            random_state=SEMENTE_ALEATORIA,
        )

        df_treino = df.loc[indices_treino]
        df_val = df.loc[indices_val]

        X_treino = df_treino.reindex(features, axis=1).values
        y_treino = df_treino.reindex(target, axis=1).values

        self.x_scaler = MaxAbsScaler()
        self.x_scaler.fit(X_treino)

        self.y_scaler = MaxAbsScaler()
        self.y_scaler.fit(y_treino)

        if stage == "fit":
            X_val = df_val.reindex(features, axis=1).values
            y_val = df_val.reindex(target, axis=1).values

            X_treino = self.x_scaler.transform(X_treino)
            y_treino = self.y_scaler.transform(y_treino)

            X_val = self.x_scaler.transform(X_val)
            y_val = self.y_scaler.transform(y_val)

            self.X_treino = torch.tensor(X_treino, dtype=torch.float32)
            self.y_treino = torch.tensor(y_treino, dtype=torch.float32)

            self.X_val = torch.tensor(X_val, dtype=torch.float32)
            self.y_val = torch.tensor(y_val, dtype=torch.float32)

        if stage == "test":
            X_teste = df_teste.reindex(features, axis=1).values
            y_teste = df_teste.reindex(target, axis=1).values

            X_teste = self.x_scaler.transform(X_teste)
            y_teste = self.y_scaler.transform(y_teste)

            self.X_teste = torch.tensor(X_teste, dtype=torch.float32)
            self.y_teste = torch.tensor(y_teste, dtype=torch.float32)

    def train_dataloader(self):
        return DataLoader(
            TensorDataset(self.X_treino, self.y_treino),
            batch_size=self.tamanho_lote,
            num_workers=self.num_trabalhadores,
        )

    def val_dataloader(self):
        return DataLoader(
            TensorDataset(self.X_val, self.y_val),
            batch_size=self.tamanho_lote,
            num_workers=self.num_trabalhadores,
        )

    def test_dataloader(self):
        return DataLoader(
            TensorDataset(self.X_teste, self.y_teste),
            batch_size=self.tamanho_lote,
            num_workers=self.num_trabalhadores,
        )

## Criando a rede neural com PyTorch Lightning

In [56]:
class MLP(L.LightningModule):
    def __init__(
        self, num_dados_entrada, neuronios_c1, neuronios_c2, num_targets
    ):
        super().__init__()

        self.camadas = nn.Sequential(
            nn.Linear(num_dados_entrada, neuronios_c1),
            nn.Sigmoid(),
            nn.Linear(neuronios_c1, neuronios_c2),
            nn.Sigmoid(),
            nn.Linear(neuronios_c2, num_targets),
        )

        self.fun_perda = F.mse_loss

        self.perdas_treino = []
        self.perdas_val = []

        self.curva_aprendizado_treino = []
        self.curva_aprendizado_val = []

    def forward(self, x):
        x = self.camadas(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.fun_perda(y, y_pred)

        self.log("loss", loss, prog_bar=True)
        self.perdas_treino.append(loss)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.fun_perda(y, y_pred)

        self.log("val_loss", loss, prog_bar=True)
        self.perdas_val.append(loss)

        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.fun_perda(y, y_pred)

        self.log("test_loss", loss)

        return loss

    def on_train_epoch_end(self):
        # Atualiza curva de aprendizado
        perda_media = torch.stack(self.perdas_treino).mean()
        self.curva_aprendizado_treino.append(float(perda_media))
        self.perdas_treino.clear()

    def on_validation_epoch_end(self):
        # Atualiza curva de aprendizado
        perda_media = torch.stack(self.perdas_val).mean()
        self.curva_aprendizado_val.append(float(perda_media))
        self.perdas_val.clear()

    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(), lr=1e-3)
        return optimizer

## Treinando a rede

In [57]:
# criando um treinador

treinador = L.Trainer(max_epochs=NUM_EPOCAS)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [58]:
# Criando uma instância da classe DataModule.

dm = DataModule()

In [63]:
# Criando uma instância da rede neural.

num_dados_de_entrada = 69
num_dados_de_saida = 1
neuronios_c1 = 3
neuronios_c2 = 2

minha_mlp = MLP(
    num_dados_de_entrada, neuronios_c1, neuronios_c2, num_dados_de_saida
)

In [64]:
# Podemos rodar o método fit do nosso treinador na nossa instância da rede neural, fornecendo também uma instância do DataModule como argumento.

treinador.fit(minha_mlp, dm)


  | Name    | Type       | Params
---------------------------------------
0 | camadas | Sequential | 1.2 K 
---------------------------------------
1.2 K     Trainable params
0         Non-trainable params
1.2 K     Total params
0.005     Total estimated model params size (MB)


Sanity Checking: |                                                                               | 0/? [00:00<…

c:\venv\ilumpy\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


RuntimeError: mat1 and mat2 shapes cannot be multiplied (31x3 and 69x15)

## Curva de aprendizado

In [None]:
figura, eixo = plt.subplots()

ca_treino = minha_mlp.curva_aprendizado_treino
ca_val = minha_mlp.curva_aprendizado_val

eixo.plot(ca_treino, label="Treino")
eixo.plot(ca_val, label="Validação")

eixo.legend()

eixo.set_xlim(left=0)

eixo.set_title("Curva de aprendizado")
eixo.set_xlabel("Época")
eixo.set_ylabel("Loss");

## Testando a rede

Antes de testar a rede neural que treinamos precisamos deixá-la no modo de avaliação.

In [None]:
minha_mlp.eval()

Antes de computar a métrica, temos que configurar a instância do DataModule no modo teste, criar um ambiente onde não computamos os gradientes locais (ambiente no_grad), computar os valores previstos do target e, finalmente, desnormalizar os dados com o inverse_transform.

In [None]:
dm.setup("test")

with torch.no_grad():
    X_true = dm.X_teste

    y_true = dm.y_teste
    y_true = dm.y_scaler.inverse_transform(y_true)

    y_pred = minha_mlp(X_true)
    y_pred = dm.y_scaler.inverse_transform(y_pred)

    RMSE = mean_squared_error(y_true, y_pred, squared=False)

    print(RMSE)

## Salvando o modelo

Salvando apenas os pesos e vieses da rede neural em um arquivo no seu computador. Para isso, podemos utilizar o módulo pickle.

In [None]:
nome_arquivo = "meu_modelo.p"
pickle.dump(minha_mlp.state_dict(), open(nome_arquivo, "wb"))

# Referências

1. CASSAR, D. R. PyTorch Lightning. (2024)
2. CASSAR, D. R.Treinando uma rede neural com pytorch. (2024)
3. CASSAR, D. R. Redes neurais artificiais do zero em Python. (2024)
4. CASSAR, D. R. Avaliação de modelos: a estratégia de divisão entre treino e teste. (2023)
5. CASSAR, D. R. Transformação e normalização. (2023)
6. CASSAR, D. R. Conversão simbólico-numérico. (2023)
7. ChatGPT para ajuda na resolução de bugs.