<a href="https://colab.research.google.com/github/renatomenendes/AnaliseSobrevivencia/blob/main/Simulador_master_table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Pandas display configuration: never truncate rows or columns when
# printing, and render floats with one decimal place instead of scientific
# notation.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)
pd.set_option("display.float_format", '{:.1f}'.format)

# Seed NumPy's global random generator.
np.random.seed(42)

In [None]:
def score_product_group(row):
    """Return a 0-2 score for a row based on its product group.

    ``row`` must support ``row['product_group']`` and, for the ``'OUTROS'``
    group only, ``row['product_name']``. Matching is exact and
    case-sensitive.

    * ``'S10'`` scores 2 and ``'S500'`` scores 1.
    * ``'OUTROS'`` is scored from the product name: 2 when the name contains
      ``'S10'``, otherwise 1 when it contains ``'S500'``, otherwise 0.
    * Every other group (including ``'OC_*'`` and ``'MARITMO'``) scores 0.
    """
    group = row['product_group']
    if group == 'S10':
        return 2
    if group == 'S500':
        return 1
    # Explicit zero-score groups. Checked before the generic fallback so a
    # non-string group raises on ``startswith`` instead of being scored.
    if group.startswith('OC_') or group == 'MARITMO':
        return 0
    if group != 'OUTROS':
        return 0

    # 'OUTROS': fall back to keywords embedded in the product name.
    name = row['product_name']
    if 'S10' in name:
        return 2
    if 'S500' in name:
        return 1
    return 0

def score_product_class(row):
    """Return a 0-3 score for the row's ``'product_class'`` value.

    ``'Top Premium'`` scores 3, ``'Premium'`` 2 and ``'Convencional'`` 1.
    Any other value scores 0. Matching is exact and case-sensitive.
    """
    product_class = row['product_class']
    ranked_classes = (('Top Premium', 3), ('Premium', 2), ('Convencional', 1))
    for label, score in ranked_classes:
        if product_class == label:
            return score
    return 0

In [None]:
def calculate_pricing(df, replenishment_cost_col, nominal_discount_col, delta_cost_col, pace_col, minimum_margin_col, standard_neg_value_col):
    """Add ``adjusted_delta_cost`` and ``_initial_price`` columns to ``df``.

    ``df`` is modified in place (the pace column is clipped to [-2, 2] and the
    two result columns are written) and the same object is returned.

    The cost variation passed on to the price depends on the sign of the pace
    and of the cost delta:

    ==========  ===================  ==========================
    pace        delta_cost > 0       delta_cost < 0
    ==========  ===================  ==========================
    > 0         delta / \|pace\|       delta (passed on in full)
    < 0         delta * \|pace\|       standard negotiation value
    otherwise   delta                standard negotiation value
    ==========  ===================  ==========================

    A zero (or NaN) delta always yields 0. ``|pace|`` is floored at 0.1.

    ``_initial_price`` is the largest of:
    ``replenishment_cost + nominal_discount + adjusted_delta_cost``,
    ``replenishment_cost + minimum_margin`` and the standard negotiation
    value.
    """
    # Clamp pace in place; its magnitude is floored to avoid dividing by zero.
    df[pace_col] = df[pace_col].clip(-2, 2)
    pace = df[pace_col]
    pace_magnitude = np.abs(pace).clip(lower=0.1)

    delta = df[delta_cost_col]
    standard_neg_value = df[standard_neg_value_col]

    pace_positive = pace > 0
    pace_negative = pace < 0
    cost_up = delta > 0
    cost_down = delta < 0

    # np.select takes the first matching rule, so the two trailing generic
    # rules only apply to rows not already handled by a pace-specific rule.
    rules = [
        (pace_positive & cost_up, delta / pace_magnitude),   # soften increases
        (pace_positive & cost_down, delta),                  # pass reductions on
        (pace_negative & cost_up, delta * pace_magnitude),   # amplify increases
        (cost_up, delta),                                    # neutral pace, increase
        (cost_down, standard_neg_value),                     # reductions not passed on
    ]
    df['adjusted_delta_cost'] = np.select(
        [condition.to_numpy() for condition, _ in rules],
        [choice.to_numpy() for _, choice in rules],
        default=0,
    )

    replenishment_cost = df[replenishment_cost_col]
    candidate_price = replenishment_cost + df[nominal_discount_col] + df['adjusted_delta_cost']
    floor_price = replenishment_cost + df[minimum_margin_col]
    df['_initial_price'] = np.maximum(np.maximum(candidate_price, floor_price), standard_neg_value)
    return df

In [None]:
# Data-generation settings shared by the cells below.
start_date = datetime(2024, 1, 1)
# NOTE(review): the upper bound is the current wall-clock time, so the set of
# candidate dates grows from run to run even though the NumPy seed is fixed.
end_date = datetime.now()
# One entry per day, as plain Python datetime objects.
date_range = pd.date_range(start_date, end_date, freq='D').to_pydatetime().tolist()
n_clients = 5_000
n_depots = 80
n_products = 20
n_samples = 10_000  # rows in the simulated table
n_uf = 27  # number of distinct depot_state labels
n_regions = 5
n_cities = 4400
standard_neg = 10  # constant written to the 'standard_neg' column
n_components = 10  # principal components kept by PCA
epochs = 100  # GAN training epochs
noise_dim = 100  # reassigned later, in the GAN configuration cell
target_variable = 'reference_price'

# Synthetic identifiers: '<prefix>_<index>'.
client_ids = ['client_' + str(i) for i in range(n_clients)]
#depot_ids = load_table("processed","depot_register").query("depot in @depots_to_price")['depot'].tolist()
#product_ids = load_table("processed","product_register")['product'].tolist()
depot_ids = ['depot_' + str(i) for i in range(n_depots)]
product_ids = ['product_' + str(i) for i in range(n_products)]
region_ids = ['depot_region_' + str(i) for i in range(n_regions)]
state_ids = ['depot_state_' + str(i) for i in range(n_uf)]
city_ids = ['depot_city_' + str(i) for i in range(n_cities)]
# NOTE(review): score_product_group / score_product_class compare against
# 'S10', 'S500', 'MARITMO' and 'Convencional' (case-sensitive), which do not
# match the spellings used here ('s10', 's500', 'MARÍTIMO', 'convencional').
# Confirm which spelling is intended before scoring simulated rows.
product_group = ['s10', 's500', 'MARÍTIMO', 'OC_A', 'OC_B', 'OUTROS']
product_class = ['convencional', 'Premium', 'Top Premium']
# Each product gets one randomly drawn group and one randomly drawn class.
group_assignments = np.random.choice(product_group, n_products)
class_assignments = np.random.choice(product_class, n_products)
product_to_group = dict(zip(product_ids, group_assignments))
product_to_class = dict(zip(product_ids, class_assignments))

granular_cols = ['client','depot','product']

In [None]:
def simulate_data(n_samples: int) -> pd.DataFrame:
    """Build a random master table with ``n_samples`` rows.

    Values are drawn from NumPy's global random generator, so the result
    depends on its current state. The function reads the module-level
    ``date_range``, ``client_ids``, ``product_ids``, ``depot_ids``,
    ``region_ids``, ``state_ids``, ``city_ids``, ``standard_neg``,
    ``product_to_group`` and ``product_to_class``.

    Returns a DataFrame with the drawn columns, the derived price/margin
    columns, the product group/class, and ``year_month`` / ``nweekday`` /
    ``weekday`` derived from ``date``.
    """
    # Independent random draws, one array of length n_samples per column.
    data = {
        'date': np.random.choice(date_range, n_samples),
        'client': np.random.choice(client_ids, n_samples),
        'product': np.random.choice(product_ids, n_samples),
        'depot': np.random.choice(depot_ids, n_samples),
        'depot_region': np.random.choice(region_ids, n_samples),
        'depot_state': np.random.choice(state_ids, n_samples),
        'depot_city': np.random.choice(city_ids, n_samples),
        'distribution_channel': np.random.choice([10, 11], n_samples),
        'forecast': np.random.randint(10, 1000, n_samples),
        'volume_m3': np.random.randint(10, 1000, n_samples),
        'days_without_purchase': np.random.randint(1, 5, n_samples),
        'purchase_frequency': np.ceil(15 / np.random.randint(1, 15, n_samples)),
        'pace': np.random.normal(-2.0, 2.0, n_samples),
        # 'is_buying': np.random.choice([0, 1], n_samples),
        'minimum_margin': np.random.randint(-400, 401, n_samples),
        'unitary_margin': np.random.randint(-400, 401, n_samples),
        'previous_replenishment_cost': np.random.randint(3500, 7000, n_samples),
        'nominal_discount': np.random.randint(-100, 401, n_samples),
    }

    # Derive mutually consistent costs, margins and prices.
    data['standard_neg'] = standard_neg
    data['replenishment_cost'] = data['previous_replenishment_cost'] + np.random.randint(-10, 11, n_samples)
    data['reference_price'] = data['replenishment_cost'] + data['unitary_margin'] + standard_neg
    # Inverse of the line above, so this reproduces the drawn unitary_margin.
    data['unitary_margin'] = data['reference_price'] - data['replenishment_cost'] - standard_neg
    data['delta_cost'] = (data['replenishment_cost'] - data['previous_replenishment_cost'])
    data['pace'] = data['pace'].clip(-2, 2)
    data['unitary_cost'] = data['reference_price'] / data['volume_m3']
    # 1 when the days without purchase do not exceed the purchase frequency.
    data['is_buying'] = np.where(data['days_without_purchase'] <= data['purchase_frequency'],1,0)

    # Comparable-group price and price corridor. Both the comparable price and
    # the corridor minimum use the same random addition.
    price_addition = np.random.randint(100, 1001, n_samples)
    data['comparable_group_price'] = data['replenishment_cost'] + price_addition
    data['min_corridor_price'] = data['replenishment_cost'] + price_addition
    data['max_corridor_price'] = data['min_corridor_price'] + np.random.randint(0, 601, n_samples)

    # Margin corridor built around minimum_margin.
    low = data['minimum_margin'] + 1
    high = data['minimum_margin'] + np.random.randint(0, 101, n_samples)  # offset is >= 0, so high >= minimum_margin
    data['min_corridor_margin'] = np.minimum(low, high - 1)  # keeps the corridor minimum below its maximum
    data['max_corridor_margin'] = np.maximum(high, data['min_corridor_margin'] + 1)

    # Previous margin = current unitary margin plus Gaussian noise (std 20).
    adjustment_factor = np.random.normal(loc=0.0, scale=20.0, size=n_samples)
    data['previous_margin'] = data['unitary_margin'] + adjustment_factor

    # Convert the column dict into a DataFrame.
    data = pd.DataFrame(data)

    # Attach each product's group and class.
    data['product_group'] = data['product'].map(product_to_group)
    data['product_class'] = data['product'].map(product_to_class)

    # Date-derived columns: 'YYYY-MM', weekday number (0 = Monday) and name.
    weekdays = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    data = data.assign(
        year_month=lambda df: df["date"].dt.strftime("%Y-%m"),
        nweekday=lambda df: df["date"].dt.weekday,
        weekday=lambda df: df["nweekday"].apply(lambda x: weekdays[x])
    )

    return data

In [None]:
# Build the simulated master table with n_samples rows.
market_data = simulate_data(n_samples)

In [None]:
# Price the simulated table, then add comparison columns against the
# reference price. calculate_pricing writes its result columns into
# market_data itself; .assign() then returns a new frame bound to df.
pricing_columns = (
    'replenishment_cost',
    'nominal_discount',
    'delta_cost',
    'pace',
    'minimum_margin',
    'standard_neg',
)
df = calculate_pricing(market_data, *pricing_columns).assign(
    diff_price=lambda priced: priced['_initial_price'] - priced['reference_price'],
    diff_margin=lambda priced: (
        (priced['_initial_price'] - priced['replenishment_cost'] + priced['adjusted_delta_cost'])
        - (priced['reference_price'] - priced['replenishment_cost'])
    ),
    inferior_limit=lambda priced: (
        priced['replenishment_cost'] + priced['minimum_margin'] + priced['standard_neg']
    ),
)

# df.head()[['reference_price',
#              'replenishment_cost',
#              'delta_cost',
#              'adjusted_delta_cost',
#              'nominal_discount',
#              'minimum_margin',
#              'standard_neg',
#              'volume_m3',
#              'forecast',
#              'pace',
#              'inferior_limit',
#              '_initial_price',
#              'diff_price',
#              'diff_margin',
#             ]]

In [None]:
# Feature groups
date_features = ['date', 'year_month', 'nweekday', 'weekday']
categorical_features = ["client", "product_group", "product_class", "product",  "depot_region", "depot_state",
                        "depot_city", "depot",'distribution_channel','is_buying']
# Each numeric column is listed exactly once.
numeric_features = ['reference_price','_initial_price','comparable_group_price',
                    'previous_replenishment_cost','replenishment_cost',
                    'unitary_cost', 'delta_cost', 'adjusted_delta_cost','nominal_discount',
                    'max_corridor_margin','minimum_margin','min_corridor_margin',
                    'max_corridor_price','inferior_limit','min_corridor_price',
                    'forecast', 'volume_m3', 'days_without_purchase', 'purchase_frequency', 'pace',
                    'previous_margin','unitary_margin',
                    'standard_neg','diff_price', 'diff_margin'
                    ]

# Columns produced by the pricing step; excluded from the model inputs.
result_columns = ['inferior_limit', 'diff_price', 'diff_margin','_initial_price',]

# Clean df BEFORE deriving numeric_data from it, so the PCA input never
# contains NaN/inf values or rows that were removed from df.
# NOTE(review): NaNs are mean-filled before infinities are turned into NaN,
# so rows containing +/-inf are dropped rather than imputed - confirm intent.
df['date'] = pd.to_datetime(df['date'])
df.fillna(df[numeric_features].mean(), inplace=True)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
# Restore a contiguous 0..n-1 index so positional and label-based alignment
# agree even when rows were dropped above.
df.reset_index(drop=True, inplace=True)

# Model inputs: numeric features minus the pricing results.
numeric_data_columns = [col for col in numeric_features if col not in result_columns]

# numeric_data holds the same columns with the target moved to the end.
columns = [col for col in numeric_data_columns if col != target_variable] + [target_variable]
numeric_data = df[columns]

In [None]:
# Scale the numeric inputs (median/IQR based, robust to outliers).
scaler = RobustScaler()
numeric_data_scaled = scaler.fit_transform(numeric_data)

# Project onto the first n_components principal components.
# NOTE(review): numeric_data still contains target_variable, so the target
# itself contributes to the components - confirm this is intended.
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(numeric_data_scaled)

# Component scores plus the target column.
columns = ['PC{}'.format(1+i) for i in range(n_components)]
pca_df = pd.DataFrame(data=principal_components, columns=columns)
# Attach the target positionally from the very rows that were fed to PCA;
# assigning a Series taken from df would align on index labels instead and
# produce NaN/misplaced values whenever the two indexes differ.
pca_df[target_variable] = numeric_data[target_variable].to_numpy()

# Share of variance explained by each retained component.
explained_variance = pca.explained_variance_ratio_

print("Variância explicada pelos primeiros componentes principais:", sum(explained_variance))
# correlation_with_volume = pca_df.corr()[target_variable].sort_values(ascending=False)
# print(correlation_with_volume)

# Plot individual (bars) and cumulative (step line) explained variance.
plt.figure(figsize=(10, 5))
plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1, len(explained_variance) + 1), np.cumsum(explained_variance), where='mid', label='Cumulative explained variance')
plt.ylabel('Ratio of explained variance')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Loadings: one row per principal component, one column per input feature.
components_weights = pca.components_
weights_df = pd.DataFrame(data=components_weights, columns=numeric_data.columns)

# Loadings of the third component (row index 2), largest first.
weights_pc = weights_df.iloc[2].sort_values(ascending=False)

# Feature names in that order, without the target column.
significant_fields = list(weights_pc.index.drop(target_variable))
significant_df = df.loc[:, significant_fields]
display(weights_pc)

In [None]:
# Numeric pipeline: mean imputation -> standardisation -> PCA.
numeric_transformer = Pipeline([
                                ('imputer', SimpleImputer(strategy='mean')),
                                ('scaler', StandardScaler()),
                                ('pca', PCA(n_components=n_components))  # same size as the standalone PCA above
                             ])

# Categorical pipeline: constant imputation -> one-hot encoding.
categorical_transformer = Pipeline([
                                    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                  ])

# Apply each pipeline to its own set of columns.
preprocessor = ColumnTransformer([
                                  ('num', numeric_transformer, numeric_data_columns),
                                  ('cat', categorical_transformer, categorical_features)
                                ])

# NOTE(review): numeric_data_columns contains target_variable, so the target
# leaks into the features through the PCA step. It is kept here because the
# following cell labels the PCA loadings with numeric_data_columns.
feature_columns = numeric_data_columns + categorical_features

# Split FIRST, then fit the preprocessor on the training rows only, so that
# imputation means, scaling statistics, PCA axes and one-hot categories are
# not learned from the held-out rows.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(train_df[feature_columns])
X_test = preprocessor.transform(test_df[feature_columns])
y_train = train_df[target_variable]
y_test = test_df[target_variable]

# Full-table views, kept for code that still expects X and y.
X = preprocessor.transform(df[feature_columns])
y = df[target_variable]

# Explained variance of the PCA step inside the fitted pipeline.
explained_variance = preprocessor.named_transformers_['num'].named_steps['pca'].explained_variance_ratio_

print("Variância explicada pelos primeiros componentes principais:", sum(explained_variance))
# correlation_with_volume = pca_df.corr()[target_variable].sort_values(ascending=False)
# print(correlation_with_volume)

# Plot individual (bars) and cumulative (step line) explained variance.
plt.figure(figsize=(10, 5))
plt.bar(range(1, len(explained_variance) + 1),
        explained_variance,
        alpha=0.5,
        align='center',
        label='Individual explained variance')
plt.step(range(1, len(explained_variance) + 1),
         np.cumsum(explained_variance),
         where='mid',
         label='Cumulative explained variance')
plt.ylabel('Ratio of explained variance')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Fitted PCA step inside the preprocessing pipeline.
pca_component = preprocessor.named_transformers_['num'].named_steps['pca']
# Loadings: one row per principal component, one column per numeric input.
components_weights = pca_component.components_
weights_df = pd.DataFrame(components_weights, columns=numeric_data_columns)
# Loadings of the third component (row index 2), largest first.
weights_pc = weights_df.iloc[2].sort_values(ascending=False)

# significant_df keeps EVERY ranked column, including target_variable: the
# GAN cells below separate the target themselves with
# drop(columns=[target_variable]), which requires the column to be present.
significant_fields = weights_pc.index.tolist()
significant_df = df[significant_fields]

print(weights_pc)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, AdamW
from torch.nn import BCELoss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Dataset definition
class PricingDataset(Dataset):
    """Torch dataset exposing the rows of a DataFrame as (features, target).

    ``features`` holds every column except ``target_variable`` and
    ``targets`` holds the ``target_variable`` column; both are stored as
    float32 NumPy arrays.
    """

    def __init__(self, dataframe, target_variable):
        feature_frame = dataframe.drop(columns=target_variable)
        self.features = feature_frame.to_numpy().astype(np.float32)
        self.targets = dataframe[target_variable].to_numpy().astype(np.float32)

    def __len__(self):
        # Number of rows.
        return self.features.shape[0]

    def __getitem__(self, idx: int):
        # Fresh float tensors are built on every access.
        feature_row = torch.tensor(self.features[idx], dtype=torch.float)
        target_value = torch.tensor(self.targets[idx], dtype=torch.float)
        return feature_row, target_value


In [None]:
# Generator
class Generator(nn.Module):
    """Fully connected generator: noise vector -> synthetic feature row.

    Args:
        input_dim: Width of the noise vector fed to the first layer.
        output_dim: Width of each generated sample.

    The network is three Linear + BatchNorm1d + LeakyReLU(0.2) stages
    (256, 512, 1024 units) followed by a Linear layer and a ReLU, so every
    generated value is >= 0.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.model = nn.Sequential(
            # The first layer is sized from the constructor argument rather
            # than from a module-level variable, so the instance always
            # accepts exactly the input width it was built with.
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(0.2),
            nn.Linear(1024, output_dim),
            nn.ReLU()  # clamps outputs to be non-negative
        )

    def forward(self, x):
        """Map a batch of noise vectors (batch, input_dim) to samples."""
        return self.model(x)

In [None]:
# Discriminator
class Discriminator(nn.Module):
    """Fully connected discriminator producing one probability per sample.

    Two hidden stages (512 and 256 units) of Linear + LeakyReLU(0.2) +
    Dropout(0.3), followed by a single-unit Linear layer and a Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim

        layers = []
        in_width = input_dim
        for out_width in (512, 256):
            layers += [nn.Linear(in_width, out_width), nn.LeakyReLU(0.2), nn.Dropout(0.3)]
            in_width = out_width
        layers += [nn.Linear(in_width, 1), nn.Sigmoid()]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid score for each row of ``x``."""
        return self.model(x)

In [None]:
# GAN training
def train_gan(data_loader,
              generator,
              discriminator,
              g_optimizer,
              d_optimizer,
              criterion,
              epochs,
              noise_dim,
              device,
              d_scheduler=None,):
    """Train ``generator`` against ``discriminator`` for ``epochs`` epochs.

    Args:
        data_loader: Iterable of ``(features, target)`` batches; only the
            features are used.
        generator: Module mapping ``(batch, noise_dim)`` noise to samples.
        discriminator: Module mapping samples to ``(batch, 1)`` scores.
        g_optimizer: Optimizer over the generator's parameters.
        d_optimizer: Optimizer over the discriminator's parameters.
        criterion: Loss applied to discriminator scores vs. 0/1 labels.
        epochs: Number of passes over ``data_loader``.
        noise_dim: Width of the noise vectors.
        device: Device on which batches, labels and noise are placed.
        d_scheduler: Optional learning-rate scheduler for ``d_optimizer``.
            When omitted, a module-level ``d_scheduler`` is used if one
            exists; otherwise the learning rate is left unchanged.

    After each epoch the losses of the last batch are printed. Returns
    ``None``.
    """
    if d_scheduler is None:
        # Backward compatibility with callers that define the scheduler as a
        # module-level variable instead of passing it in.
        d_scheduler = globals().get('d_scheduler')

    for epoch in range(epochs):
        g_loss = d_loss = None

        for real_features, _ in data_loader:
            real_features = real_features.to(device)
            batch_size = real_features.size(0)
            real_labels = torch.ones(batch_size, 1, device=device)
            fake_labels = torch.zeros(batch_size, 1, device=device)

            # --- Discriminator step: real rows -> 1, generated rows -> 0.
            d_optimizer.zero_grad()
            real_predictions = discriminator(real_features)
            d_loss_real = criterion(real_predictions, real_labels)

            noise = torch.randn(batch_size, noise_dim, device=device)
            fake_features = generator(noise)
            # detach(): this step must not back-propagate into the generator.
            fake_predictions = discriminator(fake_features.detach())
            d_loss_fake = criterion(fake_predictions, fake_labels)

            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            d_optimizer.step()

            # --- Generator step: try to get generated rows scored as real.
            g_optimizer.zero_grad()
            fake_predictions = discriminator(fake_features)
            g_loss = criterion(fake_predictions, real_labels)
            g_loss.backward()
            g_optimizer.step()

        if g_loss is None:
            # Empty loader: nothing was trained, so nothing to decay or report.
            continue

        # Decay the discriminator learning rate once per EPOCH. Stepping an
        # exponential schedule on every batch shrinks the rate by gamma
        # hundreds of times per epoch and effectively freezes the
        # discriminator almost immediately.
        if d_scheduler is not None:
            d_scheduler.step()

        print(f'Epoch {epoch+1}/{epochs}, G Loss: {g_loss.item():.4f}, D Loss: {d_loss.item():.4f}')


In [None]:
# Configuration
# Number of feature columns: everything in significant_df except the target.
input_dim = significant_df.drop(columns=[target_variable]).shape[1]
# Generated samples and noise vectors use that same width.
output_dim = input_dim
noise_dim = input_dim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = PricingDataset(dataframe=significant_df, target_variable=target_variable)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Generator and discriminator
gen = Generator(input_dim=input_dim, output_dim=output_dim).to(device)
disc = Discriminator(input_dim=output_dim).to(device)

# Hyperparameters
learning_rate_g = 0.0002  # generator learning rate
learning_rate_d = 0.0002  # discriminator learning rate
weight_decay = 0.001  # decoupled weight-decay coefficient used by AdamW

# Optimizers. Each optimizer is created exactly once; AdamW is the one that
# is actually used for training.
g_optimizer = AdamW(gen.parameters(), lr=learning_rate_g, weight_decay=weight_decay)
d_optimizer = AdamW(disc.parameters(), lr=learning_rate_d, weight_decay=weight_decay)

# Exponential learning-rate decay for the discriminator (x0.95 per step).
d_scheduler = torch.optim.lr_scheduler.ExponentialLR(d_optimizer, gamma=0.95)

criterion = BCELoss()

In [None]:
def validate_model(generator,
                   discriminator,
                   data_loader,
                   criterion,
                   device=None,
                   noise_dim=None):
    """Compute average discriminator and generator losses over a loader.

    Both networks are switched to eval mode for the duration of the call and
    back to train mode before returning; no gradients are computed.

    Args:
        generator: Module mapping ``(batch, noise_dim)`` noise to samples.
        discriminator: Module mapping samples to ``(batch, 1)`` scores.
        data_loader: Iterable of ``(features, target)`` batches; only the
            features are used.
        criterion: Loss applied to discriminator scores vs. 0/1 labels.
        device: Device for batches, labels and noise. Defaults to the device
            of the generator's first parameter (CPU if it has none).
        noise_dim: Width of the noise vectors. Defaults to
            ``generator.input_dim`` and, failing that, to a module-level
            ``input_dim``.

    Returns:
        ``(avg_d_loss, avg_g_loss)``, each averaged over the batches.

    Raises:
        ValueError: If the noise width cannot be determined or the loader
            yields no batches.
    """
    if device is None:
        first_param = next(generator.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    if noise_dim is None:
        noise_dim = getattr(generator, 'input_dim', None)
    if noise_dim is None:
        # Backward compatibility with a module-level input_dim variable.
        noise_dim = globals().get('input_dim')
    if noise_dim is None:
        raise ValueError("noise_dim was not given and could not be inferred from the generator")

    generator.eval()
    discriminator.eval()
    total_d_loss = 0.0
    total_g_loss = 0.0
    n_batches = 0
    try:
        with torch.no_grad():
            for features, _ in data_loader:
                # The batch must live on the same device as the networks.
                features = features.to(device)
                batch_size = features.size(0)
                real_labels = torch.ones(batch_size, 1, device=device)
                fake_labels = torch.zeros(batch_size, 1, device=device)

                noise = torch.randn(batch_size, noise_dim, device=device)
                fake_predictions = discriminator(generator(noise))

                # Discriminator: real rows should score 1, generated rows 0.
                d_loss = (criterion(discriminator(features), real_labels)
                          + criterion(fake_predictions, fake_labels))
                # Generator: generated rows should be scored as real.
                g_loss = criterion(fake_predictions, real_labels)

                total_d_loss += d_loss.item()
                total_g_loss += g_loss.item()
                n_batches += 1
    finally:
        # Always hand the networks back in training mode.
        generator.train()
        discriminator.train()

    if n_batches == 0:
        raise ValueError("data_loader yielded no batches; nothing to validate")

    return total_d_loss / n_batches, total_g_loss / n_batches

In [None]:
# Run the adversarial training loop with the objects configured above.
train_gan(
    data_loader=data_loader,
    generator=gen,
    discriminator=disc,
    g_optimizer=g_optimizer,
    d_optimizer=d_optimizer,
    criterion=criterion,
    epochs=epochs,
    noise_dim=noise_dim,
    device=device,
)

In [None]:
# Report the layer widths implied by the prepared dataset.
feature_count = significant_df.drop(columns=[target_variable]).shape[1]
print("Dimensão de entrada para o Discriminador:", feature_count)
print("Dimensão de saída do Gerador:", significant_df.shape[1] - 1)

In [None]:
# Evaluate the trained networks. The loader passed here is the same
# data_loader that was used for training.
validation_losses = validate_model(
    generator=gen,
    discriminator=disc,
    data_loader=data_loader,
    criterion=criterion,
)
avg_d_loss, avg_g_loss = validation_losses
print("Perda média do Discriminador na Validação:", avg_d_loss)
print("Perda média do Gerador na Validação:", avg_g_loss)

In [None]:
def plot_prices(real_data, generated_data):
    """Show side-by-side density histograms of real and generated values.

    Both inputs must expose ``ndim`` and ``flatten()`` (e.g. NumPy arrays);
    inputs with more than one dimension are flattened before plotting.
    """
    plt.figure(figsize=(10, 5))

    # Collapse multi-dimensional inputs into a single 1-D series each.
    real_values, generated_values = [
        values.flatten() if values.ndim > 1 else values
        for values in (real_data, generated_data)
    ]

    panels = (
        (1, real_values, 'g', 'Distribuição de Preços Reais'),
        (2, generated_values, 'r', 'Distribuição de Preços Gerados'),
    )
    for position, values, colour, title in panels:
        plt.subplot(1, 2, position)
        plt.hist(values, bins=50, density=True, alpha=0.6, color=colour)
        plt.title(title)
    plt.show()

def validate_and_plot(df, generator, device, noise_dim, num_samples=1000):
    """Plot a random sample of real rows next to freshly generated samples.

    Args:
        df: DataFrame of real (numeric) rows.
        generator: Module mapping ``(batch, noise_dim)`` noise to samples.
        device: Device on which the noise is created.
        noise_dim: Width of the noise vectors.
        num_samples: Number of generated samples, and the maximum number of
            real rows drawn (capped at ``len(df)``).
    """
    # Never ask for more rows than the frame holds; sample() would raise.
    sample_size = min(num_samples, len(df))
    real_data = df.sample(n=sample_size).to_numpy().astype(np.float32)

    # Generate in inference mode: normalisation layers use their running
    # statistics and no autograd graph is built. The previous train/eval mode
    # is restored afterwards.
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            noise = torch.randn(num_samples, noise_dim, device=device)
            generated_data = generator(noise).cpu().numpy()
    finally:
        generator.train(was_training)

    plot_prices(real_data, generated_data)

# Compare a random sample of significant_df with samples drawn from gen.
validate_and_plot(significant_df, gen, device, noise_dim)

In [None]:
def save_model(epoch, model, optimizer, path='model_checkpoint'):
    """Write a training checkpoint to ``<path>/model_epoch_<epoch>.pth``.

    The checkpoint is a dict with the keys ``'epoch'``,
    ``'model_state_dict'`` and ``'optimizer_state_dict'``. ``path`` is
    created (including parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of testing for the
    # directory first.
    os.makedirs(path, exist_ok=True)
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, os.path.join(path, f'model_epoch_{epoch}.pth'))

In [None]:
def plot_prices(real_data, generated_data):
    """Show side-by-side density histograms of real and generated values.

    Each input may be a torch tensor (on any device, with or without
    gradient tracking) or anything NumPy can convert to an array. Inputs are
    flattened to one dimension before plotting.
    """
    def _as_flat_array(values):
        # Tensors are detached and moved to host memory before conversion.
        if isinstance(values, torch.Tensor):
            values = values.detach().cpu().numpy()
        return np.asarray(values).ravel()

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.hist(_as_flat_array(real_data), bins=50, density=True, alpha=0.6, color='g')
    plt.title('Distribuição de Preços Reais')
    plt.subplot(1, 2, 2)
    plt.hist(_as_flat_array(generated_data), bins=50, density=True, alpha=0.6, color='r')
    plt.title('Distribuição de Preços Gerados')
    plt.show()

In [None]:
def add_generated_prices(df, generator, device, batch_size=32):
    """Return a copy of ``df`` with a ``'generated_price'`` column.

    Every row of ``df`` (cast to float32) is fed to ``generator`` in
    batches of ``batch_size``; the generator must return exactly one value
    per row. ``df`` itself is left untouched, and the generator's
    train/eval mode is restored before returning.

    Raises:
        ValueError: If the generator does not return one value per row.
    """
    features = torch.tensor(df.values.astype(np.float32), device=device)

    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            outputs = [generator(batch) for batch in features.split(batch_size)]
    finally:
        generator.train(was_training)

    if outputs:
        generated = torch.cat(outputs).cpu().numpy().reshape(-1)
    else:
        generated = np.empty(0, dtype=np.float32)

    if generated.size != len(df):
        raise ValueError(
            f"generator returned {generated.size} values for {len(df)} rows; "
            "it must produce exactly one value per row"
        )

    # Work on a copy so the caller's frame is not modified.
    result = df.copy()
    result['generated_price'] = generated
    return result

# Input features: significant_df without the target column (errors='ignore'
# makes the drop a no-op when the target is already absent).
input_features = significant_df.drop(columns=[target_variable], errors='ignore')  # drop the target column, just in case

# Number of input columns.
# NOTE(review): this is computed before 'generated_price' is filtered out
# below, so it would be one too large if that column were present.
input_dim = input_features.shape[1]
print("Dimensão de entrada confirmada:", input_dim)

# Build a new Generator that emits a single value per row.
# NOTE(review): this instance is freshly initialised and is not trained
# anywhere in this cell; the trained `gen` is not used here. Confirm that
# generating prices from an untrained network is intended.
generator = Generator(input_dim=input_dim, output_dim=1).to(device)  # adjust output_dim as needed

# 'generated_price' must not be fed back in as an input feature.
if 'generated_price' in input_features.columns:
    input_features_adjusted = input_features.drop(columns=['generated_price'])
else:
    input_features_adjusted = input_features

# Shape of the frame that will actually be fed to the generator.
print("Dimensões corrigidas:", input_features_adjusted.shape)

# Feed every feature row through the generator and attach the outputs as
# the 'generated_price' column.
df_with_prices = add_generated_prices(input_features_adjusted, generator, device)
display(df_with_prices.head().T)


In [None]:
# Discrepancy between the generated price and the minimum acceptable price
# (replenishment cost plus minimum margin).
price_floor = df_with_prices['replenishment_cost'] + df_with_prices['minimum_margin']
df_with_prices['price_discrepancy'] = df_with_prices['generated_price'] - price_floor

# Rank rows by the magnitude of the discrepancy, largest first.
df_with_prices['abs_price_discrepancy'] = df_with_prices['price_discrepancy'].abs()
worst_cases = df_with_prices.sort_values(by='abs_price_discrepancy', ascending=False)

# Show the five rows with the largest discrepancy.
report_columns = [
    'replenishment_cost',
    'minimum_margin',
    'generated_price',
    'price_discrepancy',
    'abs_price_discrepancy',
]
display(worst_cases[report_columns].head())
