In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchvision import datasets, transforms

In [None]:
class EncoderDecoder(nn.Module):
    """Autoencoder made of two linear layers.

    ``fc1`` maps ``n_features`` inputs to ``size_embedding`` values and
    ``fc2`` maps those values back to ``n_features`` outputs.
    """

    def __init__(self, n_features, size_embedding):
        """Create the encoder (``fc1``) and decoder (``fc2``) layers.

        Args:
            n_features: Size of the last dimension of the input and of the
                reconstruction.
            size_embedding: Size of the intermediate embedding.
        """
        super().__init__()
        self.fc1 = nn.Linear(n_features, size_embedding)
        self.fc2 = nn.Linear(size_embedding, n_features)

    def forward(self, x, should_output_embedding=False):
        """Reconstruct ``x`` or return its embedding.

        Args:
            x: Input tensor whose last dimension is ``n_features``.
            should_output_embedding: When true, return the output of ``fc1``
                (taken before the ReLU) instead of the reconstruction.

        Returns:
            The raw ``fc1`` output if ``should_output_embedding`` is true,
            otherwise ``fc2(relu(fc1(x)))``.
        """
        embedding = self.fc1(x)
        if should_output_embedding:
            # The decoder output would be discarded, so skip computing it.
            return embedding
        return self.fc2(F.relu(embedding))

In [None]:
def train_model(dataloader_train, dataloader_test, size_embedding, optimizer, epoch, device="cpu", model=None):
    """Train a model for one pass over ``dataloader_train`` on reconstruction loss.

    Each batch is used both as input and as target of an MSE loss, and the
    optimizer is stepped once per batch. Progress is printed every 10 batches.

    Args:
        dataloader_train: Iterable of batches; must support ``len()`` and
            expose ``.dataset`` (both are used for the progress line).
        dataloader_test: Unused here; kept for interface compatibility.
        size_embedding: Unused here; kept for interface compatibility.
        optimizer: Optimizer to step; it should have been built over the
            parameters of the model being trained.
        epoch: Epoch number, only used in the progress line.
        device: Device each batch is moved to before the forward pass.
        model: Model to train. When omitted, the module-level variable named
            ``model`` is used, which is what earlier callers relied on.

    Raises:
        NameError: If ``model`` is omitted and no module-level ``model`` exists.
    """
    if model is None:
        # Backward compatibility: fall back to the notebook-global ``model``
        # instead of silently depending on it.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    loss_function = nn.MSELoss()
    model.train()
    for batch_idx, data in enumerate(dataloader_train):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        # Autoencoder objective: the target is the input itself.
        loss = loss_function(output, data)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(dataloader_train.dataset),
                100. * batch_idx / len(dataloader_train), loss.item()))
        

In [None]:
class PartyEmbeddingDataset(Dataset):
    """Dataset that serves rows of a pandas object as float32 tensors."""

    def __init__(self, df):
        """Keep a reference to ``df``; rows are converted on access.

        Args:
            df: Object supporting ``len()`` and ``.iloc`` row access
                (presumably a DataFrame of numeric columns -- verify
                against the caller).
        """
        self.df_data = df

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Return the row at integer position ``idx`` as a float32 tensor."""
        row = self.df_data.iloc[idx]
        # Convert through NumPy instead of a Python list; the explicit
        # float32 matches the dtype the legacy ``torch.Tensor`` constructor
        # produced.
        return torch.tensor(row.to_numpy(dtype="float32"))

In [None]:
# Load the data and split it into train/test
filename = "data/embeddings_meteor_duplicates_eq0.5.csv"
df_data = pd.read_csv(filename, index_col=0)

# Fix the sampling seed so the 80/20 split is the same on every run.
split_seed = 42
df_train = df_data.sample(frac=0.8, random_state=split_seed)
# NOTE(review): drop() removes rows by index label, so this assumes the labels
# read from the first CSV column are unique -- TODO confirm.
df_test = df_data.drop(df_train.index)

# Create Datasets
dataset_train = PartyEmbeddingDataset(df_train)
dataset_test = PartyEmbeddingDataset(df_test)

# Create DataLoaders
dataloader_train = DataLoader(dataset_train, batch_size=1000, shuffle=True, num_workers=0)
dataloader_test = DataLoader(dataset_test, batch_size=4, shuffle=False, num_workers=0)

In [None]:
# Set some parameters
size_embedding = 10
epochs = 10
lr = 0.5
n_features = len(df_train.columns)
device = "cpu"
seed = 42

# Seed torch's global RNG before the model is built so the initial weights
# (and anything drawn from that RNG afterwards) repeat across runs.
torch.manual_seed(seed)

model = EncoderDecoder(n_features, size_embedding).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train: one call to train_model (one pass over dataloader_train) per epoch
for epoch in range(epochs):
    train_model(
        dataloader_train=dataloader_train,
        dataloader_test=dataloader_test,
        size_embedding=size_embedding,
        optimizer=optimizer,
        epoch=epoch,
        device=device,
    )