In [1]:
pip install rdt



In [2]:
from collections import namedtuple

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.mixture import BayesianGaussianMixture

from rdt.transformers import OneHotEncodingTransformer

import torch
from torch.nn import Linear, Module, Parameter, ReLU, Sequential
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

In [3]:
def read_csv(csv_filename, has_header=True, discrete_cols=None):
    """Load a CSV file and parse the comma-separated list of discrete columns.

    Args:
        csv_filename: Path or file-like object handed to ``pandas.read_csv``.
        has_header (bool): When True the header row is inferred; otherwise the
            file is read without a header.
        discrete_cols (str or None): Comma-separated column identifiers, e.g.
            ``"0,1,5"``. Falsy values mean "no discrete columns".

    Returns:
        tuple: ``(data, discrete_columns)`` where ``data`` is the loaded
        DataFrame and ``discrete_columns`` is a list. The entries stay strings
        when the file has a header and are converted to ``int`` otherwise.
    """
    header_mode = 'infer' if has_header else None
    frame = pd.read_csv(csv_filename, header=header_mode)

    # Nothing requested: every column is treated as non-discrete.
    if not discrete_cols:
        return frame, []

    tokens = discrete_cols.split(',')
    if has_header:
        return frame, tokens

    return frame, [int(token) for token in tokens]

In [4]:

# One contiguous span of the transformed output: its width (`dim`) and the name
# of the activation associated with it ('tanh' or 'softmax' in this notebook).
AttrMetaDetails = namedtuple("AttrMetaDetails", ("dim", "activation_fn"))

# Everything recorded about one fitted column: its name and kind, the fitted
# transformer object, optional auxiliary data, the list of output spans and the
# total number of output dimensions the column occupies.
AttrTransformDetails = namedtuple(
    "AttrTransformDetails",
    (
        "column_name",
        "column_type",
        "transformer",
        "transform_aux",
        "output_info",
        "output_dimensions",
    ),
)


class Transformer:
    """
    Reversible tabular-data transformer; each column is transformed independently.

    Discrete columns: one-hot encoded.
    Continuous columns: modelled with a Bayesian Gaussian mixture. Each value is
        represented as a one-hot vector indicating the selected mode (beta_i,j)
        and a scalar (alpha_i,j) giving the value's normalised position within
        that mode.
    """

    def __init__(self, n_modes=10, weightage=0.005):
        """Create a data transformer.

        Args:
            n_modes (int):
                Maximum number of Gaussian distributions in Bayesian GMM.
            weightage (float):
                Minimum mixture weight a Gaussian component needs to be kept.
        """
        self._n_modes = n_modes
        self._weightage = weightage
        self._attr_dtypes = []
        self._attr_transform_info = []
        # Populated by `fit`; initialised here so the attributes always exist.
        self.output_info_list = []
        self.output_dimensions = 0
        self.is_dataframe = True

    def fit_discrete(self, column_name, data):
        """Fit a one-hot encoder on a discrete column.

        Args:
            column_name: Name of the column (stored in the returned record).
            data: Raw column values handed to the encoder's ``fit``.

        Returns:
            AttrTransformDetails: record with a single 'softmax' span whose
            width is the number of categories found by the encoder.
        """
        ohe = OneHotEncodingTransformer()
        ohe.fit(data)
        num_categories = len(ohe.dummies)

        return AttrTransformDetails(
            column_name=column_name, column_type="discrete", transformer=ohe,
            transform_aux=None,
            output_info=[AttrMetaDetails(num_categories, 'softmax')],
            output_dimensions=num_categories)

    def transform_discrete(self, attr_transform_details, raw_column_data):
        """One-hot encode a discrete column with its fitted encoder.

        Returns:
            list: single-element list holding the encoder output, so it can be
            concatenated with the pieces produced for other columns.
        """
        ohe = attr_transform_details.transformer
        return [ohe.transform(raw_column_data)]

    def fit_continuous(self, column_name, data):
        """Fit a Bayesian Gaussian mixture on a continuous column.

        Args:
            column_name: Name of the column (stored in the returned record).
            data (numpy.ndarray): Raw column values; reshaped to ``(-1, 1)``.

        Returns:
            AttrTransformDetails: record with a 1-wide 'tanh' span (the
            normalised value) followed by a 'softmax' span with one entry per
            retained mixture component. ``transform_aux`` is the boolean mask of
            retained components.
        """
        # `n_components` is passed by keyword: recent scikit-learn releases
        # reject positional constructor arguments for this estimator.
        gm = BayesianGaussianMixture(
            n_components=self._n_modes,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,
            n_init=2
        )

        gm.fit(data.reshape(-1, 1))

        # Keep only the components whose mixture weight exceeds the threshold.
        valid_component_indicator = gm.weights_ > self._weightage
        num_components = int(valid_component_indicator.sum())

        return AttrTransformDetails(
            column_name=column_name, column_type="continuous", transformer=gm,
            transform_aux=valid_component_indicator,
            output_info=[AttrMetaDetails(1, 'tanh'), AttrMetaDetails(num_components, 'softmax')],
            output_dimensions=1 + num_components)

    def transform_continuous(self, attr_transform_details, data):
        """Encode a continuous column as (normalised value, one-hot mode).

        Args:
            attr_transform_details: Record produced by `fit_continuous`.
            data (numpy.ndarray): Column values with shape ``(n_rows, 1)``.

        Returns:
            list: ``[selected_normalized_value, sel_comp_one_hot_encoded]`` with
            shapes ``(n_rows, 1)`` and ``(n_rows, n_valid_components)``.
        """
        gmm = attr_transform_details.transformer

        valid_component_indicator = attr_transform_details.transform_aux
        n_components = valid_component_indicator.sum()

        # Per-component mean and standard deviation, laid out as one row so
        # they broadcast against the (n_rows, 1) column.
        means = gmm.means_.reshape((1, self._n_modes))
        stds = np.sqrt(gmm.covariances_).reshape((1, self._n_modes))

        # alpha_i,j = (c_i,j - eta) / (4 * phi), for every row and valid Gaussian.
        normalized_values = ((data - means) / (4 * stds))[:, valid_component_indicator]

        # Posterior probability of each valid component for every row.
        component_probs = gmm.predict_proba(data)[:, valid_component_indicator]

        # beta: sample one component per row from the (renormalised) posterior.
        selected_component = np.zeros(len(data), dtype='int')

        for i in range(len(data)):
            # Small offset avoids an all-zero probability vector.
            component_prob_t = component_probs[i] + 1e-6
            component_prob_t = component_prob_t / component_prob_t.sum()
            selected_component[i] = np.random.choice(
                np.arange(n_components), p=component_prob_t)

        selected_normalized_value = normalized_values[
            np.arange(len(data)), selected_component].reshape([-1, 1])

        # Keep the scalar strictly inside the range of tanh.
        selected_normalized_value = np.clip(selected_normalized_value, -.99, .99)

        sel_comp_one_hot_encoded = np.zeros_like(component_probs)
        sel_comp_one_hot_encoded[np.arange(len(data)), selected_component] = 1

        return [selected_normalized_value, sel_comp_one_hot_encoded]

    def fit(self, raw_data, discrete_columns=tuple()):
        """
        Fit a GMM for every continuous column and a one-hot encoder for every
        discrete column.

        Args:
            raw_data (pandas.DataFrame or array-like): Training data. Anything
                that is not a DataFrame is wrapped in one.
            discrete_columns (iterable): Positional indices of the discrete
                columns; each entry is passed through ``int()`` and looked up in
                ``raw_data.columns``.
        """
        self.output_info_list = []
        self.output_dimensions = 0

        # Remember the input kind so `inverse_transform` can return the same.
        if not isinstance(raw_data, pd.DataFrame):
            self.is_dataframe = False
            raw_data = pd.DataFrame(raw_data)
        else:
            self.is_dataframe = True

        self._attr_dtypes = raw_data.infer_objects().dtypes

        self._attr_transform_info = []

        # Translate positional indices into column labels.
        discrete_columns = [raw_data.columns[int(i)] for i in discrete_columns]
        for column_name in raw_data.columns:
            print("Processing column: {}".format(column_name))
            raw_column_data = raw_data[column_name].values
            if column_name in discrete_columns:
                _transform_info = self.fit_discrete(
                    column_name, raw_column_data)
            else:
                _transform_info = self.fit_continuous(
                    column_name, raw_column_data)

            self.output_info_list.append(_transform_info.output_info)
            self.output_dimensions += _transform_info.output_dimensions
            self._attr_transform_info.append(_transform_info)

    def transform(self, raw_data):
        """Transform raw data into the concatenated numeric representation.

        Args:
            raw_data (pandas.DataFrame or array-like): Data with the columns
                seen during `fit`.

        Returns:
            numpy.ndarray: float matrix with the per-column encodings
            concatenated along axis 1, in fit order.
        """
        if not isinstance(raw_data, pd.DataFrame):
            raw_data = pd.DataFrame(raw_data)

        column_data_list = []
        for attr_transform_info in self._attr_transform_info:
            # Double brackets keep the column two-dimensional: (n_rows, 1).
            column_data = raw_data[[attr_transform_info.column_name]].values
            if attr_transform_info.column_type == "continuous":
                column_data_list += self.transform_continuous(attr_transform_info, column_data)
            else:
                assert attr_transform_info.column_type == "discrete"
                column_data_list += self.transform_discrete(
                    attr_transform_info, column_data)

        return np.concatenate(column_data_list, axis=1).astype(float)

    def inverse_transform_continuous(self, attr_transform_info, attr_data, sigmas, st):
        """Map (normalised value, mode scores) back to the original scale.

        Args:
            attr_transform_info: Record produced by `fit_continuous`.
            attr_data (numpy.ndarray): Slice of the model output for this
                column; column 0 is the scalar, the rest are component scores.
            sigmas (numpy.ndarray or None): Per-output standard deviations.
                When given, Gaussian noise with ``sigmas[st]`` is added to the
                scalar.
            st (int): Offset of this column's first output dimension.

        Returns:
            numpy.ndarray: 1-D array of recovered column values.
        """
        gm = attr_transform_info.transformer
        valid_component_indicator = attr_transform_info.transform_aux

        sel_normalized_value = attr_data[:, 0]
        sel_component_probs = attr_data[:, 1:]

        if sigmas is not None:
            sig = sigmas[st]
            sel_normalized_value = np.random.normal(sel_normalized_value, sig)

        sel_normalized_value = np.clip(sel_normalized_value, -1, 1)

        # Components dropped at fit time get a very low score so that argmax
        # can only pick a retained component.
        comp_probs = np.ones((len(attr_data), self._n_modes)) * -100
        comp_probs[:, valid_component_indicator] = sel_component_probs

        means = gm.means_.reshape([-1])
        stds = np.sqrt(gm.covariances_).reshape([-1])
        selected_component = np.argmax(comp_probs, axis=1)

        std_t = stds[selected_component]
        mean_t = means[selected_component]

        # Inverse of the normalisation applied in `transform_continuous`.
        column = sel_normalized_value * 4 * std_t + mean_t

        return column

    def inverse_transform_discrete(self, attr_transform_info, attr_data):
        """Decode a one-hot (or score) block back into category values."""
        ohe = attr_transform_info.transformer
        return ohe.reverse_transform(attr_data)

    def inverse_transform(self, data, sigmas=None):
        """
        Take matrix data and output raw data.

        Args:
            data (numpy.ndarray): Matrix laid out like the output of `transform`.
            sigmas (numpy.ndarray or None): Optional per-output standard
                deviations forwarded to `inverse_transform_continuous`.

        Returns:
            pandas.DataFrame when `fit` received a DataFrame, otherwise a numpy
            array. Columns are cast back to the dtypes recorded during `fit`.
        """
        st = 0
        attr_data_list = []
        column_names = []
        for attr_transform_info in self._attr_transform_info:
            dim = attr_transform_info.output_dimensions
            column_data = data[:, st:st + dim]

            if attr_transform_info.column_type == 'continuous':
                attr_data = self.inverse_transform_continuous(
                    attr_transform_info, column_data, sigmas, st)
            else:
                assert attr_transform_info.column_type == 'discrete'
                attr_data = self.inverse_transform_discrete(
                    attr_transform_info, column_data)

            attr_data_list.append(attr_data)
            column_names.append(attr_transform_info.column_name)
            st += dim

        _data = np.column_stack(attr_data_list)
        _data = (pd.DataFrame(_data, columns=column_names).astype(self._attr_dtypes))
        if not self.is_dataframe:
            _data = _data.values

        return _data

    def convert_attr_name_val_to_id(self, attr_name, value):
        """Locate a discrete column and the one-hot position of one of its values.

        Args:
            attr_name: Name of a discrete column seen during `fit`.
            value: A category value of that column.

        Returns:
            dict: ``discrete_col_id`` (index among discrete columns only),
            ``column_id`` (index among all columns) and ``value_id`` (position
            of the value in the one-hot encoding).

        Raises:
            ValueError: If the column does not exist, is not discrete, or the
                value does not occur in the column.
        """
        discrete_counter = 0
        column_id = 0
        for attr_transform_info in self._attr_transform_info:
            if attr_transform_info.column_name == attr_name:
                break
            if attr_transform_info.column_type == "discrete":
                discrete_counter += 1

            column_id += 1

        else:
            # Loop finished without `break`: no column with that name.
            raise ValueError(f"The column_name `{attr_name}` does not exist in the data.")

        if attr_transform_info.column_type != "discrete":
            raise ValueError(f"The column `{attr_name}` is not a discrete column.")

        # The fitted encoder lives in the `transformer` field of the record.
        one_hot = attr_transform_info.transformer.transform(np.array([value]))[0]
        if sum(one_hot) == 0:
            raise ValueError(f"The value `{value}` does not exist in the column `{attr_name}`.")

        return {
            "discrete_col_id": discrete_counter,
            "column_id": column_id,
            "value_id": np.argmax(one_hot)
        }


In [5]:
class Base:
    """
    Persistence helpers for models that expose a ``_device`` attribute and a
    ``set_device(device)`` method.
    """

    def save(self, path):
        """Pickle the whole model object to ``path`` with ``torch.save``.

        The model is moved to the CPU for serialisation and moved back to its
        original device afterwards, even when saving fails.

        Args:
            path: Destination passed to ``torch.save``.
        """
        device_backup = self._device
        self.set_device(torch.device("cpu"))
        try:
            torch.save(self, path)
        finally:
            # Restore the working device even if serialisation raised.
            self.set_device(device_backup)

    @classmethod
    def load(cls, path):
        """Load a model written by `save` and move it to the best device.

        SECURITY: the file is a pickled Python object, so unpickling executes
        arbitrary code. Only load files you trust.

        Args:
            path: Location of a file produced by `save`.

        Returns:
            The unpickled model, moved to ``cuda:0`` when CUDA is available and
            to the CPU otherwise.
        """
        import inspect

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Deserialise onto the CPU first so the file loads on machines without
        # a GPU; `set_device` below performs the actual placement.
        load_kwargs = {"map_location": torch.device("cpu")}

        # `save` pickles the full object, so weights-only loading (the default
        # in recent PyTorch) cannot restore it. Only pass the flag when this
        # PyTorch version knows it.
        if "weights_only" in inspect.signature(torch.load).parameters:
            load_kwargs["weights_only"] = False

        model = torch.load(path, **load_kwargs)
        model.set_device(device)
        return model

In [6]:
class Decoder(Module):
    """Fully connected decoder: embedding -> hidden layers -> data space.

    Besides the reconstruction it exposes ``sigma``, a learnable vector with one
    entry per output dimension, initialised to 0.1.
    """

    def __init__(self, embedding_dim, decompress_dims, data_dim):
        """
        Args:
            embedding_dim (int): Size of the latent input.
            decompress_dims (iterable of int): Hidden layer widths, in order.
            data_dim (int): Size of the reconstructed output.
        """
        super().__init__()
        widths = [embedding_dim, *decompress_dims]

        # Linear + ReLU for every consecutive pair of widths.
        layers = []
        for in_features, out_features in zip(widths, widths[1:]):
            layers.extend((Linear(in_features, out_features), ReLU()))

        # Final projection to the data space has no activation.
        layers.append(Linear(widths[-1], data_dim))

        self.seq = Sequential(*layers)
        self.sigma = Parameter(torch.ones(data_dim) * 0.1)

    def forward(self, input):
        """Return ``(reconstruction, sigma)`` for a batch of embeddings."""
        reconstruction = self.seq(input)
        return reconstruction, self.sigma

class Encoder(Module):
    """Fully connected encoder producing the parameters of the latent Gaussian."""

    def __init__(self, data_dim, compress_dims, embedding_dim):
        """
        Args:
            data_dim (int): Size of the input rows.
            compress_dims (iterable of int): Hidden layer widths, in order.
            embedding_dim (int): Size of the latent space.
        """
        super().__init__()
        widths = [data_dim, *compress_dims]

        # Linear + ReLU for every consecutive pair of widths.
        layers = []
        for in_features, out_features in zip(widths, widths[1:]):
            layers.extend((Linear(in_features, out_features), ReLU()))

        self.seq = Sequential(*layers)
        # Two heads on the shared features: fc1 -> mean, fc2 -> log-variance.
        self.fc1 = Linear(widths[-1], embedding_dim)
        self.fc2 = Linear(widths[-1], embedding_dim)

    def forward(self, input):
        """Return ``(mu, std, logvar)`` where ``std = exp(0.5 * logvar)``."""
        hidden = self.seq(input)
        mu = self.fc1(hidden)
        logvar = self.fc2(hidden)
        return mu, (0.5 * logvar).exp(), logvar



In [7]:
class VAE(Base):
    """
    Variational autoencoder for tabular data.

    `fit` encodes the table with `Transformer`, then trains an `Encoder` /
    `Decoder` pair; `sample` decodes standard-normal noise and maps it back to
    the original column space.
    """

    def __init__(
        self,
        embedding_dim=128,
        compress_dims=(128, 128),
        decompress_dims=(128, 128),
        l2scale=1e-5,
        batch_size=500,
        epochs=1000,
        loss_factor=2,
        cuda=True
    ):
        """
        Args:
            embedding_dim (int): Size of the latent space.
            compress_dims (tuple of int): Hidden layer widths of the encoder.
            decompress_dims (tuple of int): Hidden layer widths of the decoder.
            l2scale (float): Weight decay passed to Adam.
            batch_size (int): Mini-batch size for training and sampling.
            epochs (int): Number of passes over the training data.
            loss_factor (float): Multiplier applied to the reconstruction loss.
            cuda (bool or str): Falsy selects the CPU, a string is used as the
                device name, any other truthy value selects ``'cuda'``. The CPU
                is always used when CUDA is unavailable.
        """
        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.loss_factor = loss_factor
        self.epochs = epochs

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

    def fit(self, train_data, discrete_columns=tuple()):
        """Fit the data transformer and train the encoder/decoder.

        Args:
            train_data (pandas.DataFrame or array-like): Training table.
            discrete_columns (iterable): Positional indices of the discrete
                columns, forwarded to ``Transformer.fit``.
        """
        self.transformer = Transformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)
        dataset = TensorDataset(torch.from_numpy(train_data.astype('float32')).to(self._device))
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=False)

        data_dim = self.transformer.output_dimensions
        encoder = Encoder(data_dim, self.compress_dims, self.embedding_dim).to(self._device)
        # The decoder uses its own layer widths (`decompress_dims`), not the
        # encoder's.
        self.decoder = Decoder(self.embedding_dim, self.decompress_dims, data_dim).to(self._device)
        optimizerAE = Adam(
            list(encoder.parameters()) + list(self.decoder.parameters()),
            weight_decay=self.l2scale)

        for i in range(self.epochs):
            print("Epoch : {}/{}".format(i,self.epochs))
            for id_, data in enumerate(loader):
                optimizerAE.zero_grad()
                real = data[0].to(self._device)
                mu, std, logvar = encoder(real)
                # Reparameterisation: sample the embedding as mu + eps * std.
                eps = torch.randn_like(std)
                emb = eps * std + mu
                rec, sigmas = self.decoder(emb)
                loss_1, loss_2 = loss_function(
                    rec, real, sigmas, mu, logvar,
                    self.transformer.output_info_list, self.loss_factor
                )
                loss = loss_1 + loss_2
                loss.backward()
                optimizerAE.step()
                # Keep the learned per-output sigma in a sane range.
                self.decoder.sigma.data.clamp_(0.01, 1.0)

    def sample(self, samples):
        """Generate synthetic rows.

        Args:
            samples (int): Number of rows to generate.

        Returns:
            The output of ``Transformer.inverse_transform``: a DataFrame or a
            numpy array, matching the kind of data passed to `fit`.
        """
        self.decoder.eval()

        # Always at least one step; the surplus rows are trimmed below.
        steps = samples // self.batch_size + 1
        data = []
        # Sampling needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for _ in range(steps):
                mean = torch.zeros(self.batch_size, self.embedding_dim)
                std = mean + 1
                noise = torch.normal(mean=mean, std=std).to(self._device)
                fake, sigmas = self.decoder(noise)
                fake = torch.tanh(fake)
                data.append(fake.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:samples]
        return self.transformer.inverse_transform(data, sigmas.detach().cpu().numpy())

    def set_device(self, device):
        """Record ``device`` and move the decoder to it, if one exists yet."""
        self._device = device
        # The decoder is only created by `fit`; an unfitted model has none.
        decoder = getattr(self, "decoder", None)
        if decoder is not None:
            decoder.to(self._device)


def loss_function(recon_x, x, sigmas, mu, logvar, output_info, factor):
    """Compute the two VAE loss terms, each averaged over the batch.

    Args:
        recon_x (torch.Tensor): Decoder output, shape ``(batch, data_dim)``.
        x (torch.Tensor): Transformed input rows, same shape as ``recon_x``.
        sigmas (torch.Tensor): Per-output standard deviations from the decoder.
        mu (torch.Tensor): Latent means.
        logvar (torch.Tensor): Latent log-variances.
        output_info (list): One list of spans per column; every span has
            ``dim`` and ``activation_fn`` attributes.
        factor (float): Multiplier applied to the reconstruction term.

    Returns:
        tuple: ``(reconstruction_loss, kl_divergence)``.
    """
    batch_size = x.size()[0]
    terms = []
    start = 0
    for column_info in output_info:
        for span_info in column_info:
            end = start + span_info.dim
            if span_info.activation_fn == "softmax":
                # Categorical span: cross entropy against the one-hot target.
                target = torch.argmax(x[:, start:end], dim=-1)
                terms.append(cross_entropy(recon_x[:, start:end], target, reduction='sum'))
            else:
                # Scalar span: Gaussian negative log-likelihood with the
                # learned standard deviation of this output.
                std = sigmas[start]
                residual = x[:, start] - torch.tanh(recon_x[:, start])
                terms.append((residual ** 2 / 2 / (std ** 2)).sum())
                terms.append(torch.log(std) * batch_size)
            start = end

    # Every output dimension must have been consumed by exactly one span.
    assert start == recon_x.size()[1]
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return sum(terms) * factor / batch_size, kl_divergence / batch_size


In [9]:
# Load the taxi table; the columns at positions 0, 1, 2, 5, 6 and 7 are discrete.
data, discrete_columns = read_csv(
    'taxi.csv',
    has_header=True,
    discrete_cols='0,1,2,5,6,7',
)

In [10]:
# Restrict training to a subset of the rows.
# NOTE(review): `.loc` slicing is label-based and inclusive on both ends, so this
# keeps index labels 1..60000 and drops label 0. Confirm that skipping the first
# row is intended; `data.iloc[:60000]` would take the first 60000 rows instead.
data = data.loc[1:60000,:]

In [14]:
# All other hyper-parameters keep the defaults declared by VAE.__init__.
model = VAE(epochs=3000)


In [None]:
# Fits the per-column transformer, then trains the encoder/decoder.
model.fit(data, discrete_columns=tuple(discrete_columns))

Processing column: cat_0
Processing column: cat_1
Processing column: cat_2
Processing column: num_3




Processing column: num_4




Processing column: cat_5
Processing column: cat_6
Processing column: target


In [18]:
# Pickle the whole fitted model object (moved to the CPU while saving) to disk.
model.save('w3.pt')

In [None]:
from google.colab import files

In [None]:
# Download the checkpoint written by `model.save('w3.pt')`; the previous
# argument 'w' does not match any file this notebook creates.
files.download('w3.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>