In [None]:
!pip install nflows

In [90]:
import pandas as pd
import numpy as np
import math

import torch
from torch import nn
from torch import optim
from sklearn.decomposition import PCA
 
from nflows.flows.base import Flow
from nflows.distributions.normal import StandardNormal
from nflows.transforms.base import CompositeTransform
from nflows.transforms.autoregressive import MaskedAffineAutoregressiveTransform
from nflows.transforms.coupling import AffineCouplingTransform
from nflows.transforms.linear import NaiveLinear
from nflows.transforms.permutations import ReversePermutation
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [137]:
data = pd.read_csv("/content/iris_flower.csv")
print(data.head())

   sepal_length  sepal_width  petal_length  petal_width  type
0           5.1          3.5           1.4          0.2     1
1           4.9          3.0           1.4          0.2     1
2           4.7          3.2           1.3          0.2     1
3           4.6          3.1           1.5          0.2     1
4           5.0          3.6           1.4          0.2     1


In [165]:
input_data = data.drop("type", axis=1)
output_data = np.array(data["type"])

In [93]:
MAX_CLASS_INT = 3
MIN_CLASS_INT = 1
N_SAMPLES = len(data)

In [166]:
pca = PCA(n_components = 1)
input_data = pca.fit_transform(input_data)

In [231]:
final_data = []
for i in range(len(output_data)):
  final_data.append([input_data[i][0], output_data[i]])

final_data = np.array(final_data)

In [96]:
num_layers = 7
base_dist = StandardNormal(shape=[2])
num_iter = 10000

Masked Autoregrssive Flow

In [97]:
transforms = []
for _ in range(num_layers):
     transforms.append(MaskedAffineAutoregressiveTransform(features=2, 
                                                            hidden_features=4))

transform = CompositeTransform(transforms)

flow = Flow(transform, base_dist)
optimizer = optim.Adam(flow.parameters())

In [98]:
for i in range(num_iter):
    #x, y = datasets.make_circles(n_samples=300, factor=0.5, noise=0.05)
    x = torch.tensor(final_data, dtype=torch.float32)
    optimizer.zero_grad()
    loss = -flow.log_prob(inputs=x).mean()
    loss.backward()
    optimizer.step()

Autoregressive Flow with Coupling layer

In [99]:
mask_1 = [0,1]
mask_2 = [1,0]
class Net(nn.Module):

    def __init__(self, in_channel, out_channels):
        super().__init__()
        layers = [nn.Linear(in_channel, in_channel), 
                  nn.ReLU(), 
                  nn.Linear(in_channel, in_channel), 
                  nn.ReLU(), 
                  nn.Linear(in_channel, out_channels)]
        self.net = nn.Sequential(*layers)

    def forward(self, inp, context=None):
        return self.net(inp)

def getNet(in_channel, out_channels):
        return Net(in_channel, out_channels)
transforms_cl = []
for _ in range(2):
     transforms_cl.append(AffineCouplingTransform(mask_1, getNet))
     transforms_cl.append(AffineCouplingTransform(mask_2, getNet))

transform_cl = CompositeTransform(transforms_cl)

flow_cl = Flow(transform_cl, base_dist)
optimizer_cl = optim.Adam(flow_cl.parameters())

In [100]:
for i in range(num_iter):
    #x, y = datasets.make_circles(n_samples=300, factor=0.5, noise=0.05)
    x = torch.tensor(final_data, dtype=torch.float32)
    optimizer_cl.zero_grad()
    loss_cl = -flow_cl.log_prob(inputs=x).mean()
    loss_cl.backward()
    optimizer_cl.step()

Linear Flow

In [101]:
transforms_l = []

for _ in range(num_layers):
     transforms_l.append(ReversePermutation(features=2))
     transforms_l.append(NaiveLinear(features=2))

transform_l = CompositeTransform(transforms_l)

flow_l = Flow(transform_l, base_dist)
optimizer_l = optim.Adam(flow_l.parameters())

In [102]:
for i in range(num_iter):
    #x, y = datasets.make_circles(n_samples=300, factor=0.5, noise=0.05)
    x = torch.tensor(final_data, dtype=torch.float32)
    optimizer_l.zero_grad()
    loss_l = -flow_l.log_prob(inputs=x).mean()
    loss_l.backward()
    optimizer_l.step()

In [103]:
def guassianNB_accuracy(flow):
  max_accuracy = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = GaussianNB()
    model_with_sampled_data = GaussianNB()

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy < accuracy:
      max_accuracy = accuracy

  print(max_accuracy)



In [104]:
def logistic_regression_accuracy(flow):  
  max_accuracy_lr = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = LogisticRegression(random_state = 0)
    model_with_sampled_data = LogisticRegression(random_state = 0)

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy_lr < accuracy:
      max_accuracy_lr = accuracy

  print(max_accuracy_lr)

In [105]:
def knn_accuracy(flow):
  max_accuracy_knn = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = KNeighborsClassifier(n_neighbors=7)
    model_with_sampled_data = KNeighborsClassifier(n_neighbors=7)

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy_knn < accuracy:
      max_accuracy_knn = accuracy

  print(max_accuracy_knn)

In [106]:
def random_forest_accuracy(flow):  
  max_accuracy_rf = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = RandomForestClassifier(n_estimators = 100)
    model_with_sampled_data = RandomForestClassifier(n_estimators = 100)

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy_rf < accuracy:
      max_accuracy_rf = accuracy

  print(max_accuracy_rf) #0.3

In [107]:
guassianNB_accuracy(flow)
guassianNB_accuracy(flow_cl)
guassianNB_accuracy(flow_l)

0.8066666666666666
0.9133333333333333
0.76


In [108]:
logistic_regression_accuracy(flow)
logistic_regression_accuracy(flow_cl)
logistic_regression_accuracy(flow_l)

0.7533333333333333
0.8533333333333334
0.7466666666666667


In [109]:
knn_accuracy(flow)
knn_accuracy(flow_cl)
knn_accuracy(flow_l)

0.8733333333333333
0.76
0.78


In [110]:
random_forest_accuracy(flow)
random_forest_accuracy(flow_cl)
random_forest_accuracy(flow_l)

0.7733333333333333
0.68
0.7666666666666667


In [120]:
model_with_real_data = GaussianNB()
model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
original_accuracy = accuracy_score(pred_data_real,final_data[:, 1].reshape(-1, 1))
print(original_accuracy)

0.94


In [123]:
max_accuracy = 0
final_sampled_pred_data = []
final_sample = []
print(N_SAMPLES)
for i in range(100):
  samples = flow.sample(N_SAMPLES)
  samples = samples.detach().numpy()

  y_sample = samples[:, 1]
  for i in range(N_SAMPLES):
    y_sample[i] = math.floor(y_sample[i])
    if y_sample[i] < MIN_CLASS_INT:
      y_sample[i] = MIN_CLASS_INT
    elif y_sample[i] > MAX_CLASS_INT:
      y_sample[i] = MAX_CLASS_INT

  model_with_sampled_data = GaussianNB()

  model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

  pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

  accuracy = accuracy_score(pred_data_real,pred_data_sampled)

  if max_accuracy < accuracy:
    max_accuracy = accuracy
    final_sampled_pred_data = pred_data_sampled
    final_sample = samples

print(max_accuracy)
print(len(final_sample))

150
0.8066666666666666
150


In [168]:
new_data = []  
for i in range(N_SAMPLES):
  if final_sampled_pred_data[i] == pred_data_real[i]:
    new_data.append(final_sample[i])
new_data = np.array(new_data)
new_data = new_data.tolist()
final_data = final_data.tolist()
print(new_data)
print(final_data)

[[3.481318235397339, 3.0], [-3.5504167079925537, 1.0], [-0.554054856300354, 2.0], [0.4395986795425415, 2.0], [-0.954962432384491, 1.0], [0.13766305148601532, 1.0], [-3.235400915145874, 1.0], [0.7352333068847656, 1.0], [-3.791921377182007, 1.0], [-4.416156768798828, 1.0], [1.7827534675598145, 2.0], [2.3531270027160645, 2.0], [-1.5080410242080688, 1.0], [-1.336043357849121, 1.0], [-1.4927600622177124, 1.0], [-0.26046496629714966, 2.0], [-1.4803223609924316, 1.0], [0.5798800587654114, 2.0], [-1.7876781225204468, 1.0], [0.6042580604553223, 2.0], [2.447643995285034, 3.0], [2.2479140758514404, 3.0], [0.728935718536377, 2.0], [-2.1821811199188232, 1.0], [-1.7744203805923462, 1.0], [-1.4611672163009644, 1.0], [1.5959348678588867, 2.0], [0.9119324088096619, 2.0], [0.08456754684448242, 1.0], [-0.9842081665992737, 1.0], [-3.6376523971557617, 1.0], [1.707011342048645, 3.0], [3.1800832748413086, 2.0], [-2.448150396347046, 1.0], [0.5406131744384766, 2.0], [-0.9241087436676025, 2.0], [1.5913238525390

In [None]:
new_data.extend(final_data)
print(len(new_data))
new_data = np.array(new_data)
final_data = np.array(final_data)
np.random.shuffle(final_data)
print(final_data)
model_with_new_data = GaussianNB()
model_with_new_data.fit(new_data[:, 0].reshape(-1, 1), new_data[:, 1].reshape(-1, 1))
pred_data_new = model_with_real_data.predict(new_data[:, 0].reshape(-1, 1))
new_accuracy = accuracy_score(pred_data_new,new_data[:, 1].reshape(-1, 1))
print(new_accuracy)

In [234]:
def data_accuracy(flow, data_final):
  test_set = data_final[:50]
  train_set = data_final[50:]
  model_with_real_data = GaussianNB()
  model_with_real_data.fit(train_set[:, 0].reshape(-1, 1), train_set[:, 1].reshape(-1, 1))
  pred_data_real_2 = model_with_real_data.predict(data_final[:, 0].reshape(-1, 1))
  pred_data_real = model_with_real_data.predict(test_set[:, 0].reshape(-1, 1))
  original_accuracy = accuracy_score(pred_data_real,test_set[:, 1].reshape(-1, 1))

  max_accuracy = 0
  final_sampled_pred_data = []
  final_sample = []
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_sampled_data = GaussianNB()

    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_sampled = model_with_sampled_data.predict(data_final[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real_2,pred_data_sampled)

    if max_accuracy < accuracy:
      max_accuracy = accuracy
      final_sampled_pred_data = pred_data_sampled
      final_sample = samples


  new_data = []  
  for i in range(N_SAMPLES):
    if final_sampled_pred_data[i] == pred_data_real_2[i]:
      new_data.append(final_sample[i])
  new_data = np.array(new_data)
  new_data = new_data.tolist()
  data_final = data_final.tolist()

  new_data.extend(data_final)
  new_data = np.array(new_data)
  data_final = np.array(data_final)
  model_with_new_data = GaussianNB()
  model_with_new_data.fit(new_data[:, 0].reshape(-1, 1), new_data[:, 1].reshape(-1, 1))
  pred_data_new = model_with_new_data.predict(test_set[:, 0].reshape(-1, 1))
  new_accuracy = accuracy_score(pred_data_new,test_set[:, 1].reshape(-1, 1))

  return original_accuracy, new_accuracy

def data_accuracy_2(flow, data_f):
  min_ = 9999
  orig = 0
  new = 0
  for i in range(50):
    o, n = data_accuracy(flow, data_f)
    if min_ > (o-n):
      min_ = o-n
      orig = o
      new = n
  print(orig)
  print(n)




np.random.shuffle(final_data)

In [238]:
data_accuracy_2(flow, final_data)

0.88
0.86


In [239]:
data_accuracy_2(flow_cl, final_data)

0.88
0.88


In [237]:
data_accuracy_2(flow_l, final_data)

0.88
0.86
