In [None]:
!pip install nflows

In [90]:
import pandas as pd
import numpy as np
import math

import torch
from torch import nn
from torch import optim
from sklearn.decomposition import PCA
 
from nflows.flows.base import Flow
from nflows.distributions.normal import StandardNormal
from nflows.transforms.base import CompositeTransform
from nflows.transforms.autoregressive import MaskedAffineAutoregressiveTransform
from nflows.transforms.coupling import AffineCouplingTransform
from nflows.transforms.linear import NaiveLinear
from nflows.transforms.permutations import ReversePermutation
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [91]:
data = pd.read_csv("/content/wheat-seed.csv")
print(data.head())

    Area  Perimeter  Compactness  Length  Width  Asymmetry_coefficient  \
0  15.26      14.84       0.8710   5.763  3.312                  2.221   
1  14.88      14.57       0.8811   5.554  3.333                  1.018   
2  14.29      14.09       0.9050   5.291  3.337                  2.699   
3  13.84      13.94       0.8955   5.324  3.379                  2.259   
4  16.14      14.99       0.9034   5.658  3.562                  1.355   

   Groove_length  Class  
0          5.220      1  
1          4.956      1  
2          4.825      1  
3          4.805      1  
4          5.175      1  


In [92]:
input_data = data.drop("Class", axis=1)
output_data = np.array(data["Class"])

In [93]:
MAX_CLASS_INT = 3
MIN_CLASS_INT = 1
N_SAMPLES = len(data)

In [94]:
pca = PCA(n_components = 1)
input_data = pca.fit_transform(input_data)

In [135]:
final_data = []
for i in range(len(output_data)):
  final_data.append([input_data[i][0], output_data[i]])

final_data = np.array(final_data)

In [96]:
num_layers = 7
base_dist = StandardNormal(shape=[2])
num_iter = 10000

Masked Autoregrssive Flow

In [97]:
transforms = []
for _ in range(num_layers):
     transforms.append(MaskedAffineAutoregressiveTransform(features=2, 
                                                            hidden_features=4))

transform = CompositeTransform(transforms)

flow = Flow(transform, base_dist)
optimizer = optim.Adam(flow.parameters())

In [98]:
for i in range(num_iter):
    #x, y = datasets.make_circles(n_samples=300, factor=0.5, noise=0.05)
    x = torch.tensor(final_data, dtype=torch.float32)
    optimizer.zero_grad()
    loss = -flow.log_prob(inputs=x).mean()
    loss.backward()
    optimizer.step()

Autoregressive Flow with Coupling layer

In [99]:
mask_1 = [0,1]
mask_2 = [1,0]
class Net(nn.Module):

    def __init__(self, in_channel, out_channels):
        super().__init__()
        layers = [nn.Linear(in_channel, in_channel), 
                  nn.ReLU(), 
                  nn.Linear(in_channel, in_channel), 
                  nn.ReLU(), 
                  nn.Linear(in_channel, out_channels)]
        self.net = nn.Sequential(*layers)

    def forward(self, inp, context=None):
        return self.net(inp)

def getNet(in_channel, out_channels):
        return Net(in_channel, out_channels)
transforms_cl = []
for _ in range(2):
     transforms_cl.append(AffineCouplingTransform(mask_1, getNet))
     transforms_cl.append(AffineCouplingTransform(mask_2, getNet))

transform_cl = CompositeTransform(transforms_cl)

flow_cl = Flow(transform_cl, base_dist)
optimizer_cl = optim.Adam(flow_cl.parameters())

transform_cl = CompositeTransform(transforms_cl)

flow_cl = Flow(transform_cl, base_dist)
optimizer_cl = optim.Adam(flow_cl.parameters())

In [100]:
for i in range(num_iter):
    #x, y = datasets.make_circles(n_samples=300, factor=0.5, noise=0.05)
    x = torch.tensor(final_data, dtype=torch.float32)
    optimizer_cl.zero_grad()
    loss_cl = -flow_cl.log_prob(inputs=x).mean()
    loss_cl.backward()
    optimizer_cl.step()

Linear Flow

In [101]:
transforms_l = []

for _ in range(num_layers):
     transforms_l.append(ReversePermutation(features=2))
     transforms_l.append(NaiveLinear(features=2))

transform_l = CompositeTransform(transforms_l)

flow_l = Flow(transform_l, base_dist)
optimizer_l = optim.Adam(flow_l.parameters())

In [102]:
for i in range(num_iter):
    #x, y = datasets.make_circles(n_samples=300, factor=0.5, noise=0.05)
    x = torch.tensor(final_data, dtype=torch.float32)
    optimizer_l.zero_grad()
    loss_l = -flow_l.log_prob(inputs=x).mean()
    loss_l.backward()
    optimizer_l.step()

In [103]:
def guassianNB_accuracy(flow):
  max_accuracy = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = GaussianNB()
    model_with_sampled_data = GaussianNB()

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy < accuracy:
      max_accuracy = accuracy

  print(max_accuracy)



In [104]:
def logistic_regression_accuracy(flow):  
  max_accuracy_lr = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = LogisticRegression(random_state = 0)
    model_with_sampled_data = LogisticRegression(random_state = 0)

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy_lr < accuracy:
      max_accuracy_lr = accuracy

  print(max_accuracy_lr)

In [105]:
def knn_accuracy(flow):
  max_accuracy_knn = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = KNeighborsClassifier(n_neighbors=7)
    model_with_sampled_data = KNeighborsClassifier(n_neighbors=7)

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy_knn < accuracy:
      max_accuracy_knn = accuracy

  print(max_accuracy_knn)

In [106]:
def random_forest_accuracy(flow):  
  max_accuracy_rf = 0
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_real_data = RandomForestClassifier(n_estimators = 100)
    model_with_sampled_data = RandomForestClassifier(n_estimators = 100)

    model_with_real_data.fit(final_data[:, 0].reshape(-1, 1), final_data[:, 1].reshape(-1, 1))
    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_real = model_with_real_data.predict(final_data[:, 0].reshape(-1, 1))
    pred_data_sampled = model_with_sampled_data.predict(final_data[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real,pred_data_sampled)

    if max_accuracy_rf < accuracy:
      max_accuracy_rf = accuracy

  print(max_accuracy_rf) #0.3

In [107]:
guassianNB_accuracy(flow)
guassianNB_accuracy(flow_cl)
guassianNB_accuracy(flow_l)

0.6761904761904762
0.49523809523809526
0.38095238095238093


In [108]:
logistic_regression_accuracy(flow)
logistic_regression_accuracy(flow_cl)
logistic_regression_accuracy(flow_l)

0.5476190476190477
0.5857142857142857
0.37142857142857144


In [109]:
knn_accuracy(flow)
knn_accuracy(flow_cl)
knn_accuracy(flow_l)

0.780952380952381
0.5285714285714286
0.44285714285714284


In [110]:
random_forest_accuracy(flow)
random_forest_accuracy(flow_cl)
random_forest_accuracy(flow_l)

0.6857142857142857
0.4857142857142857
0.48095238095238096


In [136]:
def data_accuracy(flow, data_final):
  test_set = data_final[:50]
  train_set = data_final[50:]
  model_with_real_data = GaussianNB()
  model_with_real_data.fit(train_set[:, 0].reshape(-1, 1), train_set[:, 1].reshape(-1, 1))
  pred_data_real_2 = model_with_real_data.predict(data_final[:, 0].reshape(-1, 1))
  pred_data_real = model_with_real_data.predict(test_set[:, 0].reshape(-1, 1))
  original_accuracy = accuracy_score(pred_data_real,test_set[:, 1].reshape(-1, 1))

  max_accuracy = 0
  final_sampled_pred_data = []
  final_sample = []
  for i in range(100):
    samples = flow.sample(N_SAMPLES)
    samples = samples.detach().numpy()

    y_sample = samples[:, 1]
    for i in range(N_SAMPLES):
      y_sample[i] = math.floor(y_sample[i])
      if y_sample[i] < MIN_CLASS_INT:
        y_sample[i] = MIN_CLASS_INT
      elif y_sample[i] > MAX_CLASS_INT:
        y_sample[i] = MAX_CLASS_INT

    model_with_sampled_data = GaussianNB()

    model_with_sampled_data.fit(samples[:, 0].reshape(-1, 1), samples[:, 1].reshape(-1, 1))

    pred_data_sampled = model_with_sampled_data.predict(data_final[:, 0].reshape(-1, 1))

    accuracy = accuracy_score(pred_data_real_2,pred_data_sampled)

    if max_accuracy < accuracy:
      max_accuracy = accuracy
      final_sampled_pred_data = pred_data_sampled
      final_sample = samples


  new_data = []  
  for i in range(N_SAMPLES):
    if final_sampled_pred_data[i] == pred_data_real_2[i]:
      new_data.append(final_sample[i])
  new_data = np.array(new_data)
  new_data = new_data.tolist()
  data_final = data_final.tolist()

  new_data.extend(data_final)
  new_data = np.array(new_data)
  data_final = np.array(data_final)
  model_with_new_data = GaussianNB()
  model_with_new_data.fit(new_data[:, 0].reshape(-1, 1), new_data[:, 1].reshape(-1, 1))
  pred_data_new = model_with_new_data.predict(test_set[:, 0].reshape(-1, 1))
  new_accuracy = accuracy_score(pred_data_new,test_set[:, 1].reshape(-1, 1))

  return original_accuracy, new_accuracy

def data_accuracy_2(flow, data_f):
  min_ = 9999
  orig = 0
  new = 0
  for i in range(50):
    o, n = data_accuracy(flow, data_f)
    if min_ > (o-n):
      min_ = o-n
      orig = o
      new = n
  print(orig)
  print(n)




np.random.shuffle(final_data)

In [137]:
data_accuracy_2(flow, final_data)

0.84
0.9


In [138]:
data_accuracy_2(flow_cl, final_data)

0.84
0.84


In [139]:
data_accuracy_2(flow_l, final_data)

0.84
0.84
