In [29]:
import numpy as np
import json
import pandas as pd
from sklearn import tree
from sklearn.metrics import f1_score

In [47]:
# Load the real dataset, treating the listed tokens as missing values.
real_data = pd.read_csv('data/breast_cancer.csv', sep=',', index_col=False,
        na_values=['NaN', 'nan', 'NULL', 'null'], low_memory=False)
# Discard the first and last columns by position, then zero-fill missing values.
# fillna returns a new frame; the original called it without keeping the
# result, so no value was ever filled.
real_data = real_data.iloc[:, 1:-1].fillna(0)
# Encode the diagnosis labels as integers 1..k, numbered in sorted label order.
d = {label: code
     for code, label in enumerate(sorted(set(real_data['diagnosis'])), start=1)}
real_data['diagnosis'] = real_data['diagnosis'].map(d)

# Read the synthetic table without a header row so columns get integer labels
# (the fake_data[0] access below relies on that). header=None is the documented
# spelling; the original header=-1 is rejected by current pandas releases.
fake_data = pd.read_csv('experiments/cancer_synthetic.csv', header=None)
# Round column 0 to whole numbers and keep only the first 400 rows.
fake_data[0] = fake_data[0].round(0)
fake_data = fake_data.iloc[:400, :]

print(real_data.shape)
print(real_data.head(3))
print(fake_data.head(3))
print(fake_data.shape)

(569, 31)
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          2        17.99         10.38           122.8     1001.0   
1          2        20.57         17.77           132.9     1326.0   
2          2        19.69         21.25           130.0     1203.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   

   symmetry_mean           ...             radius_worst  texture_worst  \
0         0.2419           ...                    25.38          17.33   
1         0.1812           ...                    24.99          23.41   
2         0.2069           ...                    23.57          25.53   

   perimeter_worst  area_worst  smoothness_worst  compactness_worst  \
0            184.6      2019.0  

In [48]:
# Positional hold-out split: the first n_train rows are used for training and
# every remaining row is held out for testing (no shuffling is performed).
n_train = 400
n_test = real_data.shape[0] - n_train

real_train = real_data.iloc[:n_train]
test_set = real_data.iloc[n_train:]

# Labels stay a one-column frame; features are everything except the first column.
test_set_y = test_set[['diagnosis']]
test_set_x = test_set.drop(test_set.columns[0], axis=1)

print(real_train.shape)
print(test_set_x.shape)

(400, 31)
(169, 30)


In [49]:
# Split both training tables into labels (y_*) and features (x_*).
# Select the label as a Series so y_real is 1-D with shape (n_samples,). The
# original [["diagnosis"]] selection produced an (n_samples, 1) column vector,
# which scikit-learn flags via column_or_1d(y, warn=True) when fitting.
y_real = real_train["diagnosis"].values
x_real = real_train.drop(real_train.columns[0], axis=1, inplace=False)
# In the synthetic table column 0 is used as the label and the rest as features.
y_fake = fake_data[0].values
x_fake = fake_data.drop(fake_data.columns[0], axis=1, inplace=False)





In [50]:
# Train one depth-capped decision tree on the real rows and one on the
# synthetic rows, then score both on the same held-out real test set.
# random_state is fixed because tree fitting permutes features at each split;
# without a seed the fitted trees (and the F1 scores below) can change from
# run to run.
model_real = tree.DecisionTreeClassifier(max_depth=20, random_state=0)
model_real.fit(x_real, y_real)
model_fake = tree.DecisionTreeClassifier(max_depth=20, random_state=0)
model_fake.fit(x_fake, y_fake)

pred_real = model_real.predict(test_set_x)
pred_fake = model_fake.predict(test_set_x)

# Macro-averaged F1: real-trained model first, synthetic-trained model second.
print(f1_score(test_set_y, pred_real, average='macro'))
print(f1_score(test_set_y, pred_fake, average='macro'))
print(pred_real)
print(pred_fake)



0.867370073247
0.770068027211
[2 1 1 1 1 1 2 1 2 1 2 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 2 2 1 2 1
 1 1 1 1 2 1 1 2 1 2 1 2 2 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 2 1 1 1 1
 1 1 2 1 1 2 1 1 1 1 2 1 2 2 1 2 1 2 2 1 1 1 1 1 2 2 1 2 1 2 2 2 1 1 1 2 1
 1 2 2 2 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 2 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 2 2 2 2 1]
[ 2.  1.  1.  1.  1.  1.  2.  1.  2.  1.  1.  1.  1.  2.  2.  1.  1.  2.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  1.  2.  2.  2.  1.
  1.  1.  1.  1.  1.  2.  1.  1.  1.  2.  2.  2.  2.  2.  1.  2.  1.  1.
  1.  1.  1.  1.  1.  1.  2.  2.  1.  1.  1.  1.  1.  1.  2.  1.  1.  2.
  2.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  2.  1.  2.  2.  1.  1.
  1.  2.  2.  1.  1.  2.  1.  1.  1.  2.  2.  2.  1.  1.  1.  1.  1.  1.
  2.  2.  1.  2.  1.  2.  2.  1.  2.  1.  1.  1.  1.  2.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  2.  1.  2.  1.  2.  1.  2.  1.  1.  2.  1.
  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  

In [51]:
from sklearn import svm
from sklearn.preprocessing import normalize

In [52]:
# Train an RBF-kernel SVM on real vs. synthetic rows and compare macro-F1 on
# the held-out real test set.
# NOTE(review): sklearn.preprocessing.normalize rescales each row (sample) to
# unit norm by default; it does not scale features column-wise. Confirm that
# this is the intended preprocessing for the SVM.
x_real_n = normalize(x_real)
x_fake_n = normalize(x_fake)
test_set_n = normalize(test_set_x)

# np.ravel flattens the labels to 1-D; passing an (n_samples, 1) column vector
# makes fit emit a DataConversionWarning ("y = column_or_1d(y, warn=True)").
model_real = svm.SVC(C=500)
model_real.fit(x_real_n, np.ravel(y_real))
print(model_real)
model_fake = svm.SVC(C=500)
model_fake.fit(x_fake_n, np.ravel(y_fake))

pred_real = model_real.predict(test_set_n)
pred_fake = model_fake.predict(test_set_n)
print(pred_real)
print(pred_fake)
print(f1_score(test_set_y, pred_real, average='macro'))
print(f1_score(test_set_y, pred_fake, average='macro'))




SVC(C=500, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
[2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1
 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1
 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1 2 1 2 2 1 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1
 1 2 2 2 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1]
[ 1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.
  1.  2.  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.  1.  

  y = column_or_1d(y, warn=True)


In [46]:
from sklearn.neural_network import MLPClassifier

# Train a small MLP (hidden layers of 3 and 10 units) on real vs. synthetic
# rows and compare macro-F1 on the held-out real test set.
# NOTE(review): this cell reads x_real_n, x_fake_n and test_set_n, which are
# created in the SVM cell; its execution count (46) is lower than that cell's
# (52), so confirm the notebook still works under Restart & Run All.
# np.ravel flattens the labels to 1-D; an (n_samples, 1) column vector makes
# fit emit a DataConversionWarning ("y = column_or_1d(y, warn=True)").
model_real = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(3, 10), random_state=1).fit(x_real_n, np.ravel(y_real))
model_fake = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(3, 10), random_state=1).fit(x_fake_n, np.ravel(y_fake))

pred_real = model_real.predict(test_set_n)
pred_fake = model_fake.predict(test_set_n)
print(pred_real)
print(pred_fake)
print(f1_score(test_set_y, pred_real, average='macro'))
print(f1_score(test_set_y, pred_fake, average='macro'))


[2 1 1 1 1 1 1 1 2 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 1 2 2 1 2 1
 1 1 1 2 2 1 1 2 1 2 1 2 2 1 2 1 1 1 2 2 1 1 1 2 2 1 1 1 2 1 1 2 1 1 2 1 1
 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1 2 2 2 2 1 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 2 1
 1 2 1 2 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 2 1 1 1 1 1
 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2]
[ 2.  1.  1.  1.  1.  1.  2.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  2.
  1.  1.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  2.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  2.  1.  2.  1.  2.
  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.
  2.  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  2.  1.  1.  2.  1.  2.
  1.  2.  2.  1.  1.  1.  1.  1.  2.  2.  2.  1.  1.  2.  1.  1.  1.  1.
  2.  1.  1.  2.  1.  2.  1.  1.  2.  2.  1.  1.  1.  2.  1.  1.  1.  1.
  1.  1.  2.  1.  1.  1.  1.  2.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.

  y = column_or_1d(y, warn=True)


In [2]:
from copulas.univariate.KDEUnivariate import KDEUnivariate
import numpy as np
import pandas as pd

In [4]:
# Read the iris table and pull out the single column that will be modelled.
data = pd.read_csv('data/iris.data.csv')
feature1 = data.loc[:, 'feature_01']

In [7]:
# Fit a univariate KDE model to the selected feature column.
# NOTE(review): the recorded run of this cell fails inside fit() with
# "UnboundLocalError: local variable 'kde' referenced before assignment".
# The cause is not visible from this notebook — presumably a library-side
# issue or an unsupported input type (feature1 is a pandas Series); verify
# against the installed copulas version before relying on this cell.
ku =KDEUnivariate()
ku.fit(feature1)

UnboundLocalError: local variable 'kde' referenced before assignment