# Multi-label 14 classes

In [1]:
import pandas as pd
from scipy.io import arff
import seaborn as sns
import plotly.plotly as py
import graphviz

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
#definisco qui la distanza:
def normalized_euclidean_distance(x, y):
    return 0.5 * np.var(x - y) / (np.var(x) + np.var(y))

def simple_match_distance(x, y):
    count = 0
    for xi, yi in zip(x, y):
        if xi == yi:
            count += 1
    sim_ratio = 1.0 * count / len(x)
    return 1.0 - sim_ratio


def normalized_square_euclidean_distance(ranges):
    def actual(x, y, xy_ranges):
        return np.sum(np.square(np.abs(x - y) / xy_ranges))
    return lambda x, y: actual(x, y, ranges)


def mad_distance(x, y, mad):
    val = 0.0
    for i in range(len(mad)):
        # print i, 0.0 if mad[i] == 0.0 else 1.0 * np.abs(x[i] - y[i]) / mad[i]
        # print i, np.abs(x[i] - y[i]) / mad[i]
        # val += 0.0 if mad[i] != 0 else 1.0 * np.abs(x[i] - y[i]) / mad[i]
        val += 0.0 if mad[i] == 0.0 else 1.0 * np.abs(x[i] - y[i]) / mad[i]
    # print val
    return val

From the paper:
We account for the presence of mixed types of features by a weighted sum of simple matching coefficient for categorical features, and of the normalized Euclidean distance 6 for continuous features. Formally, assuming $h$ categorical features and $m-h$ continuous ones, we use:

$$d(x,z)=\frac{h}{m}SimpleMatch(x,z)+\frac{m-h}{m} NormEuclid(x,z)$$

In [5]:
def mixed_distance(x, y, discrete, continuous, classes_name, ddist, cdist):
    # type: (pandas.Series, pandas.Series, list, list, list, function, function) -> double
    """
    This function return the mixed distance between instance x and instance y
    :param x: pandas.Series, instance 1
    :param y: pandas.Series, instance 2
    :param discrete: list of str, column names containing categorical variables
    :param continuous: list of str, column names containing non categorical variables
    :param classes_name: list of str, array of column names containing the label
    :param ddist: function, distance function for discrete variables
    :param cdist: function, distance function for continuos variables
    :return: double
    """
    xd = [x[att] for att in discrete if att not in classes_name]
    wd = 0.0
    dd = 0.0
    if len(xd) > 0:
        yd = [y[att] for att in discrete if att not in classes_name]
        wd = 1.0 * len(discrete) / (len(discrete) + len(continuous))
        dd = ddist(xd, yd)

    xc = np.array([x[att] for att in continuous if att not in classes_name])
    wc = 0.0
    cd = 0.0
    if len(xc) > 0:
        yc = np.array([y[att] for att in continuous if att not in classes_name])
        wc = 1.0 * len(continuous) / (len(discrete) + len(continuous))
        cd = cdist(xc, yc)

    return wd * dd + wc * cd

**sorted_distances**: Computes the pairwise distance in the between the selected instance and all the other instances and create a dataframe where this is sorted in ascending order.

In [6]:
def sorted_distances(X2E, i2e, discrete_var, continuous_var, classes_name,label_distance='distance'):
    """
    This function returns the neighours of the instance sorted by closeness,
        the distance metric used is `mixed_distance()`

    :param X2E: dataframe, each row is an instance and the label was given by the black box, should NOT contain column(s) with labels
    :param i2e: pd.Series, instance to be explained
    :param discrete_var: array of str, names of X2E columns containing discrete features
    :param continuous_var: array of str, names of X2E columns containing continuous features
    :param class_name: array of str, name(s) of the column(s) containing the label
    :return: pandas dataframe
    """

    # distance between instance to explain and other instances
    distances = [mixed_distance(i2e,X2E.loc[i],discrete=discrete_var,continuous=continuous_var,classes_name=classes_name,ddist=simple_match_distance,cdist=normalized_euclidean_distance) for i in range(0,len(X2E))]
    output = X2E.reset_index().rename(columns={'index':'old_index_'+label_distance})#.drop('index',1)
    output[label_distance] = pd.Series(distances)
    output = output.sort_values(by=label_distance,ascending=True).reset_index().drop('index',1)
 
    return output

In [7]:
data = arff.loadarff('./data/yeast/yeast.arff')
df = pd.DataFrame(data[0])
df[['Class1','Class2','Class3','Class4','Class5','Class6','Class7','Class8','Class9','Class10','Class11','Class12','Class13','Class14']] = df[['Class1','Class2','Class3','Class4','Class5','Class6','Class7','Class8','Class9','Class10','Class11','Class12','Class13','Class14']].apply(pd.to_numeric)

In [8]:
filter_col_X = [col for col in df if col.startswith('Att')]
filter_col_Y = [col for col in df if col.startswith('Class')]
filter_col_Y_BB = ['BB_'+s for s in filter_col_Y]

X = df[filter_col_X].values
y = df[filter_col_Y].values

## Traino una BB
http://scikit.ml/modelselection.html

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

- The Hamming loss is the fraction of labels that are incorrectly predicted.
- The Jaccard index [1], or Jaccard similarity coefficient, defined as the size of the intersection divided by the size of the union of two label sets, is used to compare set of predicted labels for a sample to the corresponding set of labels in ``y_true``. 


In [11]:
best_rf = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=60, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1512, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
best_rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=60, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1512, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
best_predictions = best_rf.predict(X_test)

In [14]:
#predizioni dopo il tuning degli iperparametri
print('accuracy = '+str(metrics.accuracy_score(y_true=y_test,y_pred=best_predictions)))
print('hamming loss = '+str(metrics.hamming_loss(y_true=y_test,y_pred=best_predictions)))
print('jaccard similairty = '+str(metrics.jaccard_similarity_score(y_true=y_test,y_pred=best_predictions)))
print('precision = '+str(metrics.precision_score(y_true=y_test,y_pred=best_predictions,average='micro')))
print('recall = '+str(metrics.recall_score(y_true=y_test,y_pred=best_predictions,average='micro')))
print('F1_micro = '+str(metrics.f1_score(y_true=y_test,y_pred=best_predictions,average='micro')))
print('F1_macro = '+str(metrics.f1_score(y_true=y_test,y_pred=best_predictions,average='macro')))
print('F1_weighted = '+str(metrics.f1_score(y_true=y_test,y_pred=best_predictions,average='weighted')))

accuracy = 0.16666666666666666
hamming loss = 0.1909237379162191
jaccard similairty = 0.4934091180331782
precision = 0.7700714585960488
recall = 0.5359859566998244
F1_micro = 0.6320510608935656
F1_macro = 0.34801983761795724
F1_weighted = 0.5528125469053737



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



### Creo X2E: test set con label dati dalla black box
Mi dimentico dei label "reali"

In [15]:
BB_predictions_df = pd.DataFrame(best_predictions,columns=filter_col_Y_BB)
Xtest_features_df = pd.DataFrame(X_test,columns=filter_col_X)
#dataframe to explain
X2E = pd.concat([Xtest_features_df,BB_predictions_df],axis=1)

In [16]:
X2E.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,Att41,Att42,Att43,Att44,Att45,Att46,Att47,Att48,Att49,Att50,Att51,Att52,Att53,Att54,Att55,Att56,Att57,Att58,Att59,Att60,Att61,Att62,Att63,Att64,Att65,Att66,Att67,Att68,Att69,Att70,Att71,Att72,Att73,Att74,Att75,Att76,Att77,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,BB_Class1,BB_Class2,BB_Class3,BB_Class4,BB_Class5,BB_Class6,BB_Class7,BB_Class8,BB_Class9,BB_Class10,BB_Class11,BB_Class12,BB_Class13,BB_Class14
0,0.061482,0.127804,0.215999,0.189943,0.124694,0.071954,0.046394,-0.021441,-0.060587,-0.028469,-0.019952,0.028983,0.043143,0.087516,0.029437,0.00266,-0.005112,-0.041048,0.034369,0.003726,-0.057606,-0.104495,-0.138357,-0.105189,-0.0804,-0.008246,0.031888,-0.016535,0.005596,0.082979,0.147757,0.091121,-0.123866,-0.17094,-0.052174,0.281631,-0.077426,-0.083157,-0.068826,-0.016678,0.193923,-0.064533,-0.104147,0.001239,-0.104432,-0.035701,-0.089103,0.014318,-0.148917,-0.085334,-0.122563,-0.144482,-0.143222,-0.029602,-0.136684,-0.09473,-0.068043,-0.076612,-0.009192,0.277142,0.145643,0.003184,0.080654,-0.038816,0.005445,0.042896,0.221333,0.172836,-0.038664,-0.041683,0.071969,0.028725,-0.023581,-0.015324,-0.053249,0.170436,0.227595,0.147562,0.04369,0.010136,-0.017629,-0.02095,-0.025749,-0.056531,-0.007347,-0.005763,-0.018099,-0.038037,-0.031455,-0.035959,-0.048102,-0.02554,-0.011738,-0.01244,-0.020078,-0.021314,-0.028762,-0.017133,-0.017462,-0.043039,-0.024187,0.296708,0.118257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.008702,-0.007168,0.024091,-0.051892,-0.033546,-0.01858,-0.06878,-0.054626,-0.141283,-0.067281,-0.108237,-0.105494,-0.132343,0.099859,-0.010677,0.06513,0.05438,-0.046019,0.199514,0.139715,-0.030141,0.009774,-0.307595,-0.269124,0.094717,-0.223808,-0.250668,-0.334658,-0.209568,0.022686,0.022406,-0.006376,-0.038299,-0.004225,-0.030118,0.049705,-0.019818,0.037888,0.030295,0.039865,-0.07082,-0.083896,-0.029911,-0.008532,-0.036551,-0.062115,-0.042056,0.018581,-0.001499,-0.019943,0.007741,0.044692,0.018673,0.018645,0.04252,0.01564,0.00492,0.001833,0.091857,0.402564,0.152758,0.010654,0.112725,0.159427,-0.003994,0.008445,-0.013114,-0.109675,0.129638,0.124569,0.119223,0.030521,-0.064618,-0.05865,-0.08547,-0.04911,-0.028626,-0.060372,0.002359,-0.019531,-0.040046,0.006068,0.026571,-0.053643,-0.02548,-0.024038,-0.031434,-0.039257,-0.043824,0.030883,0.084272,0.032449,-0.04887,0.018959,-0.038525,-0.031748,-0.033063,-0.039728,0.080956,-0.01807,-0.043362,0.00104,0.107119,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.018903,-0.140186,-0.030054,-0.041931,0.147528,0.16956,0.116611,-0.03065,-0.006297,0.005951,0.126634,-0.085406,-0.03989,0.034426,0.087884,0.131398,0.083954,-0.060627,-0.100497,-0.187559,-0.149193,-0.183783,-0.042049,-0.188716,-0.145331,-0.099028,0.001518,-0.076413,-0.034227,-0.085268,-0.09277,-0.207175,-0.046618,0.026566,0.18252,0.201615,0.116619,-0.017722,-0.011136,0.058763,0.101283,0.199707,0.075423,-0.150178,0.020796,0.002385,0.035898,-0.230417,0.101791,0.163186,0.083244,0.082613,-0.035515,-0.129719,-0.013408,-0.047241,0.094441,0.090146,-0.171742,-0.016173,-0.034538,-0.028484,-0.025168,-0.003511,-0.071155,-0.01491,0.038377,0.054276,-0.042185,0.014568,0.001979,-0.027077,-0.016659,-0.009806,-0.034684,-0.134536,0.054507,-0.030522,0.020894,-0.065583,-0.080035,0.087864,-0.068521,0.065086,0.17002,0.206208,0.135803,-0.081012,-0.080619,-0.103203,-0.098256,0.071509,-0.072675,0.113692,-0.06268,0.101885,-0.078998,-0.058211,-0.075284,0.129151,-0.073486,-0.064892,0.021282,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-0.237767,-0.301962,-0.190007,-0.11174,0.107342,0.151138,0.078503,0.126912,-0.1407,-0.110413,-0.213051,-0.072345,0.099195,0.24483,0.140097,0.172761,-0.026466,-0.05256,-0.114385,-0.029327,0.030635,-0.097084,-0.023899,0.088416,0.117679,0.166086,0.144204,0.095606,0.09527,0.085386,0.032839,-0.073011,-0.101862,-0.073431,0.176279,0.19001,0.051356,-0.136759,-0.121242,0.090113,0.210776,0.131554,0.043102,-0.094929,-0.048674,0.005095,0.051176,0.152017,0.06668,0.034598,0.01531,-0.011573,-0.048201,0.033586,0.031377,-0.023597,0.086311,-0.021115,0.008013,-0.039115,-0.102837,-0.212227,-0.070543,-0.061762,0.038306,0.057555,0.046176,-0.04414,-0.006592,-0.025837,-0.136148,-0.084651,0.050802,0.069693,0.047174,0.146361,0.102672,0.013571,-0.01214,0.00291,-0.017459,-0.032257,-0.021303,-0.049918,0.005007,0.002428,0.030908,-0.028672,-0.019971,-0.032455,-0.03294,-0.014226,-0.008098,-0.000901,-0.016153,-0.005755,-0.009729,-0.018397,-0.007275,-0.02085,-0.012201,0.04657,0.111724,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.034992,-0.132651,-0.04522,-0.009992,0.143338,0.139133,0.098853,0.05301,0.12094,0.05806,0.143314,0.066606,0.093242,0.100347,0.058444,0.099575,0.106786,0.034851,-0.088595,0.017767,-0.003586,0.08705,0.172349,0.101408,0.063199,-0.053056,-0.036357,-0.037113,-0.038101,-0.166746,-0.055594,-0.047022,0.077929,0.058345,0.076924,0.060818,0.019543,0.047879,0.035586,0.031315,0.082737,0.071978,0.029647,0.043535,-0.187947,0.018042,-0.086956,0.109164,-0.148705,-0.046978,-0.14755,-0.108417,-0.092422,-0.156219,-0.132041,-0.007101,-0.231554,-0.143001,0.086865,-0.109235,0.016922,0.048225,0.063842,0.012714,0.06493,0.036816,-0.071195,0.007735,0.07792,0.014784,0.025703,0.234242,0.082225,0.039191,0.117717,-0.024803,0.033843,0.05316,0.050249,-0.127706,0.086672,0.356313,0.027566,0.041433,-0.101502,-0.031919,-0.206896,-0.069767,0.006284,0.144671,0.05485,0.043285,-0.111364,0.003688,0.06999,-0.051879,0.122398,0.067157,-0.152005,-0.163755,-0.062062,0.056711,0.008264,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


### Pick an instance we want to explain **i2e**:

In [17]:
i2e=X2E.loc[3]

Assegno ad ogni istanza la propria distanza dall'istanza da spiegare **nello spazio delle features**, nello **spazio dei label**, e in uno "spazio misto" dove considero le classi come se fossero feature discrete.

In [18]:
X2E_wdistances = sorted_distances(X2E,i2e,discrete_var=[0],continuous_var=X2E[filter_col_X].columns.values,classes_name=X2E[filter_col_Y_BB].columns.values,label_distance='feat_space_dist')
X2E_wdistances = sorted_distances(X2E_wdistances,i2e,discrete_var=X2E[filter_col_Y_BB].columns.values,continuous_var=['feat_space_dist','old_index_feat_space_dist'],classes_name=['feat_space_dist','old_index_feat_space_dist'],label_distance='label_space_dist')
X2E_wdistances = sorted_distances(X2E_wdistances,i2e,discrete_var=X2E[filter_col_Y_BB].columns.values,continuous_var=X2E[filter_col_X].columns.values,classes_name=['old_index_label_space_dist','old_index_feat_space_dist','feat_space_dist','label_space_dist'],label_distance='mixed_space_dist')

In [19]:
X2E_wdistances.head()

Unnamed: 0,old_index_mixed_space_dist,old_index_label_space_dist,old_index_feat_space_dist,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,Att41,Att42,Att43,Att44,Att45,Att46,Att47,Att48,Att49,Att50,Att51,Att52,Att53,Att54,Att55,Att56,Att57,Att58,Att59,Att60,Att61,Att62,Att63,Att64,Att65,Att66,Att67,Att68,Att69,Att70,Att71,Att72,Att73,Att74,Att75,Att76,Att77,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,BB_Class1,BB_Class2,BB_Class3,BB_Class4,BB_Class5,BB_Class6,BB_Class7,BB_Class8,BB_Class9,BB_Class10,BB_Class11,BB_Class12,BB_Class13,BB_Class14,feat_space_dist,label_space_dist,mixed_space_dist
0,0,0,3,-0.237767,-0.301962,-0.190007,-0.11174,0.107342,0.151138,0.078503,0.126912,-0.1407,-0.110413,-0.213051,-0.072345,0.099195,0.24483,0.140097,0.172761,-0.026466,-0.05256,-0.114385,-0.029327,0.030635,-0.097084,-0.023899,0.088416,0.117679,0.166086,0.144204,0.095606,0.09527,0.085386,0.032839,-0.073011,-0.101862,-0.073431,0.176279,0.19001,0.051356,-0.136759,-0.121242,0.090113,0.210776,0.131554,0.043102,-0.094929,-0.048674,0.005095,0.051176,0.152017,0.06668,0.034598,0.01531,-0.011573,-0.048201,0.033586,0.031377,-0.023597,0.086311,-0.021115,0.008013,-0.039115,-0.102837,-0.212227,-0.070543,-0.061762,0.038306,0.057555,0.046176,-0.04414,-0.006592,-0.025837,-0.136148,-0.084651,0.050802,0.069693,0.047174,0.146361,0.102672,0.013571,-0.01214,0.00291,-0.017459,-0.032257,-0.021303,-0.049918,0.005007,0.002428,0.030908,-0.028672,-0.019971,-0.032455,-0.03294,-0.014226,-0.008098,-0.000901,-0.016153,-0.005755,-0.009729,-0.018397,-0.007275,-0.02085,-0.012201,0.04657,0.111724,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,46,1,415,-0.243101,-0.205672,-0.248694,-0.034219,0.156621,0.154864,0.085112,-0.068013,-0.100219,-0.187126,-0.165675,-0.104043,0.146606,0.186126,0.129681,0.00089,-0.032185,-0.122669,-0.085216,-0.144664,-0.114884,-0.096168,0.008137,0.096948,0.118808,0.171047,0.123592,0.101084,0.133404,0.076853,0.04318,-0.073887,-0.099099,-0.054074,0.247954,0.22395,0.016983,-0.18995,-0.109138,0.088203,0.217785,0.189939,0.053865,-0.117555,-0.012023,0.040345,0.094835,0.087771,0.073481,0.078041,0.041332,-0.012408,-0.028905,0.020585,-0.002694,-0.089443,0.110567,0.009616,-0.028597,-0.025549,-0.120179,-0.159154,-0.040511,-0.099395,0.057176,0.051416,0.032382,-0.042116,0.008329,-0.047935,-0.01598,-0.061355,0.078263,0.0364,-0.017184,0.058671,0.066464,-0.023208,-0.034437,0.011289,-0.022766,-0.038651,-0.006961,-0.040736,0.012969,0.031584,-0.013187,-0.022304,-0.000158,-0.01835,-0.026431,-0.00375,-0.030709,0.011306,-0.016213,-0.010599,-0.021486,-0.023411,-0.004461,-0.025329,-0.021729,0.017858,0.113591,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.065897,0.0,0.050028
2,47,2,603,-0.309162,-0.293449,-0.226471,-0.053738,0.166482,0.211397,0.033018,0.09776,-0.118169,-0.097225,-0.14217,-0.044286,0.12571,0.180444,0.150553,0.056325,-0.025249,-0.078387,-0.142739,-0.212997,-0.200354,-0.149248,-0.03868,0.009394,0.113395,0.092664,0.078161,0.021865,-0.008002,0.022718,-0.020098,-0.172778,-0.004861,-0.071625,0.206893,0.170465,0.021627,-0.127931,-0.081262,0.10122,0.209297,0.160945,0.055857,-0.020829,0.024393,0.08035,0.126588,0.095773,0.068911,0.069201,0.041877,0.029866,-0.008735,0.013846,0.051279,-0.033721,0.102376,0.01173,-0.012276,-0.038979,-0.10877,-0.152151,-0.05784,-0.070232,0.079112,0.104968,0.042377,0.016643,0.021903,-0.058559,-0.052362,-0.099329,0.025441,-0.001119,-0.014078,0.043074,0.110324,-0.024618,0.008439,-0.004752,0.033209,-0.030021,-0.024066,-0.05637,0.003887,-0.006585,-0.015494,-0.034618,-0.027591,-0.041515,-0.041419,-0.014683,0.005708,-0.01239,0.02271,-0.018168,0.015756,0.013164,-0.01129,-0.02426,-0.021463,0.01439,0.109392,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.079536,0.0,0.062152
3,200,3,416,-0.336368,-0.264265,-0.079509,0.079279,0.138122,0.176845,0.113826,0.030805,-0.090455,-0.168568,-0.16757,-0.018146,0.06734,0.155927,0.050966,0.059354,0.031571,-0.064823,-0.090038,-0.181846,-0.195774,-0.152882,0.02722,0.127324,0.173278,0.186433,0.138912,0.082352,0.111821,0.061422,0.03121,-0.074187,-0.182471,-0.123653,0.114784,0.160352,0.084206,-0.180151,-0.133085,0.051809,0.091921,0.169096,0.079361,-0.155516,-0.033242,0.016633,0.053564,-0.076308,0.064204,0.013048,0.086044,0.07442,0.029108,-0.022875,0.043676,-0.001504,0.120665,0.057543,-0.039777,-0.018079,-0.004325,-0.095057,-0.047066,0.088028,0.054502,-0.075925,0.0101,0.008814,-0.038041,0.004918,0.012792,-0.002915,-0.06489,-0.042063,-0.100828,-0.007664,-0.028377,-0.082237,-0.032756,-0.046335,-0.066191,-0.01461,-0.057379,-0.071835,0.094155,0.093084,0.054241,0.021052,0.054333,0.048863,-0.005119,-0.003093,0.001758,0.035596,-0.034915,0.067611,-0.075334,-0.086622,-0.08315,-0.013931,-0.052398,-0.04626,0.089945,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.156201,0.0625,0.138846
4,354,4,385,-0.267527,-0.079551,-0.049354,0.100268,0.061778,0.188135,0.087308,-0.02802,-0.131187,-0.030721,-0.103029,-0.077727,0.080777,0.043315,0.012775,-0.095299,-0.089836,-0.107955,-0.084651,-0.090187,-0.012703,0.079516,0.025249,0.110323,0.184128,0.196778,-0.001126,0.082718,0.172565,0.058631,0.011564,0.050357,-0.058016,-0.095673,0.029358,0.063399,-0.022012,-0.170434,-0.093853,0.074789,0.063942,-0.033589,-0.056124,-0.167528,-0.080016,-0.112979,-0.050972,0.0883,0.220223,0.062193,0.117783,0.011043,-0.029721,0.106423,0.051618,-0.222753,0.136785,-0.048457,0.011089,-0.008389,-0.063392,-0.088741,-0.015866,-0.037244,0.224762,0.063838,0.247152,0.285981,-0.01015,0.001337,-0.212766,-0.161998,-0.017284,-0.019699,0.152642,0.057652,0.075427,0.013559,-0.128663,0.006985,-0.010993,-0.034018,-0.020668,-0.049146,0.004404,0.011469,0.01091,-0.029276,-0.017918,-0.033886,-0.034399,0.003563,-0.031645,0.005313,-0.01462,-0.013181,-0.016351,-0.017334,-0.000351,-0.029102,-0.01758,0.032084,0.126293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.227313,0.125,0.210603


### Prendo sample di primi k vicini nei vari spazi, da questi k vicini genererò dataset sintetici

 `feat_space_dist `
 `label_space_dist `
 `mixed_space_dist `

In [20]:
k = int(0.5*sqrt(len(X2E_wdistances)))

#### sample kNN in dataset reale, spazio delle features

In [21]:
filter_old_indexes = [i for i in X2E_wdistances.columns.values if 'index' in i]

In [22]:
filters_features_space = filter_old_indexes.copy()
filters_features_space.append('label_space_dist') 
filters_features_space.append('mixed_space_dist') 
filters_features_space

['old_index_mixed_space_dist',
 'old_index_label_space_dist',
 'old_index_feat_space_dist',
 'label_space_dist',
 'mixed_space_dist']

In [23]:
sampleKnn_feat_space = X2E_wdistances.drop(filters_features_space,1).sort_values(by='feat_space_dist').reset_index().drop('index',1).loc[0:k]

In [24]:
sampleKnn_feat_space.feat_space_dist

0     0.000000
1     0.065897
2     0.079536
3     0.156201
4     0.227313
5     0.231508
6     0.237725
7     0.244699
8     0.246121
9     0.251230
10    0.275045
11    0.277533
12    0.280690
13    0.282880
14    0.294093
Name: feat_space_dist, dtype: float64

#### sample kNN in dataset reale, spazio dei labels

In [25]:
filters_label_space = filter_old_indexes.copy()
filters_label_space.append('feat_space_dist') 
filters_label_space.append('mixed_space_dist') 
filters_label_space

['old_index_mixed_space_dist',
 'old_index_label_space_dist',
 'old_index_feat_space_dist',
 'feat_space_dist',
 'mixed_space_dist']

In [26]:
sampleKnn_label_space = X2E_wdistances.drop(filters_label_space,1).sort_values(by='label_space_dist').reset_index().drop('index',1).loc[0:k]

In [27]:
sampleKnn_label_space.label_space_dist

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
Name: label_space_dist, dtype: float64

#### sample kNN in dataset reale, mixed space

In [28]:
filters_mixed_space = filter_old_indexes.copy()
filters_mixed_space.append('feat_space_dist') 
filters_mixed_space.append('label_space_dist') 
filters_mixed_space

['old_index_mixed_space_dist',
 'old_index_label_space_dist',
 'old_index_feat_space_dist',
 'feat_space_dist',
 'label_space_dist']

In [29]:
sampleKnn_mixed_space = X2E_wdistances.drop(filters_mixed_space,1).sort_values(by='mixed_space_dist').reset_index().drop('index',1).loc[0:k]

In [30]:
sampleKnn_mixed_space.mixed_space_dist

0     0.000000
1     0.050028
2     0.062152
3     0.138846
4     0.210603
5     0.214332
6     0.219858
7     0.223315
8     0.227321
9     0.234604
10    0.244484
11    0.266596
12    0.268543
13    0.269963
14    0.272337
Name: mixed_space_dist, dtype: float64

### Funzione che genera dataset sintetico

In [31]:
def synthetic_neighborhood_generation(sample_Knn, size, discrete_var, continuous_var, classes_name):
    """This function takes as input:
            sample_Knn: dataframe, K nearest neighbors of the instace
            size: int, the number of synthetic instances to generate
            discrete_var: list, name of columns containing discrete variables
            continuous_var: list, name of columns containing continuos variables
            classes_name: list, name of columns containing the classes labels
        And it generates a synthetic neighbothood of instances sampling from features distributions of the sample of K
        nearest neighbors given
    """
    df = sample_Knn.drop(classes_name,1)
    
    if len(continuous_var)>0:
        print('there are continuos variables')
        cont_cols_synthetic_instances = list()
        for col in continuous_var:
            values = df[col].values
            mu = mean(values)
            sigma = std(values)
            new_values = np.random.normal(mu,sigma,size)
            cont_cols_synthetic_instances.append(new_values)
        
        cont_col_syn_df = pd.DataFrame(data=np.column_stack(cont_cols_synthetic_instances),columns=continuous_var)
    
    if len(discrete_var)>0:
        print('there are discrete variables')
        disc_cols_synthetic_instances = list()
        for col in discrete_var:
            values = df[col].values
            diff_values = np.unique(values)
            prob_values = [1.0 * list(values).count(val) / len(values) for val in diff_values]
            new_values = np.random.choice(diff_values, size, prob_values)
            disc_cols_synthetic_instances.append(new_values)
        
        disc_col_syn_df = pd.DataFrame(data=np.column_stack(disc_cols_synthetic_instances),columns=discrete_var)
    
    if (len(continuous_var)>0)&(len(discrete_var)>0): 
        return pd.concat([cont_col_syn_df,disc_col_syn_df],axis=1)
    
    elif len(continuous_var)==0:
        print('there are no continuous variables')
        return disc_col_syn_df 
    
    elif len(discrete_var)==0:
        print('there are no discrete variables')
        return cont_col_syn_df
    else:
        print('Error')

### Creo dataset sintetico n1: unisco il sample dei k primi vicini nello spazio delle feat. e dei label
Unisco i dataset, senza label, poi dalla distribuzione dell'unione delle features dei vicini di entrambe genero istanze sintetiche che vengono infine labellate dalla BB, assegno dei pesi:
* $\alpha$: percentuale di neighbours che voglio dai vicini nello spazio delle features
* $\beta$: percentuale di neighbours che voglio dai vicini nello spazio dei labels

In [32]:
alpha = 0.7
beta = 0.3

if alpha + beta == 1:
    
    subsample_knn_feat = sampleKnn_feat_space.sample(frac=alpha).reset_index().drop(['index','feat_space_dist'],1)
    subsample_knn_label = sampleKnn_label_space.sample(frac=beta).reset_index().drop(['index','label_space_dist'],1)
    alpha_beta_sample_knn = pd.concat([subsample_knn_feat,subsample_knn_label]).reset_index().drop('index',1)
    
    if len(alpha_beta_sample_knn)<len(sampleKnn_feat_space):
        n = len(sampleKnn_feat_space)-len(alpha_beta_sample_knn) 
        if alpha > beta:
            alpha_beta_sample_knn = pd.concat([alpha_beta_sample_knn,sampleKnn_feat_space.sample(n=n).reset_index().drop(['index','feat_space_dist'],1)])
        else:
            alpha_beta_sample_knn = pd.concat([alpha_beta_sample_knn,sampleKnn_label_space.sample(n=n).reset_index().drop(['index','feat_space_dist'],1)])
        
        alpha_beta_sample_knn = alpha_beta_sample_knn.reset_index().drop('index',1)
else:
    print('"alpha + beta" must be = 1')

In [33]:
alpha_beta_sample_knn 

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,Att41,Att42,Att43,Att44,Att45,Att46,Att47,Att48,Att49,Att50,Att51,Att52,Att53,Att54,Att55,Att56,Att57,Att58,Att59,Att60,Att61,Att62,Att63,Att64,Att65,Att66,Att67,Att68,Att69,Att70,Att71,Att72,Att73,Att74,Att75,Att76,Att77,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,BB_Class1,BB_Class2,BB_Class3,BB_Class4,BB_Class5,BB_Class6,BB_Class7,BB_Class8,BB_Class9,BB_Class10,BB_Class11,BB_Class12,BB_Class13,BB_Class14
0,-0.243101,-0.205672,-0.248694,-0.034219,0.156621,0.154864,0.085112,-0.068013,-0.100219,-0.187126,-0.165675,-0.104043,0.146606,0.186126,0.129681,0.00089,-0.032185,-0.122669,-0.085216,-0.144664,-0.114884,-0.096168,0.008137,0.096948,0.118808,0.171047,0.123592,0.101084,0.133404,0.076853,0.04318,-0.073887,-0.099099,-0.054074,0.247954,0.22395,0.016983,-0.18995,-0.109138,0.088203,0.217785,0.189939,0.053865,-0.117555,-0.012023,0.040345,0.094835,0.087771,0.073481,0.078041,0.041332,-0.012408,-0.028905,0.020585,-0.002694,-0.089443,0.110567,0.009616,-0.028597,-0.025549,-0.120179,-0.159154,-0.040511,-0.099395,0.057176,0.051416,0.032382,-0.042116,0.008329,-0.047935,-0.01598,-0.061355,0.078263,0.0364,-0.017184,0.058671,0.066464,-0.023208,-0.034437,0.011289,-0.022766,-0.038651,-0.006961,-0.040736,0.012969,0.031584,-0.013187,-0.022304,-0.000158,-0.01835,-0.026431,-0.00375,-0.030709,0.011306,-0.016213,-0.010599,-0.021486,-0.023411,-0.004461,-0.025329,-0.021729,0.017858,0.113591,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.302206,-0.162303,-0.278277,-0.196312,-0.026792,0.203443,0.122917,0.044974,-0.070958,-0.187087,-0.184181,-0.289375,-0.054468,0.001237,0.144244,0.029978,0.115914,0.056259,0.064454,-0.059129,-0.090414,-0.228573,-0.140069,0.036917,0.087933,0.194145,0.083581,0.054361,0.090298,0.097091,0.063689,-0.067058,0.040568,0.007762,0.02476,-0.085931,-0.060003,-0.025099,0.086221,0.120391,-0.056475,-0.159217,-0.064481,0.019805,-0.048644,0.032503,-0.02168,-0.054386,0.092241,0.14089,0.17756,0.152233,0.144961,-0.095067,0.130171,0.035171,0.129379,0.087408,-0.008673,-0.037922,-0.088678,-0.100545,-0.014204,-0.015272,0.066322,0.021265,0.052252,0.063216,-0.010259,-0.051259,-0.042655,-0.074424,-0.043143,-0.131647,-0.062004,-0.049491,0.02064,-0.099425,-0.053999,0.005743,-0.019265,-0.035943,-0.022161,-0.052948,0.009595,0.001179,0.007856,-0.024307,-0.014969,-0.036196,-0.03644,-0.015967,-0.033148,-0.004722,-0.01119,-0.003662,0.004883,0.001894,0.003226,-0.017343,-0.021002,0.035864,0.116767,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.183731,-0.195944,-0.154715,-0.059055,0.077589,0.243455,0.142082,0.146985,0.046971,-0.092452,-0.094777,-0.168326,-0.010605,0.09099,0.189122,0.122119,0.153408,0.016591,-0.116603,-0.137195,-0.109229,-0.166096,-0.059297,-0.117435,-0.048793,0.042291,0.01972,-0.029983,-0.120303,0.031036,0.051984,-0.123478,-0.081108,-0.036929,0.047079,0.135717,0.13268,-0.013113,-0.085498,-0.087054,0.105766,0.098831,0.085083,-0.122368,-0.02886,-0.104647,-0.044358,0.047075,0.098086,0.064983,0.06221,0.037726,0.040573,0.015673,0.017591,-0.022054,0.143305,0.101794,0.028564,-0.014931,-0.020495,-0.077336,0.029985,0.069068,0.110314,-0.006339,0.031139,-0.052189,-0.205151,0.053517,-0.077472,-0.094407,0.074739,-0.02519,0.002498,0.020265,0.031952,0.006413,-0.050731,0.010261,-0.065656,-0.038285,0.031027,0.01658,-0.061394,0.064925,-0.004127,-0.060079,0.084924,0.046177,0.163798,0.417136,-0.073816,-0.006896,-0.066373,-0.051539,-0.065815,-0.065641,-0.030185,-0.074561,-0.073309,-0.006347,-0.007961,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-0.076461,-0.110861,-0.044888,0.140263,0.180233,0.203281,0.090254,0.158931,0.152825,-0.045237,0.072124,0.072205,0.155549,0.214921,0.103413,0.140575,0.044001,0.040024,-0.085834,-0.090243,-0.00595,0.07052,0.117486,0.169685,0.183136,0.08282,0.017288,0.012843,0.104436,-0.025192,-0.077491,-0.124187,-0.075698,-0.076745,0.021168,0.063056,0.009477,-0.101562,-0.100163,-0.018974,0.038594,0.045948,0.01686,-0.006556,-0.077086,-0.095915,-0.049652,0.085769,0.010153,-0.036611,-0.03601,-0.035432,-0.013148,0.01899,0.003663,0.006082,-0.043793,-0.026452,0.000715,-0.101743,-0.230555,-0.185318,-0.044631,-0.171782,0.21102,0.077043,0.267048,0.259431,-0.010318,0.113252,-0.193173,-0.131387,0.112887,0.109167,0.190072,0.09828,-0.002846,-0.100783,-0.116131,0.002025,-0.015943,-0.017404,-0.007785,-0.041461,-0.000963,0.001347,-0.005968,-0.021875,-0.021775,-0.027963,-0.036266,-0.00429,-0.028749,0.008519,-0.00694,-0.016179,-0.017997,-0.013811,-0.009548,-0.032718,0.01796,0.023992,0.116674,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.267527,-0.079551,-0.049354,0.100268,0.061778,0.188135,0.087308,-0.02802,-0.131187,-0.030721,-0.103029,-0.077727,0.080777,0.043315,0.012775,-0.095299,-0.089836,-0.107955,-0.084651,-0.090187,-0.012703,0.079516,0.025249,0.110323,0.184128,0.196778,-0.001126,0.082718,0.172565,0.058631,0.011564,0.050357,-0.058016,-0.095673,0.029358,0.063399,-0.022012,-0.170434,-0.093853,0.074789,0.063942,-0.033589,-0.056124,-0.167528,-0.080016,-0.112979,-0.050972,0.0883,0.220223,0.062193,0.117783,0.011043,-0.029721,0.106423,0.051618,-0.222753,0.136785,-0.048457,0.011089,-0.008389,-0.063392,-0.088741,-0.015866,-0.037244,0.224762,0.063838,0.247152,0.285981,-0.01015,0.001337,-0.212766,-0.161998,-0.017284,-0.019699,0.152642,0.057652,0.075427,0.013559,-0.128663,0.006985,-0.010993,-0.034018,-0.020668,-0.049146,0.004404,0.011469,0.01091,-0.029276,-0.017918,-0.033886,-0.034399,0.003563,-0.031645,0.005313,-0.01462,-0.013181,-0.016351,-0.017334,-0.000351,-0.029102,-0.01758,0.032084,0.126293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5,-0.052108,-0.161482,-0.027452,0.032989,0.149861,0.004815,0.200537,0.01134,-0.024962,-0.21594,-0.046181,-0.056054,0.074759,-0.047305,0.164155,-0.030505,0.147382,-0.128027,-0.010142,0.036443,-0.02271,-0.134901,-0.027017,0.017153,0.097821,0.070221,0.010147,-0.039625,0.095962,0.013738,-0.052755,-0.045765,-0.194104,-0.208831,-0.095741,0.049823,0.097793,-0.075547,0.026637,0.022588,0.025472,0.070954,0.020685,-0.123453,-0.030213,-0.057533,-0.027424,-0.158441,0.122316,0.128288,0.080531,0.029752,-0.014091,-0.025864,-0.105384,-0.144527,0.202435,0.181414,0.021993,-0.075431,-0.099751,-0.099754,-0.023524,-0.051152,0.102622,-0.097046,0.091864,0.063193,0.07149,-0.262371,-0.082718,-0.037528,0.179537,0.1944,0.236346,0.222341,0.212712,-0.075165,-0.150152,0.015804,-0.012311,-0.036666,-0.022964,-0.053681,0.001787,0.014897,-0.007176,-0.03146,-0.021273,-0.029581,-0.030637,0.005513,-0.033961,-0.001964,-0.001006,-0.015433,-0.018961,0.008573,-0.007959,-0.033607,-0.020191,0.034205,0.127585,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
6,-0.146756,-0.11159,-0.165122,-0.037362,0.026046,0.052774,0.099808,0.032864,0.054813,-0.106088,-0.104797,-0.091357,-0.006019,0.128037,0.080887,0.111777,0.026992,0.020741,-0.158365,-0.214472,-0.207301,-0.084839,-0.06416,0.065377,0.127629,0.118839,0.138496,0.13958,0.106954,0.109529,0.059252,0.059723,-0.188538,-0.106119,0.047565,0.171403,0.184056,0.077466,-0.083791,0.013553,0.084641,0.136296,0.112221,-0.016391,-0.051316,-0.055622,0.001248,0.031705,-0.019595,-0.092728,-0.087663,-0.102855,-0.094447,0.062678,-0.007712,0.024481,-0.049042,-0.117824,0.078467,-0.014279,-0.001712,0.080151,0.094865,0.044775,0.017855,-0.03315,0.145507,0.138047,0.049348,-0.032237,0.013836,0.067833,0.016163,0.050376,0.072886,0.057707,0.039833,-0.029351,-0.013791,0.081411,-0.089813,-0.155705,-0.123209,0.044886,0.152912,0.345896,0.06194,0.19175,0.04615,-0.083928,-0.09498,-0.032325,-0.117819,0.01801,-0.027742,-0.016095,-0.005644,0.117596,-0.051041,-0.045131,-0.094982,-0.077043,-0.002994,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7,-0.336368,-0.264265,-0.079509,0.079279,0.138122,0.176845,0.113826,0.030805,-0.090455,-0.168568,-0.16757,-0.018146,0.06734,0.155927,0.050966,0.059354,0.031571,-0.064823,-0.090038,-0.181846,-0.195774,-0.152882,0.02722,0.127324,0.173278,0.186433,0.138912,0.082352,0.111821,0.061422,0.03121,-0.074187,-0.182471,-0.123653,0.114784,0.160352,0.084206,-0.180151,-0.133085,0.051809,0.091921,0.169096,0.079361,-0.155516,-0.033242,0.016633,0.053564,-0.076308,0.064204,0.013048,0.086044,0.07442,0.029108,-0.022875,0.043676,-0.001504,0.120665,0.057543,-0.039777,-0.018079,-0.004325,-0.095057,-0.047066,0.088028,0.054502,-0.075925,0.0101,0.008814,-0.038041,0.004918,0.012792,-0.002915,-0.06489,-0.042063,-0.100828,-0.007664,-0.028377,-0.082237,-0.032756,-0.046335,-0.066191,-0.01461,-0.057379,-0.071835,0.094155,0.093084,0.054241,0.021052,0.054333,0.048863,-0.005119,-0.003093,0.001758,0.035596,-0.034915,0.067611,-0.075334,-0.086622,-0.08315,-0.013931,-0.052398,-0.04626,0.089945,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8,-0.080961,-0.034301,-0.082809,0.140107,0.05463,0.261941,0.057939,0.240662,0.124766,-0.092193,-0.010755,-0.031434,-0.075058,0.22944,0.176864,0.267778,0.045184,0.122322,-0.056013,-0.09975,0.087519,-0.022989,-0.032427,0.143512,0.210412,0.048912,0.060345,0.008959,0.054179,-0.06138,0.036521,-0.042307,-0.09861,-0.150269,0.014007,0.083485,-0.072432,-0.127895,-0.149852,-0.042276,0.252177,-0.001752,-0.128037,0.10159,-0.11692,-0.115481,-0.008058,0.074249,0.035289,-0.015102,-0.034358,-0.059743,-0.061849,0.130952,-0.005376,-0.064676,0.051497,-0.06371,-0.085539,-0.124903,-0.137222,-0.084409,-0.068256,-0.043071,0.128147,0.104877,0.07993,0.079207,0.037931,-0.141979,5.6e-05,-0.023329,0.225417,0.120051,0.081139,-0.066551,-0.026568,0.094547,-0.19517,-0.002115,-0.043866,-0.060947,-0.011203,0.026289,0.031887,0.008568,-0.035643,-0.04343,-0.011608,0.006561,0.008937,0.037659,-0.049705,0.061571,-0.042326,-0.022349,-0.030051,-0.023114,0.015622,-0.054995,-0.018296,0.014501,0.110608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
9,-0.309162,-0.293449,-0.226471,-0.053738,0.166482,0.211397,0.033018,0.09776,-0.118169,-0.097225,-0.14217,-0.044286,0.12571,0.180444,0.150553,0.056325,-0.025249,-0.078387,-0.142739,-0.212997,-0.200354,-0.149248,-0.03868,0.009394,0.113395,0.092664,0.078161,0.021865,-0.008002,0.022718,-0.020098,-0.172778,-0.004861,-0.071625,0.206893,0.170465,0.021627,-0.127931,-0.081262,0.10122,0.209297,0.160945,0.055857,-0.020829,0.024393,0.08035,0.126588,0.095773,0.068911,0.069201,0.041877,0.029866,-0.008735,0.013846,0.051279,-0.033721,0.102376,0.01173,-0.012276,-0.038979,-0.10877,-0.152151,-0.05784,-0.070232,0.079112,0.104968,0.042377,0.016643,0.021903,-0.058559,-0.052362,-0.099329,0.025441,-0.001119,-0.014078,0.043074,0.110324,-0.024618,0.008439,-0.004752,0.033209,-0.030021,-0.024066,-0.05637,0.003887,-0.006585,-0.015494,-0.034618,-0.027591,-0.041515,-0.041419,-0.014683,0.005708,-0.01239,0.02271,-0.018168,0.015756,0.013164,-0.01129,-0.02426,-0.021463,0.01439,0.109392,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [34]:
size = int(100*k)
print(str(size)+' vicini')

synthetic_neighborhood1 = synthetic_neighborhood_generation(alpha_beta_sample_knn, size, discrete_var=[], continuous_var=filter_col_X, classes_name=filter_col_Y_BB)
BB_label_syn1_df = pd.DataFrame(best_rf.predict(synthetic_neighborhood1.values),columns=filter_col_Y_BB)
synthetic_neighborhood1 = pd.concat([synthetic_neighborhood1,BB_label_syn1_df],1)
synthetic_neighborhood1.head()

1400 vicini
there are continuos variables
there are no discrete variables


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,Att41,Att42,Att43,Att44,Att45,Att46,Att47,Att48,Att49,Att50,Att51,Att52,Att53,Att54,Att55,Att56,Att57,Att58,Att59,Att60,Att61,Att62,Att63,Att64,Att65,Att66,Att67,Att68,Att69,Att70,Att71,Att72,Att73,Att74,Att75,Att76,Att77,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,BB_Class1,BB_Class2,BB_Class3,BB_Class4,BB_Class5,BB_Class6,BB_Class7,BB_Class8,BB_Class9,BB_Class10,BB_Class11,BB_Class12,BB_Class13,BB_Class14
0,-0.080177,-0.168137,-0.080348,0.023681,-0.042954,0.284145,0.023936,-0.00769,0.001033,-0.133736,0.075471,-0.098012,0.011584,-0.032595,0.11035,0.056445,-0.001598,0.11329,-0.025122,-0.191263,-0.01296,0.082896,-0.051106,0.043014,0.127816,0.088249,-0.060055,0.195707,0.120609,-0.020257,0.031589,0.045554,0.059229,-0.104252,0.178293,-0.168078,-0.046579,-0.151475,0.161731,0.042447,0.047574,0.007453,-0.010863,0.085711,-0.053864,0.006428,-0.065489,-0.035464,-0.082317,0.067922,0.001618,0.05505,0.055793,0.031278,0.024059,0.017784,0.001373,0.031939,0.13745,-0.102773,0.006853,0.059138,-0.144897,0.050659,0.053505,0.013912,0.124368,-0.011844,0.079682,0.157107,0.109463,0.052684,0.001081,0.009345,-0.012047,-0.039091,0.060212,-0.040271,-0.016296,-0.064356,-0.093024,-0.005397,-0.120959,0.001356,0.098318,-0.040719,-0.038424,-0.011188,-0.011697,0.022005,-0.016878,-0.0841,-0.075765,-0.063421,-0.008307,-0.000902,-0.08276,0.024945,0.047059,-0.064928,-0.070056,0.04965,0.151257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.198673,-0.016393,-0.170693,0.040807,0.084397,0.178961,0.111434,0.037508,-0.006224,-0.14421,-0.050014,-0.201471,-0.013312,0.074762,-0.011939,-0.008849,0.02048,-0.09593,-0.035969,-0.306002,-0.178684,-0.293064,-0.038722,-0.006492,0.098409,-0.040533,0.0039,-0.031406,0.063551,0.088103,-0.118628,-0.125115,-0.134383,-0.112782,0.19232,0.108146,-0.061073,-0.079373,0.007579,-0.018485,0.074964,0.09055,-0.105221,-0.010796,-0.02145,0.039394,-0.019606,0.148313,0.223249,0.076589,-0.058634,-0.01412,0.010992,0.0685,-0.021688,-0.134741,0.041378,-0.054923,-0.03031,0.015657,-0.215853,0.251099,-0.013723,-0.043094,0.216513,-0.014687,0.143381,0.038736,0.014721,-0.057348,0.061444,0.138357,0.016105,-0.083404,-0.006476,-0.037335,-0.009953,-0.023181,-0.115464,0.045286,-0.050906,-0.059943,-0.117538,0.01599,0.055373,0.014871,-0.071863,-0.028605,-0.050722,-0.052353,-0.078032,0.000172,-0.095645,-0.047771,-0.052756,-0.003076,-0.052888,-0.096634,0.004388,-0.103126,-0.068366,0.032212,0.073869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.090238,0.024344,-0.220889,0.019558,0.16254,0.162894,0.013841,-0.129302,-0.069627,-0.137394,0.052425,-0.180605,0.025195,0.088593,0.113404,-0.022918,0.015918,-0.02064,-0.195,-0.020015,0.070035,-0.10319,-0.093525,0.147581,0.113357,0.103223,0.049459,0.005665,0.043995,0.101917,0.001737,0.059498,-0.022344,-0.173965,0.068217,0.04065,-0.103499,-0.126785,-0.002279,-0.033184,0.135005,0.087095,0.056328,-0.027119,-0.060619,0.084389,-0.077832,-0.034687,-0.007226,0.099189,0.240767,-0.00486,-0.083115,0.111537,-0.027964,-0.131424,-0.033596,0.098501,0.1509,-0.068642,-0.024645,-0.101948,0.048452,0.047557,0.24349,-0.007331,0.080505,0.068471,0.040943,-0.000898,0.052772,0.029582,0.033484,-0.034635,0.040016,0.016705,0.052168,-0.05315,-0.092008,0.031918,-0.049236,-0.1214,0.03261,-0.058589,0.087389,0.260953,0.009972,0.062261,0.021391,-0.006751,0.053372,0.342117,0.000379,0.033853,-0.055005,-0.045759,-0.043372,0.061727,-0.007066,-0.019559,-0.026996,-0.039434,0.08169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.121114,0.003614,-0.153254,0.140673,0.107849,0.217596,-0.049127,-0.00689,0.077595,-0.243613,0.081979,-0.10119,0.222437,0.109391,0.028538,0.175443,0.099547,-0.110838,-0.068626,-0.055105,-0.107328,-0.162394,-0.036344,0.165536,0.194748,0.003236,0.146274,-0.034047,0.146063,-0.010951,0.008443,-0.088338,-0.04336,0.038539,-0.107727,-0.013858,-0.01363,-0.036914,-0.044939,-0.175093,0.121743,-0.193278,0.129085,0.002449,0.034715,-0.108215,0.050274,0.062753,0.053771,-0.012528,0.046792,-0.065501,-0.025041,-0.144958,-0.033263,-0.014691,0.108659,0.085926,0.043048,-0.017156,-0.09601,-0.053097,0.033081,0.043402,0.115727,-0.018764,-0.079365,0.171921,0.110297,-0.106157,-0.126535,-0.046837,0.099098,-0.093723,-0.201189,-0.035009,0.110105,-0.083129,-0.105965,0.032406,-0.015815,-0.033993,-0.04216,-0.058506,0.106703,0.064055,-0.034181,0.003627,0.026592,0.132104,-0.018875,0.127446,-0.030245,-0.093328,-0.039177,0.042114,-0.064341,-0.013963,-0.027618,0.07491,-0.012665,-0.081417,0.117197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.242816,-0.162936,-0.176508,0.059907,-0.059655,0.044333,-0.135013,0.110968,0.061012,-0.017327,0.091175,-0.110996,0.072466,0.19256,-0.040967,0.026259,-0.015775,-0.130607,0.027619,-0.059249,-0.012514,0.062344,-0.08862,0.006488,-0.024113,0.053363,-0.006114,-0.024836,-0.0611,0.060122,0.042616,-0.083301,0.034907,0.107631,0.042694,-0.055015,-0.068824,0.043625,0.010343,-0.057001,-0.106288,0.073943,-0.087011,-0.004456,-0.065006,-0.087566,0.03769,-0.234745,-0.195124,-0.015132,0.008654,-0.033462,-0.070632,-0.004758,0.029104,-0.01467,0.130965,-0.013947,0.04093,-0.102331,-0.108391,0.018882,-0.100109,-0.018777,0.048824,-0.056526,0.072247,-0.063031,-0.036143,0.006661,0.083124,0.03436,0.038057,-0.029203,0.162713,0.071734,0.164904,-0.07173,-0.053981,-0.050676,-0.003923,0.010638,0.027926,0.007916,0.018391,0.117177,-0.052376,0.046788,-0.026218,-0.002796,-0.01521,0.201468,-0.116508,0.081488,-0.070671,-0.072225,-0.0287,-0.015675,-0.029699,-0.026765,-0.130795,-0.040663,0.042022,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


### Creo dataset sintetico n2: uso k i più vicini nello spazio "misto" fatto da features e labels

In [35]:
sampleKnn_mixed_space

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,Att41,Att42,Att43,Att44,Att45,Att46,Att47,Att48,Att49,Att50,Att51,Att52,Att53,Att54,Att55,Att56,Att57,Att58,Att59,Att60,Att61,Att62,Att63,Att64,Att65,Att66,Att67,Att68,Att69,Att70,Att71,Att72,Att73,Att74,Att75,Att76,Att77,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,BB_Class1,BB_Class2,BB_Class3,BB_Class4,BB_Class5,BB_Class6,BB_Class7,BB_Class8,BB_Class9,BB_Class10,BB_Class11,BB_Class12,BB_Class13,BB_Class14,mixed_space_dist
0,-0.237767,-0.301962,-0.190007,-0.11174,0.107342,0.151138,0.078503,0.126912,-0.1407,-0.110413,-0.213051,-0.072345,0.099195,0.24483,0.140097,0.172761,-0.026466,-0.05256,-0.114385,-0.029327,0.030635,-0.097084,-0.023899,0.088416,0.117679,0.166086,0.144204,0.095606,0.09527,0.085386,0.032839,-0.073011,-0.101862,-0.073431,0.176279,0.19001,0.051356,-0.136759,-0.121242,0.090113,0.210776,0.131554,0.043102,-0.094929,-0.048674,0.005095,0.051176,0.152017,0.06668,0.034598,0.01531,-0.011573,-0.048201,0.033586,0.031377,-0.023597,0.086311,-0.021115,0.008013,-0.039115,-0.102837,-0.212227,-0.070543,-0.061762,0.038306,0.057555,0.046176,-0.04414,-0.006592,-0.025837,-0.136148,-0.084651,0.050802,0.069693,0.047174,0.146361,0.102672,0.013571,-0.01214,0.00291,-0.017459,-0.032257,-0.021303,-0.049918,0.005007,0.002428,0.030908,-0.028672,-0.019971,-0.032455,-0.03294,-0.014226,-0.008098,-0.000901,-0.016153,-0.005755,-0.009729,-0.018397,-0.007275,-0.02085,-0.012201,0.04657,0.111724,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-0.243101,-0.205672,-0.248694,-0.034219,0.156621,0.154864,0.085112,-0.068013,-0.100219,-0.187126,-0.165675,-0.104043,0.146606,0.186126,0.129681,0.00089,-0.032185,-0.122669,-0.085216,-0.144664,-0.114884,-0.096168,0.008137,0.096948,0.118808,0.171047,0.123592,0.101084,0.133404,0.076853,0.04318,-0.073887,-0.099099,-0.054074,0.247954,0.22395,0.016983,-0.18995,-0.109138,0.088203,0.217785,0.189939,0.053865,-0.117555,-0.012023,0.040345,0.094835,0.087771,0.073481,0.078041,0.041332,-0.012408,-0.028905,0.020585,-0.002694,-0.089443,0.110567,0.009616,-0.028597,-0.025549,-0.120179,-0.159154,-0.040511,-0.099395,0.057176,0.051416,0.032382,-0.042116,0.008329,-0.047935,-0.01598,-0.061355,0.078263,0.0364,-0.017184,0.058671,0.066464,-0.023208,-0.034437,0.011289,-0.022766,-0.038651,-0.006961,-0.040736,0.012969,0.031584,-0.013187,-0.022304,-0.000158,-0.01835,-0.026431,-0.00375,-0.030709,0.011306,-0.016213,-0.010599,-0.021486,-0.023411,-0.004461,-0.025329,-0.021729,0.017858,0.113591,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.050028
2,-0.309162,-0.293449,-0.226471,-0.053738,0.166482,0.211397,0.033018,0.09776,-0.118169,-0.097225,-0.14217,-0.044286,0.12571,0.180444,0.150553,0.056325,-0.025249,-0.078387,-0.142739,-0.212997,-0.200354,-0.149248,-0.03868,0.009394,0.113395,0.092664,0.078161,0.021865,-0.008002,0.022718,-0.020098,-0.172778,-0.004861,-0.071625,0.206893,0.170465,0.021627,-0.127931,-0.081262,0.10122,0.209297,0.160945,0.055857,-0.020829,0.024393,0.08035,0.126588,0.095773,0.068911,0.069201,0.041877,0.029866,-0.008735,0.013846,0.051279,-0.033721,0.102376,0.01173,-0.012276,-0.038979,-0.10877,-0.152151,-0.05784,-0.070232,0.079112,0.104968,0.042377,0.016643,0.021903,-0.058559,-0.052362,-0.099329,0.025441,-0.001119,-0.014078,0.043074,0.110324,-0.024618,0.008439,-0.004752,0.033209,-0.030021,-0.024066,-0.05637,0.003887,-0.006585,-0.015494,-0.034618,-0.027591,-0.041515,-0.041419,-0.014683,0.005708,-0.01239,0.02271,-0.018168,0.015756,0.013164,-0.01129,-0.02426,-0.021463,0.01439,0.109392,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.062152
3,-0.336368,-0.264265,-0.079509,0.079279,0.138122,0.176845,0.113826,0.030805,-0.090455,-0.168568,-0.16757,-0.018146,0.06734,0.155927,0.050966,0.059354,0.031571,-0.064823,-0.090038,-0.181846,-0.195774,-0.152882,0.02722,0.127324,0.173278,0.186433,0.138912,0.082352,0.111821,0.061422,0.03121,-0.074187,-0.182471,-0.123653,0.114784,0.160352,0.084206,-0.180151,-0.133085,0.051809,0.091921,0.169096,0.079361,-0.155516,-0.033242,0.016633,0.053564,-0.076308,0.064204,0.013048,0.086044,0.07442,0.029108,-0.022875,0.043676,-0.001504,0.120665,0.057543,-0.039777,-0.018079,-0.004325,-0.095057,-0.047066,0.088028,0.054502,-0.075925,0.0101,0.008814,-0.038041,0.004918,0.012792,-0.002915,-0.06489,-0.042063,-0.100828,-0.007664,-0.028377,-0.082237,-0.032756,-0.046335,-0.066191,-0.01461,-0.057379,-0.071835,0.094155,0.093084,0.054241,0.021052,0.054333,0.048863,-0.005119,-0.003093,0.001758,0.035596,-0.034915,0.067611,-0.075334,-0.086622,-0.08315,-0.013931,-0.052398,-0.04626,0.089945,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.138846
4,-0.267527,-0.079551,-0.049354,0.100268,0.061778,0.188135,0.087308,-0.02802,-0.131187,-0.030721,-0.103029,-0.077727,0.080777,0.043315,0.012775,-0.095299,-0.089836,-0.107955,-0.084651,-0.090187,-0.012703,0.079516,0.025249,0.110323,0.184128,0.196778,-0.001126,0.082718,0.172565,0.058631,0.011564,0.050357,-0.058016,-0.095673,0.029358,0.063399,-0.022012,-0.170434,-0.093853,0.074789,0.063942,-0.033589,-0.056124,-0.167528,-0.080016,-0.112979,-0.050972,0.0883,0.220223,0.062193,0.117783,0.011043,-0.029721,0.106423,0.051618,-0.222753,0.136785,-0.048457,0.011089,-0.008389,-0.063392,-0.088741,-0.015866,-0.037244,0.224762,0.063838,0.247152,0.285981,-0.01015,0.001337,-0.212766,-0.161998,-0.017284,-0.019699,0.152642,0.057652,0.075427,0.013559,-0.128663,0.006985,-0.010993,-0.034018,-0.020668,-0.049146,0.004404,0.011469,0.01091,-0.029276,-0.017918,-0.033886,-0.034399,0.003563,-0.031645,0.005313,-0.01462,-0.013181,-0.016351,-0.017334,-0.000351,-0.029102,-0.01758,0.032084,0.126293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.210603
5,-0.096483,-0.143322,-0.149789,0.018188,0.119646,0.080453,0.174186,0.046866,-0.007467,-0.080794,-0.078267,-0.070072,0.056432,0.078454,0.093125,0.038433,0.054273,-0.087719,-0.185936,-0.110731,-0.073262,-0.035908,0.080308,0.145312,0.184568,0.23332,0.180483,0.195615,0.182544,0.052966,-0.016316,0.056401,0.078747,0.031595,0.096676,0.157616,0.112127,-0.058702,-0.116454,-0.039143,0.005877,0.123708,0.045027,-0.258117,-0.114213,-0.181763,-0.082629,-0.039629,0.09776,0.033597,0.144244,0.151147,0.156802,-0.047261,0.043283,0.018409,0.087054,0.00173,-0.054501,-0.162537,-0.169961,-0.08737,0.028819,0.052449,0.067611,-0.233388,-0.067615,-0.104969,0.060157,-0.096811,-0.072128,-0.031109,0.078458,0.065061,0.122238,-0.011346,-0.095782,-0.117242,-0.157308,0.002595,-0.002121,-0.02963,-0.022589,-0.038144,-0.000717,0.013036,-0.002364,-0.01786,-0.014032,-0.040113,-0.040411,-0.009895,-0.036795,-0.004721,-0.010101,-0.00585,-0.021543,-0.022208,-0.009291,-0.024903,-0.004255,0.027359,0.131057,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.214332
6,-0.25342,-0.282784,-0.180796,-0.107719,0.044806,-0.036124,0.106875,0.129091,-0.015911,0.117281,-0.117294,0.046884,-0.035583,0.079327,0.00727,0.077297,-0.047777,0.11288,-0.140774,-0.128787,-0.185001,-0.146971,-0.138887,-0.040018,0.013749,0.199223,0.130255,0.125373,-0.062788,0.08552,0.054098,0.000283,-0.052449,-0.116634,0.034317,0.124093,0.148244,0.011686,-0.106041,-0.036927,0.010487,0.070239,0.035478,-0.054486,-0.08929,-0.143395,-0.059068,-0.062586,0.023623,0.097706,0.173776,0.158769,0.136139,0.091061,0.046198,0.005694,0.219379,0.111455,-0.000772,0.005677,-0.021895,-0.02756,0.083565,-0.114755,0.051469,0.003986,0.113201,0.100415,-0.093572,-0.144203,-0.207874,-0.203261,0.007301,-0.052102,0.019461,0.084845,0.074198,-0.072631,-0.093609,-0.005971,-0.068025,-0.028533,-0.001274,-0.023005,-0.003217,0.020817,-0.03024,-0.031922,0.009311,-0.063476,-0.073669,-0.0671,-0.021705,-0.063541,0.050087,-0.0427,0.073626,0.095997,0.030442,-0.02985,-0.016147,-0.033696,0.117002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.219858
7,-0.183731,-0.195944,-0.154715,-0.059055,0.077589,0.243455,0.142082,0.146985,0.046971,-0.092452,-0.094777,-0.168326,-0.010605,0.09099,0.189122,0.122119,0.153408,0.016591,-0.116603,-0.137195,-0.109229,-0.166096,-0.059297,-0.117435,-0.048793,0.042291,0.01972,-0.029983,-0.120303,0.031036,0.051984,-0.123478,-0.081108,-0.036929,0.047079,0.135717,0.13268,-0.013113,-0.085498,-0.087054,0.105766,0.098831,0.085083,-0.122368,-0.02886,-0.104647,-0.044358,0.047075,0.098086,0.064983,0.06221,0.037726,0.040573,0.015673,0.017591,-0.022054,0.143305,0.101794,0.028564,-0.014931,-0.020495,-0.077336,0.029985,0.069068,0.110314,-0.006339,0.031139,-0.052189,-0.205151,0.053517,-0.077472,-0.094407,0.074739,-0.02519,0.002498,0.020265,0.031952,0.006413,-0.050731,0.010261,-0.065656,-0.038285,0.031027,0.01658,-0.061394,0.064925,-0.004127,-0.060079,0.084924,0.046177,0.163798,0.417136,-0.073816,-0.006896,-0.066373,-0.051539,-0.065815,-0.065641,-0.030185,-0.074561,-0.073309,-0.006347,-0.007961,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.223315
8,-0.080961,-0.034301,-0.082809,0.140107,0.05463,0.261941,0.057939,0.240662,0.124766,-0.092193,-0.010755,-0.031434,-0.075058,0.22944,0.176864,0.267778,0.045184,0.122322,-0.056013,-0.09975,0.087519,-0.022989,-0.032427,0.143512,0.210412,0.048912,0.060345,0.008959,0.054179,-0.06138,0.036521,-0.042307,-0.09861,-0.150269,0.014007,0.083485,-0.072432,-0.127895,-0.149852,-0.042276,0.252177,-0.001752,-0.128037,0.10159,-0.11692,-0.115481,-0.008058,0.074249,0.035289,-0.015102,-0.034358,-0.059743,-0.061849,0.130952,-0.005376,-0.064676,0.051497,-0.06371,-0.085539,-0.124903,-0.137222,-0.084409,-0.068256,-0.043071,0.128147,0.104877,0.07993,0.079207,0.037931,-0.141979,5.6e-05,-0.023329,0.225417,0.120051,0.081139,-0.066551,-0.026568,0.094547,-0.19517,-0.002115,-0.043866,-0.060947,-0.011203,0.026289,0.031887,0.008568,-0.035643,-0.04343,-0.011608,0.006561,0.008937,0.037659,-0.049705,0.061571,-0.042326,-0.022349,-0.030051,-0.023114,0.015622,-0.054995,-0.018296,0.014501,0.110608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.227321
9,-0.076461,-0.110861,-0.044888,0.140263,0.180233,0.203281,0.090254,0.158931,0.152825,-0.045237,0.072124,0.072205,0.155549,0.214921,0.103413,0.140575,0.044001,0.040024,-0.085834,-0.090243,-0.00595,0.07052,0.117486,0.169685,0.183136,0.08282,0.017288,0.012843,0.104436,-0.025192,-0.077491,-0.124187,-0.075698,-0.076745,0.021168,0.063056,0.009477,-0.101562,-0.100163,-0.018974,0.038594,0.045948,0.01686,-0.006556,-0.077086,-0.095915,-0.049652,0.085769,0.010153,-0.036611,-0.03601,-0.035432,-0.013148,0.01899,0.003663,0.006082,-0.043793,-0.026452,0.000715,-0.101743,-0.230555,-0.185318,-0.044631,-0.171782,0.21102,0.077043,0.267048,0.259431,-0.010318,0.113252,-0.193173,-0.131387,0.112887,0.109167,0.190072,0.09828,-0.002846,-0.100783,-0.116131,0.002025,-0.015943,-0.017404,-0.007785,-0.041461,-0.000963,0.001347,-0.005968,-0.021875,-0.021775,-0.027963,-0.036266,-0.00429,-0.028749,0.008519,-0.00694,-0.016179,-0.017997,-0.013811,-0.009548,-0.032718,0.01796,0.023992,0.116674,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.234604


In [36]:
size = int(100*k)
print(str(size)+' vicini')

filter_col_Y_BB = ['BB_'+s for s in filter_col_Y]

synthetic_neighborhood2 = synthetic_neighborhood_generation(sampleKnn_mixed_space, size, discrete_var=[], continuous_var=filter_col_X, classes_name=filter_col_Y_BB)
BB_label_syn2_df = pd.DataFrame(best_rf.predict(synthetic_neighborhood2.values),columns=filter_col_Y_BB)
synthetic_neighborhood2 = pd.concat([synthetic_neighborhood2,BB_label_syn2_df],1)
synthetic_neighborhood2.head()

1400 vicini
there are continuos variables
there are no discrete variables


Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,Att41,Att42,Att43,Att44,Att45,Att46,Att47,Att48,Att49,Att50,Att51,Att52,Att53,Att54,Att55,Att56,Att57,Att58,Att59,Att60,Att61,Att62,Att63,Att64,Att65,Att66,Att67,Att68,Att69,Att70,Att71,Att72,Att73,Att74,Att75,Att76,Att77,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,BB_Class1,BB_Class2,BB_Class3,BB_Class4,BB_Class5,BB_Class6,BB_Class7,BB_Class8,BB_Class9,BB_Class10,BB_Class11,BB_Class12,BB_Class13,BB_Class14
0,-0.136622,-0.22248,-0.250527,-0.066919,0.086801,0.107156,0.155482,0.184088,-0.040427,-0.072391,-0.08177,-0.051414,-0.019758,0.023754,0.064444,0.170489,0.187352,0.123698,0.02241,-0.157021,-0.031474,0.058213,-0.023514,0.078417,0.206007,0.101663,0.102318,0.11904,0.061054,0.059444,-0.028836,-0.043167,-0.138649,-0.15949,0.18422,0.143528,-0.005467,0.019903,-0.028165,0.07654,-0.035352,0.022498,0.047918,-0.123545,-2.7e-05,-0.127568,0.061899,-0.003099,-0.033064,0.053728,-0.003851,0.127191,-0.017856,0.161741,-0.067282,-0.023761,0.083063,0.041949,-0.003591,0.026791,-0.062733,-0.150939,0.02624,-0.126892,0.021328,-0.019288,0.173628,-0.103967,0.081102,-0.08666,-0.126132,-0.044167,-0.06409,0.071973,0.066113,-0.010145,0.01153,0.146392,-0.118763,0.024212,-0.025417,-0.098745,-0.004067,-0.047145,-0.010924,0.025191,0.034011,-0.108983,0.009301,-0.008317,-0.006376,0.029667,-0.02402,-0.004057,0.035863,-0.027774,-0.047968,-0.052811,-0.003336,-0.014663,-0.011343,0.078146,0.098406,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.223368,-0.198336,-0.170279,0.240206,0.033198,0.152873,0.125372,0.075688,0.018928,-0.072796,-0.117899,-0.188932,0.082224,0.200983,0.1672,0.107225,-0.094212,0.01877,0.034833,-0.101755,-0.126275,0.047425,0.006269,0.238825,0.10087,0.095217,0.107782,0.116671,-0.018863,0.024417,0.043943,0.042734,-0.091176,0.005778,0.077682,0.120025,0.097551,-0.121531,-0.074511,-0.04107,-0.107513,0.057188,0.206204,0.041087,-0.108819,-0.208106,0.106107,0.072484,-0.013334,0.124486,0.085337,0.137449,0.146924,0.015367,0.12771,-0.054983,0.05412,0.055968,-0.028933,0.008709,-0.066865,-0.06968,0.072894,-0.078641,0.138551,-0.089544,-0.058138,0.253557,0.046241,-0.003448,0.043997,-0.098792,-0.022393,-0.08839,0.045789,0.025717,0.051174,0.096279,-0.048708,0.031591,-0.00782,0.040119,-0.034289,-0.032084,0.03798,0.011419,0.019941,0.020577,-0.022044,-0.027544,-0.09888,0.268531,-0.004773,-0.045105,0.004769,0.007237,-0.00461,0.001965,-0.081531,-0.044937,0.000371,0.035945,0.151869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.194656,-0.180526,-0.285254,-0.235925,0.184712,0.202142,0.091122,0.11863,-0.052003,0.011894,-0.14597,0.013152,-0.019789,0.084846,0.17839,0.006286,0.126691,-0.01995,0.050182,-0.045052,-0.151946,-0.179118,-0.118287,-0.057961,0.103108,0.140164,0.078914,-0.0462,0.067028,0.041801,-0.104615,-0.082678,-0.216844,-0.082621,-0.06058,0.138972,0.102319,0.090662,-0.129184,0.060384,0.167095,0.098422,0.022983,0.003723,-0.038023,-0.00319,-0.070723,0.128255,0.202968,-0.131176,0.011853,0.039714,-0.045123,-0.135267,0.015329,0.023926,0.051509,-0.142929,0.037921,-0.072117,-0.072797,-0.072904,-0.089919,-0.04644,-0.009805,0.022138,0.062531,-0.009962,0.042189,-0.135698,0.016922,-0.205684,0.010703,-0.017911,0.074712,0.040683,0.172265,-0.096133,-0.122428,-0.034257,-0.039689,-0.085732,-0.049426,-0.025057,0.049077,0.033732,0.011298,-0.012467,-0.024149,-0.02188,-0.014541,-0.058226,-0.039595,-0.008914,-0.055616,-0.009979,0.002208,-0.114753,0.001418,-0.065648,-0.022792,0.103314,0.136016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-0.190756,-0.179304,-0.191362,-0.047478,0.075833,0.141685,0.081047,0.087267,-0.09622,-0.248067,-0.194537,-0.084555,0.184539,0.090232,0.055594,0.120544,-0.015352,-0.010092,-0.057241,-0.16771,0.112249,-0.079055,0.01684,-0.07379,0.265202,0.200657,0.075558,0.029444,0.006893,0.011349,-0.036154,-0.01048,-0.04277,-0.145449,0.203655,-0.011192,0.230622,-0.068966,-0.118658,0.101896,0.120364,0.068552,0.00279,-0.057438,-0.122878,0.03261,-0.036912,0.06026,0.131471,0.018117,0.090372,0.051531,0.072008,-0.062259,-0.010995,0.073339,0.093037,-0.067339,-0.057346,-0.069518,-0.207638,-0.189517,-0.116968,0.028752,0.045007,0.115832,0.18668,-0.017073,-0.002843,-0.213738,-0.179261,-0.029135,0.112372,0.066608,-0.074731,0.028871,0.054056,-0.061271,-0.014551,-0.05742,-0.017092,-0.070035,-0.090907,0.012861,0.06223,0.088538,0.004244,-0.062537,-0.044793,0.004117,-0.160828,0.093498,-0.015568,-0.004257,0.010527,-0.010124,0.011146,-0.041,-0.050318,-0.079446,-0.04863,0.013018,0.148551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.203682,-0.180515,-0.260045,0.028374,0.075375,0.211242,0.116148,0.038535,-0.03458,-0.185424,-0.226416,-0.055588,0.017664,0.124285,0.152023,0.023518,0.116541,0.021759,-0.170305,-0.060207,-0.026842,-0.104532,0.10649,0.05052,0.073865,0.031094,0.072586,0.070875,0.214521,-0.00256,-0.020425,-0.056739,0.007709,-0.047489,0.053522,0.148538,0.052482,-0.027845,-0.08982,0.089511,0.060712,0.045533,0.044864,0.036773,-0.123195,0.032365,-0.108771,0.122814,0.024026,0.021292,0.001541,0.006727,-0.017067,0.027845,-0.01375,-0.072874,0.111743,0.071373,0.010618,-0.048215,0.027933,-0.053251,-0.036068,-0.021152,0.115464,-0.040949,0.062395,0.097466,0.126651,-0.204006,-0.048551,-0.060147,0.051355,-0.044365,0.300595,0.098897,0.070533,-0.006404,0.027654,0.018683,0.005189,-0.056924,-0.02984,0.012451,0.034832,0.073888,0.027678,-0.035451,-0.014354,-0.066937,-0.00446,0.116958,-0.036389,-0.013805,-0.034357,-0.046268,0.00164,0.013548,-0.02146,-0.008693,-0.049393,0.033775,0.112726,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


### Faccio crescere gli alberi multilabel sui miei due vicinati sintetici

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
#fittlo il mio DT su tutto il neighborhood sintetico
tree1 = DecisionTreeClassifier()
tree1.fit(synthetic_neighborhood1.drop(filter_col_Y_BB,1).values,synthetic_neighborhood1.drop(filter_col_X,1).values)

tree2 = DecisionTreeClassifier()
tree2.fit(synthetic_neighborhood2.drop(filter_col_Y_BB,1).values,synthetic_neighborhood2.drop(filter_col_X,1).values)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Testo le performances dei miei tree

#### Fidelity sul synthetic neighborhood

In [39]:
y_syn1 = synthetic_neighborhood1.drop(filter_col_X,1).values
y_tree1 = tree1.predict(synthetic_neighborhood1.drop(filter_col_Y_BB,1).values)
metrics.f1_score(y_true=y_syn1,y_pred=y_tree1,average='micro')

1.0

In [40]:
y_syn2 = synthetic_neighborhood2.drop(filter_col_X,1).values
y_tree2 = tree2.predict(synthetic_neighborhood2.drop(filter_col_Y_BB,1).values)
metrics.f1_score(y_true=y_syn2,y_pred=y_tree2,average='micro')

1.0

#### Fidelity sul sample di kNN usato per generare i dataset sintetici

In [41]:
y_samplekNN1 = best_rf.predict(alpha_beta_sample_knn.drop(filter_col_Y_BB,1).values)
y_tree1 = tree1.predict(alpha_beta_sample_knn.drop(filter_col_Y_BB,1).values)
metrics.f1_score(y_true=y_samplekNN1,y_pred=y_tree1,average='micro')

0.8095238095238095

In [42]:
y_samplekNN2 = best_rf.predict(sampleKnn_mixed_space.drop(filter_col_Y_BB,1).drop('mixed_space_dist',1).values)
y_tree2 = tree2.predict(sampleKnn_mixed_space.drop(filter_col_Y_BB,1).drop('mixed_space_dist',1).values)
metrics.f1_score(y_true=y_samplekNN2,y_pred=y_tree2,average='micro')

0.8048780487804877

#### Su i2e HIT, ed estraggo la regola

In [43]:
i2e[filter_col_Y_BB].values

array([0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.])

In [44]:
#per i due sintetici
print(i2e[filter_col_Y_BB].values.reshape(1, -1))
print()
print(tree1.predict(i2e[filter_col_X].values.reshape(1, -1)))
print(tree2.predict(i2e[filter_col_X].values.reshape(1, -1)))

[[0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]]


In [45]:
synthetic_neighborhood1.drop(filter_col_Y_BB,1).shape

(1400, 103)

Cerco di estrarre la regola dal tree1 che ha portato a questa classificazione

In [46]:
i2e_values = i2e[filter_col_X].values.reshape(1, -1)
tree1.predict(i2e_values)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.]])

In [47]:
features_names = synthetic_neighborhood1.drop(filter_col_Y_BB,1).columns.values

In [48]:
def istance_rule_extractor(i2e_values, DT, features_names):
    """this function takes:
    i2e_values: np.array, shape=(1, -1) containing values of that instance features
    DT: pre-trained decision tree from sklearn
    features_name: list of features names
    
    and returns a rule (str) describing why that instance was classified in that way by the DT and rule lenght (int)
    """
    
    
    n_nodes = DT.tree_.node_count
    #print('numero di nodi nel tree: '+str(n_nodes))
    children_left = DT.tree_.children_left
    children_right = DT.tree_.children_right
    feature = DT.tree_.feature
    threshold = DT.tree_.threshold
    
    #estraggo il path di nodi seguiti per arrivare alla foglia che contiene il mio esempio
    node_indicator = DT.decision_path(i2e_values)
    #print('path:')
    #print(node_indicator)
    #node indicator contiene una matice con sulla prima colonna la tupla (id_sample,node_id) in questo caso è il path di un 
    #solo esempio quindi id_sample=0, invece node_id contiente tutti i nodi utilizzati da quella istanza


    #trovo l'id del nodo che è la foglia dove cade il mio esempio
    leave_id = DT.apply(i2e_values)
    #leave_id è una vettore al cui posto i-esimo si trova l'id del nodo-foglia in cui cade l'esempio i-esimo 
    #in questo caso l'esempio è solo uno, quindi è un'array lunga 1
    print('id leaf node: '+ str(leave_id[0]))

    #qui trovo in nodi usati
    node_index = node_indicator.indices
    
    #ho solo un sample, quindi il suo id è per forza 0
    sample_id = 0

    #salvo le split conditions in una lista
    list_split_conditions = list()

    for node_id in node_index:
        #controllo che non siamo già in una foglia
        if leave_id[sample_id] == node_id:  
            print("leaf node {} reached, no decision here".format(leave_id[sample_id]))
            break
        else:
            #se il valore di quella feature in quella istanza è minore della treshold 
            if i2e_values[0][feature[node_id]] <= threshold[node_id]:
                threshold_sign = " <= "
            else:
                threshold_sign = " > "
            
            list_split_conditions.append(str(features_names[feature[node_id]])+'='+str(round(i2e_values[0][feature[node_id]],2))+threshold_sign+str(round(threshold[node_id],2)))
            print("nel nodo "+str(node_id)+' si ha che '+str(features_names[feature[node_id]])+'='+str(round(i2e_values[0][feature[node_id]],2))+threshold_sign+str(round(threshold[node_id],2)))
            
    return ', '.join(list_split_conditions)+' -> '+str(DT.predict(i2e_values)[0]), len(list_split_conditions)

In [49]:
istance_rule_extractor(i2e_values, tree1, features_names=features_names)

id leaf node: 184
nel nodo 0 si ha che Att67=0.05 <= 0.08
nel nodo 1 si ha che Att77=0.1 > 0.02
nel nodo 133 si ha che Att65=0.04 <= 0.05
nel nodo 134 si ha che Att76=0.15 > 0.11
nel nodo 182 si ha che Att82=-0.03 > -0.07
leaf node 184 reached, no decision here


('Att67=0.05 <= 0.08, Att77=0.1 > 0.02, Att65=0.04 <= 0.05, Att76=0.15 > 0.11, Att82=-0.03 > -0.07 -> [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]',
 5)

In [50]:
istance_rule_extractor(i2e_values, tree2, features_names=features_names)

id leaf node: 246
nel nodo 0 si ha che Att67=0.05 <= 0.1
nel nodo 1 si ha che Att77=0.1 > 0.02
nel nodo 103 si ha che Att35=0.18 > 0.12
nel nodo 211 si ha che Att3=-0.19 <= -0.09
nel nodo 212 si ha che Att10=-0.11 > -0.18
nel nodo 218 si ha che Att96=-0.01 > -0.01
nel nodo 240 si ha che Att45=-0.05 > -0.13
nel nodo 242 si ha che Att51=0.02 > -0.07
leaf node 246 reached, no decision here


('Att67=0.05 <= 0.1, Att77=0.1 > 0.02, Att35=0.18 > 0.12, Att3=-0.19 <= -0.09, Att10=-0.11 > -0.18, Att96=-0.01 > -0.01, Att45=-0.05 > -0.13, Att51=0.02 > -0.07 -> [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]',
 8)

### Calcolo la fidelity con la BB di un multi-label DT cresciuto su tutto X2E

In [51]:
global_DT = DecisionTreeClassifier()

In [52]:
X_global = X2E[filter_col_X].values
Y_global = X2E[filter_col_Y_BB].values

In [53]:
global_DT.fit(X_global,Y_global)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [54]:
y_global_DT = global_DT.predict(X_global)
metrics.f1_score(y_true=Y_global,y_pred=y_global_DT,average='micro')

1.0