## Read labels CSV

In [124]:
import numpy as np
import pandas as pd

def read_dataset(path, drop_columns=None, keep_columns=None):
    """Load a CSV into a DataFrame, optionally restricting its columns.

    Parameters
    ----------
    path
        Anything ``pandas.read_csv`` accepts (file path or file-like object).
    drop_columns : list-like of column labels, optional
        Columns to remove from the loaded frame. Only consulted when
        ``keep_columns`` is not given.
    keep_columns : list-like of column labels, optional
        Columns to retain; selected with ``DataFrame.filter(items=...)``.
        Takes precedence over ``drop_columns``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, or the full frame when neither option is given.
    """
    csv_data = pd.read_csv(path)

    # Identity checks instead of `!= None`: comparing an array-like
    # (numpy array, pandas Index) with `!=` is elementwise and cannot be
    # used as an `if` condition.
    if keep_columns is not None:
        # Keep only these columns.
        return csv_data.filter(items=keep_columns)

    if drop_columns is not None:
        # Drop these columns and keep the rest.
        return csv_data.drop(drop_columns, axis=1)

    # Nothing to filter or drop: return every column.
    return csv_data

# Load only the two dataset ids and the label column. The column labels
# requested here contain literal single quotes.
df_ds = read_dataset(
    "./openml_203ds_datasets_matching.csv",
    keep_columns=["'dataset1_id'", "'dataset2_id'", "'matching_topic'"],
)
# Split rows by label; despite the df_ prefix both results are NumPy arrays.
_topic_label = df_ds["'matching_topic'"]
df_not_matching = df_ds[_topic_label == 0].to_numpy()
df_matching = df_ds[_topic_label == 1].to_numpy()

In [190]:
# Report how many rows carry label 0 and label 1, one count per line.
print(len(df_not_matching), len(df_matching), sep="\n")

19758
543


## Get splits

In [194]:
import numpy as np
from sklearn.model_selection import KFold
def get_splits(data, splits=14, random_state=None):
    """Return the (train, test) parts of the first shuffled K-fold split.

    Parameters
    ----------
    data
        Container that is indexed with the integer index arrays produced by
        ``KFold.split`` (``data[train_index]``) -- e.g. a NumPy array, where
        this selects rows.
    splits : int
        Number of folds. The returned test part is one fold, the train part
        is everything else.
    random_state : optional
        Forwarded to ``KFold`` to make the shuffle reproducible. The default
        ``None`` keeps the previous behaviour (a different shuffle per run).

    Returns
    -------
    tuple
        ``(train, test)`` selections of ``data``.
    """
    kf = KFold(n_splits=splits, shuffle=True, random_state=random_state)
    # Only the first of the generated splits is needed.
    train_index, test_index = next(kf.split(data))
    return data[train_index], data[test_index]

def concat_shuffle(a1,a2):
    """Concatenate ``a1`` and ``a2`` and return the result in shuffled order.

    The inputs are left untouched: the shuffle is applied in place to the
    freshly concatenated array, using NumPy's global random state.
    """
    merged = np.concatenate([a1, a2])
    np.random.shuffle(merged)
    return merged

# get_splits returns (train part, test part) of the first shuffled fold.
# With 10 folds, the single test fold of the negatives is kept as neg_set and
# the remaining nine folds are set aside as `discard`.
discard,neg_set = get_splits(df_not_matching,10)
# Retained negatives: 7 folds, so six go to train and one to test.
neg_train,neg_test = get_splits(neg_set,7)
# Positives: 6 folds, so five go to train and one to test.
pos_train,pos_test = get_splits(df_matching,6)

# Combine positives and negatives for each set and shuffle the row order.
train = concat_shuffle(pos_train,neg_train)
test = concat_shuffle(pos_test,neg_test)
# Report the resulting sizes: train, test, then the discarded negatives.
print(len(train))
print(len(test))
print(len(discard))

2145
374
17782


In [197]:
# Check that no pair of ids occurs more than once, in either order.
def overlapping_pairs(data):
    """Return True if any (first, second) pair repeats within ``data``.

    Each row is read as ``r[0]`` and ``r[1]``; further elements are ignored.
    A pair counts as repeated when an earlier row holds the same two values
    in the same or in swapped order. Values are compared by their string
    formatting, joined as ``"<first>_<second>"``.

    Returns False for empty input.
    """
    # A set keeps each membership test O(1); a list here made the whole
    # check quadratic in the number of rows.
    seen = set()
    for r in data:
        forward = "{}_{}".format(r[0], r[1])
        backward = "{}_{}".format(r[1], r[0])
        if forward in seen or backward in seen:
            return True
        # Storing one orientation is enough because both are looked up.
        seen.add(forward)
    return False

# Run the repeated-pair check on each split and print one result per line.
for _split in (train, test, discard):
    print(overlapping_pairs(_split))

False
False
False


## Write files

In [193]:
# Wrap each split in a DataFrame with the same three column labels.
_split_columns = ["ds1","ds2","matching"]
df_train = pd.DataFrame(train, columns=_split_columns)
df_test = pd.DataFrame(test, columns=_split_columns)
df_test_only_negative = pd.DataFrame(discard, columns=_split_columns)

# Write every frame to its CSV file, leaving out the index column.
for _frame, _target in (
    (df_train, "train.csv"),
    (df_test, "test.csv"),
    (df_test_only_negative, "test_only_negative.csv"),
):
    _frame.to_csv(_target, index=False)
print("Done")