In [4]:
from helper_datafusion import *
# Baseline: 5-fold cross-validated random forest trained on the precomputed
# data-fusion distance table, predicting whether conc1_mean exceeds 1.

# Fixed seed so the fold assignment and the forest are reproducible across runs.
SEED = 42

db_mortality = pd.read_csv('data/lc_db_processed.csv')
dist_datafusion = pd.read_csv('data/datafusion_distances.csv').drop(columns = 'Unnamed: 0')

# Folds are generated from db_mortality but applied positionally to
# dist_datafusion, so both tables must have the same number of rows; otherwise
# features and labels would be silently misaligned.
if len(dist_datafusion) != len(db_mortality):
    raise ValueError('dist_datafusion has {} rows but db_mortality has {}'.format(
        len(dist_datafusion), len(db_mortality)))

# Binary target, computed once instead of once per fold: 1 when conc1_mean > 1, else 0.
y_all = np.where(db_mortality.conc1_mean > 1, 1, 0)

# Per-fold metrics.
accs = []
rmse = []
sens = []
precs = []
specs = []

kf = KFold(n_splits = 5, shuffle = True, random_state = SEED)
for train_index, test_index in kf.split(db_mortality):
    X_train = dist_datafusion.iloc[train_index]
    X_test = dist_datafusion.iloc[test_index]

    y_train = y_all[train_index]
    y_test = y_all[test_index]

    clf = RandomForestClassifier(n_estimators = 500, n_jobs = -1, random_state = SEED)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # labels=[0, 1] guarantees a 2x2 matrix, so the four-way unpacking cannot
    # fail when a fold happens to contain (or predict) a single class only.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels = [0, 1]).ravel()

    accs.append(accuracy_score(y_test, y_pred))
    # On 0/1 labels this is the square root of the misclassification rate.
    rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
    sens.append(recall_score(y_test, y_pred))
    precs.append(precision_score(y_test, y_pred))
    specs.append(tn/(tn+fp))

# Mean and standard error of each metric over the folds.
avg_acc = np.mean(accs)
se_acc = sem(accs)

avg_rmse = np.mean(rmse)
se_rmse = sem(rmse)

avg_sens = np.mean(sens)
se_sens = sem(sens)

avg_precs = np.mean(precs)
se_precs = sem(precs)

avg_specs = np.mean(specs)
se_specs = sem(specs)

print('''Accuracy: \t {}, se: {}
RMSE: \t\t {}, se: {}
Sensitivity: \t {}, se: {}
Precision: \t {}, se: {}
Specificity: \t {}, se: {}'''.format(avg_acc, se_acc, avg_rmse, se_rmse, avg_sens, se_sens,
                                 avg_precs, se_precs, avg_specs, se_specs))


 Accuracy: 	 0.8980482894941753, se: 0.0018325912430409545
RMSE: 		 0.31924754468619077, se: 0.0028598827626033878
Sensitivity: 	 0.9267267034000574, se: 0.0022623920309544515
Precision: 	 0.9107334547713885, se: 0.0018184925686037222
Specificity: 	 0.8510175890284625, se: 0.003422215436299718


In [1]:
from helper_datafusion import *

# Load the two tables through the project helper (its internals are not shown here).
db_mortality, db_datafusion = load_datafusion_datasets(
    'data/lc_db_processed.csv',
    'data/datafusion_db_processed.csv',
)

# Column-name lists; neither is passed to any call in this cell.
categorical = [
    'class', 'tax_order', 'family', 'genus', "species",
    'control_type', 'media_type', 'application_freq_unit',
    "exposure_type", "conc1_type", 'obs_duration_mean',
]

non_categorical = [
    'ring_number', 'tripleBond', 'doubleBond', 'alone_atom_number', 'oh_count',
    'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP',
]

# Target is the raw conc1_mean column; features are every other column.
y = db_mortality['conc1_mean'].values
X = db_mortality.drop(columns = 'conc1_mean').copy()

# X is an independent copy, so the original frame's name is no longer needed.
del db_mortality

# The recorded run of this call took roughly two hours (see the timestamps in the output).
acc, se_acc, rmse, se_rmse = cv_datafusion_rasar(X, y, db_datafusion)

Computing distance matrix... Wed Sep 23 16:24:27 2020
Start CV... Wed Sep 23 16:36:38 2020
...END DataFusion RASAR Wed Sep 23 18:29:05 2020
Accuracy: 	 0.9158905337316472, se: 0.0010349961873023007
    RMSE: 		 0.28999439474586375, se: 0.0017830650810416518
    Sensitivity: 	 0.9401460366673255, se: 0.0012092713925737277
    Precision: 	 0.9255970378295088, se: 0.002227388127143637
    Specificity: 	 0.8761245417891879, se: 0.003097701308387638


In [1]:
from helper_datafusion import *

def find_similar(exp_mortality, db_datafusion, compare_features):
    """Look up the first data-fusion record that matches one experiment.

    Parameters
    ----------
    exp_mortality : values compared column-wise (with ``==``) against
        ``db_datafusion[compare_features]``.
    db_datafusion : DataFrame holding the ``compare_features`` columns and a
        ``conc1_mean`` column.
    compare_features : list of column names that must all be equal for a row
        to count as a match.

    Returns
    -------
    The ``conc1_mean`` of the first matching row (in row order), or the string
    ``'Unknown'`` when no row matches.
    """
    # A row matches only if every compared column is equal; because the test
    # is ``==``, NaN entries never compare equal.
    row_matches = (db_datafusion[compare_features] == exp_mortality).all(axis = 1)
    matched_values = db_datafusion.conc1_mean[row_matches].values
    if len(matched_values) == 0:
        return 'Unknown'
    return matched_values[0]

def _taxonomy_label(frame):
    """Join the five taxonomy columns of `frame` into one space-separated string per row."""
    label = frame['class']
    for column in ('tax_order', 'family', 'genus', 'species'):
        label = label + ' ' + frame[column]
    return label


# Both CSVs carry a leftover 'Unnamed: 0' column, which is dropped on load.
db_mortality = pd.read_csv('data/lc_db_processed.csv').drop(columns = 'Unnamed: 0')
db_datafusion = pd.read_csv('data/datafusion_db_processed.csv').drop(columns = 'Unnamed: 0')

# Columns that must all agree for two experiments to be treated as the same.
comp_feat = [
    'test_cas', 'obs_duration_mean', 'conc1_type', 'fish',
    'exposure_type', 'control_type', 'media_type', 'application_freq_unit',
]

# 'fish' is a single key built from class, order, family, genus and species.
db_mortality['fish'] = _taxonomy_label(db_mortality)

db_datafusion['fish'] = _taxonomy_label(db_datafusion)

In [99]:
# For every mortality experiment, fetch the conc1_mean of the first data-fusion
# record with identical comp_feat values ('Unknown' when there is none).
# find_similar compares against all of db_datafusion on each call, so this
# does one full scan of db_datafusion per row of db_mortality.
a = db_mortality.apply(
    lambda row: find_similar(row[comp_feat], db_datafusion, comp_feat),
    axis = 1,
)

In [106]:
# Number of mortality experiments with no matching data-fusion record.
int((a == 'Unknown').sum())

26087

In [105]:
# Number of mortality experiments for which a data-fusion record was found.
int((a != 'Unknown').sum())

759