In [93]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve
from sklearn.model_selection import train_test_split

In [94]:
dfc = pd.read_csv('Data/dataset_moderate_relevance.csv', sep=';')
dfc.head(5)

Unnamed: 0,perc_in_name,in_component,in_system,Relevance,long_common_name,query,component,system
0,0.333333,0,0,0,Leukocytes [#/volume] in Blood,bilirubin in plasma,Leukocytes,Bld
1,0.666667,0,1,1,Aspartate aminotransferase [Enzymatic activity...,bilirubin in plasma,Aspartate aminotransferase,Ser/Plas
2,0.666667,0,1,1,Fasting glucose [Mass or Moles/volume] in Seru...,bilirubin in plasma,Glucose^post CFst,Ser/Plas
3,0.333333,0,0,0,Lymphocytes/100 leukocytes in Blood,bilirubin in plasma,Lymphocytes/100 leukocytes,Bld
4,0.666667,1,0,1,Bilirubin.total [Mass/volume] in Synovial fluid,bilirubin in plasma,Bilirubin,Synv fld


In [95]:
dfc['Relevance'].value_counts()

0    32
1    28
Name: Relevance, dtype: int64

In [96]:
#which are the different queries in our dataset?
print(dfc["query"].unique())
q = "glucose in blood"

['bilirubin in plasma' 'glucose in blood' 'white blood cells count']


In [97]:
df = dfc.loc[dfc['query'] != q]
df.head(100)

Unnamed: 0,perc_in_name,in_component,in_system,Relevance,long_common_name,query,component,system
0,0.333333,0,0,0,Leukocytes [#/volume] in Blood,bilirubin in plasma,Leukocytes,Bld
1,0.666667,0,1,1,Aspartate aminotransferase [Enzymatic activity...,bilirubin in plasma,Aspartate aminotransferase,Ser/Plas
2,0.666667,0,1,1,Fasting glucose [Mass or Moles/volume] in Seru...,bilirubin in plasma,Glucose^post CFst,Ser/Plas
3,0.333333,0,0,0,Lymphocytes/100 leukocytes in Blood,bilirubin in plasma,Lymphocytes/100 leukocytes,Bld
4,0.666667,1,0,1,Bilirubin.total [Mass/volume] in Synovial fluid,bilirubin in plasma,Bilirubin,Synv fld
5,0.333333,0,0,0,Glucose [Moles/volume] in Pleural fluid,bilirubin in plasma,Glucose,Plr fld
6,0.666667,0,1,1,Glucose [Moles/volume] in Serum or Plasma --3 ...,bilirubin in plasma,Glucose^3H post 100 g glucose PO,Ser/Plas
7,0.333333,0,0,0,Hepatitis B virus DNA [#/volume] (viral load) ...,bilirubin in plasma,Hepatitis B virus DNA,Ser
8,1.0,1,1,1,Bilirubin.total [Mass/volume] in Serum or Plasma,bilirubin in plasma,Bilirubin,Ser/Plas
9,1.0,1,1,1,Bilirubin.indirect [Mass or Moles/volume] in S...,bilirubin in plasma,Bilirubin.non-glucuronidated,Ser/Plas


In [99]:
# Separate input features (X) and target variable (y)
y = df.Relevance
X = df.drop(['Relevance', 'long_common_name', 'query', 'component', 'system'], axis=1)

In [100]:
#split data into training and test set
seed = 42
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)

In [101]:
print(y_train.value_counts())
print(y_test.value_counts())

0    15
1    13
Name: Relevance, dtype: int64
0    7
1    5
Name: Relevance, dtype: int64


In [102]:
# Train model
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [103]:
# Predict on test set
y_pred = rf.predict(X_test)

In [104]:
# Is our model still predicting just one class?
print( np.unique( y_pred ) )

[0 1]


In [105]:
# How's our accuracy?
print( accuracy_score(y_test, y_pred) )

1.0


In [106]:
# What about AUROC?
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print(roc_auc)

1.0


In [107]:
#confusion matrix
print(confusion_matrix(y_test, y_pred))

[[7 0]
 [0 5]]


In [108]:
newdata = dfc.loc[dfc['query'] == q]
newdata.head(100)

Unnamed: 0,perc_in_name,in_component,in_system,Relevance,long_common_name,query,component,system
20,0.333333,0,0,0,Bilirubin.total [Mass/volume] in Serum or Plasma,glucose in blood,Bilirubin,Ser/Plas
21,0.666667,1,0,1,Glucose [Moles/volume] in Urine,glucose in blood,Glucose,Urine
22,1.0,1,1,1,Glucose [Mass/volume] in Serum Plasma or Blood,glucose in blood,Glucose,Ser/Plas/Bld
23,0.333333,0,0,0,Aspartate aminotransferase [Enzymatic activity...,glucose in blood,Aspartate aminotransferase,Ser/Plas
24,0.333333,0,0,0,Calcium bilirubinate/Total in Stone,glucose in blood,Calcium bilirubinate/Total,Calculus
25,0.666667,1,0,1,Glucose [Moles/volume] in Pleural fluid,glucose in blood,Glucose,Plr fld
26,0.333333,0,0,0,Bilirubin.total [Mass/volume] in Synovial fluid,glucose in blood,Bilirubin,Synv fld
27,0.666667,1,0,1,Glucose [Moles/volume] in Serum or Plasma,glucose in blood,Glucose,Ser/Plas
28,0.666667,0,1,1,Leukocytes [#/volume] in Blood,glucose in blood,Leukocytes,Bld
29,0.333333,0,0,0,Tyrosine aminotransferase [Mass/volume] in Plasma,glucose in blood,Tyrosine aminotransferase,Plas


In [109]:
newdata.shape

(20, 8)

In [110]:
features = newdata.drop(['Relevance', 'long_common_name', 'query', 'component', 'system'], axis=1)

In [111]:
#make predictions on the new data
predictions = rf.predict(features)
print(predictions)

[0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 1 0 1 0]


In [112]:
indices = np.where(predictions == 1)
results = newdata.iloc[indices[0]]
results[['query', 'long_common_name']]

Unnamed: 0,query,long_common_name
21,glucose in blood,Glucose [Moles/volume] in Urine
22,glucose in blood,Glucose [Mass/volume] in Serum Plasma or Blood
25,glucose in blood,Glucose [Moles/volume] in Pleural fluid
27,glucose in blood,Glucose [Moles/volume] in Serum or Plasma
28,glucose in blood,Leukocytes [#/volume] in Blood
30,glucose in blood,Lymphocytes [#/volume] in Blood
31,glucose in blood,Glucose [Moles/volume] in Serum or Plasma --3 ...
35,glucose in blood,Monocytes [#/volume] in Blood
36,glucose in blood,Lymphocytes/100 leukocytes in Blood
38,glucose in blood,Fasting glucose [Mass or Moles/volume] in Seru...


In [113]:
# knowing in our case the label of the incoming data we can test the performance of the trained model
# in real case scenarios the label Relevance of the new query is not known in advance
newdata.Relevance
print(accuracy_score(newdata.Relevance, predictions))

1.0
