In [0]:
# RQE Analysis
# Ravi Bathla

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
import nltk
import re

In [3]:
# Mount Drive containing train and test XML files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Import Train and Text XML files
def parse_XML(xml_file, df_cols): 
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    The first element of df_cols is supposed to be the identifier 
    variable, which is an attribute of each node element in the 
    XML data; other features will be parsed from the text content 
    of each sub-element. 
    """
    
    xtree = ET.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    
    for node in xroot: 
        res = []
        res.append(node.attrib.get(df_cols[0]))
        res.append(node.attrib.get(df_cols[1]))
        res.append(node.attrib.get(df_cols[2]))
        for el in df_cols[3:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text)
            else: 
                res.append(None)
        rows.append({df_cols[i]: res[i] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
        
    return out_df

In [0]:
train = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
val = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Test_302_pairs_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
test = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/MEDIQA2019-Task2-RQE-TestSet-wLabels.xml', ['pid', 'type', 'value', 'chq', 'faq'])

In [6]:
train.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...


In [7]:
# Binarize outcome variable
train['outcome'] = np.where(train['value'] == 'true', 1, 0)
val['outcome'] = np.where(val['value'] == 'true', 1, 0)
test['outcome'] = np.where(test['value'] == 'true', 1, 0)
train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...,1
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...,1
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...,0
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...,1
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...,0


## Text Preprocessing


In [8]:
# Remove punctuation and non-necessary characters
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

def preprocess_text(text):
  text = re.sub('[^A-Za-z]', ' ', text)

  # Convert all to lowercase
  text = text.lower()

  # Tokenize
  tokenized_text = word_tokenize(text)

  # Remove stopwords
  for word in tokenized_text:
      if word in stopwords.words('english'):
          tokenized_text.remove(word)

  # Stem
  stemmer = PorterStemmer()
  for i in range(len(tokenized_text)):
      tokenized_text[i] = stemmer.stem(tokenized_text[i])

  # List of words
  p_text = " ".join(tokenized_text)

  return p_text

train_processed = train.copy()
X_train = train_processed[['chq', 'faq']]
y_train = train_processed['outcome']

X_train['chq'] = train.apply(lambda x: preprocess_text(x['chq']), axis=1)
X_train['faq'] = train.apply(lambda x: preprocess_text(x['faq']), axis=1)

val_processed = val.copy()
X_val = val_processed[['chq', 'faq']]
y_val = val_processed['outcome']

X_val['chq'] = val.apply(lambda x: preprocess_text(x['chq']), axis=1)
y_val['faq'] = val.apply(lambda x: preprocess_text(x['faq']), axis=1)

test_processed = test.copy()
X_test = test_processed[['chq', 'faq']]
y_test = test_processed['outcome']

X_test['chq'] = test.apply(lambda x: preprocess_text(x['chq']), axis=1)
X_test['faq'] = test.apply(lambda x: preprocess_text(x['faq']), axis=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value i

In [9]:
X_train.head()

Unnamed: 0,chq,faq
0,should treat polymenorrhea a year old girl,should treat polymenorrhea a year old girl
1,there ani studi low molecular weight heparin p...,i use low molecular weight heparin pregnanc pa...
2,there ani studi low molecular weight heparin p...,are side effect florinef could caus headach
3,let give immun s right t,let give immun s right t
4,let give immun s right t,there support can provid patient macular degener


In [10]:
X_val.shape

(302, 2)

In [11]:
y_val.shape

(303,)

In [12]:
y_val.tail

<bound method NDFrame.tail of 0                                                      0
1                                                      0
2                                                      0
3                                                      0
4                                                      0
                             ...                        
298                                                    0
299                                                    0
300                                                    0
301                                                    0
faq    0                                  is high blo...
Name: outcome, Length: 303, dtype: object>

In [13]:
# artifact at the end of y_val, can drop
y_val = y_val[:-1]
y_val.shape

(302,)

## BoW

In [14]:
# chq text
from sklearn.feature_extraction.text import CountVectorizer
X_train_chq = X_train.chq
matrix = CountVectorizer(min_df=5, max_df=0.7).fit(X_train_chq)
X_train_chq = pd.DataFrame(matrix.transform(X_train_chq).todense(), columns=matrix.get_feature_names())
X_train_chq.shape 

(8588, 2417)

In [15]:
# transform X_val (DON'T FIT)
X_val_chq = X_val.chq
X_val_chq = pd.DataFrame(matrix.transform(X_val_chq).todense(), columns=matrix.get_feature_names())
X_val_chq.shape 

(302, 2417)

In [16]:
# transform X_test (DON'T FIT)
X_test_chq = X_test.chq
X_test_chq = pd.DataFrame(matrix.transform(X_test_chq).todense(), columns=matrix.get_feature_names())
X_test_chq.shape 

(230, 2417)

In [17]:
# faq text
X_train_faq = X_train.faq
matrix = CountVectorizer(min_df=5, max_df=0.7).fit(X_train_faq)
X_train_faq = pd.DataFrame(matrix.transform(X_train_faq).todense(), columns=matrix.get_feature_names())
X_val_faq = X_val.faq
X_val_faq = pd.DataFrame(matrix.transform(X_val_faq).todense(), columns=matrix.get_feature_names())
X_test_faq = X_test.faq
X_test_faq = pd.DataFrame(matrix.transform(X_test_faq).todense(), columns=matrix.get_feature_names())

# concatenate
X_train_bow = pd.concat([X_train_chq, X_train_faq], axis=1)
X_val_bow = pd.concat([X_val_chq, X_val_faq], axis=1)
X_test_bow = pd.concat([X_test_chq, X_test_faq], axis=1)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(8588, 2)
(302, 2)
(230, 2)


In [18]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train_bow, y_train)

#Predict the response for val dataset
y_pred_val = gnb.predict(X_val_bow)

#Predict the response for test dataset
y_pred_test = gnb.predict(X_test_bow)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Validation Accuracy:",y_pred_val[y_pred_val == y_val].shape[0]/y_pred_val.shape[0])
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred_test))

Validation Accuracy: 0.543046357615894
Test Accuracy: 0.5


## Fuzzy Matching

In [19]:
!pip3 install fuzzywuzzy[speedup]
from fuzzywuzzy import fuzz

def get_ratio(row):
    chq = row['chq']
    faq = row['faq']
    return fuzz.token_set_ratio(chq, faq)
  
X_train_fuzzy = X_train.copy()
X_val_fuzzy = X_val.copy()
X_test_fuzzy = X_test.copy()

X_train_fuzzy['fuzzy_ratio'] = X_train_fuzzy.apply(get_ratio, axis=1)
X_train_fuzzy.head()



Unnamed: 0,chq,faq,fuzzy_ratio
0,should treat polymenorrhea a year old girl,should treat polymenorrhea a year old girl,100
1,there ani studi low molecular weight heparin p...,i use low molecular weight heparin pregnanc pa...,100
2,there ani studi low molecular weight heparin p...,are side effect florinef could caus headach,32
3,let give immun s right t,let give immun s right t,100
4,let give immun s right t,there support can provid patient macular degener,42


In [0]:
X_val_fuzzy['fuzzy_ratio'] = X_val_fuzzy.apply(get_ratio, axis=1)
X_test_fuzzy['fuzzy_ratio'] = X_test_fuzzy.apply(get_ratio, axis=1)

In [21]:
print(X_train_fuzzy.shape)
print(X_val_fuzzy.shape)
print(X_test_fuzzy.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(8588, 3)
(302, 3)
(230, 3)
(8588,)
(302,)
(230,)


In [0]:
# just use fuzzy ratio as predictor
x_train_fuzzy = np.array(X_train_fuzzy.fuzzy_ratio).reshape(-1, 1)
#y_train_fuzzy = train['outcome']
x_val_fuzzy = np.array(X_val_fuzzy.fuzzy_ratio).reshape(-1, 1)
y_val_fuzzy = val['outcome']
x_test_fuzzy = np.array(X_test_fuzzy.fuzzy_ratio).reshape(-1, 1)
y_test_fuzzy = test['outcome']

In [23]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(x_train_fuzzy, y_train)

#Predict the response for val dataset
y_pred_val = gnb.predict(x_val_fuzzy)

#Predict the response for test dataset
y_pred_test = gnb.predict(x_test_fuzzy)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Validation Accuracy:",y_pred_val[y_pred_val == y_val].shape[0]/y_pred_val.shape[0])
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred_test))

Validation Accuracy: 0.6125827814569537
Test Accuracy: 0.4956521739130435


In [24]:
y_pred = gnb.predict(x_train_fuzzy)
print("Train Accuracy:",metrics.accuracy_score(y_train, y_pred))

Train Accuracy: 0.9796227293898463


## Rules Based Hypothesis Testing

•	Take the RQE data: https://raw.githubusercontent.com/abachaa/RQE_Data_AMIA2016/master/RQE_Train_8588_AMIA2016.xml 

•	Divide randomly  this data into training and testing (test data set should have at least 1500 data points). Do not use the original test data.

•	Using only regular expressions, counting, fraction and other simple arithmetic, create a collection of  if-then-else clauses to build a classifier that significantly improves on the majority classifier.


•	I suggest each person independently should come with such rules, and note the reason for them, and then the group combines it intelligently into a tree of rules. 

•	Only use your test data to get accuracy numbers; do not look at them to modify your rules. 


In [25]:
new_data = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
new_data.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...


In [26]:
print(new_data.shape)

(8588, 5)


In [27]:
1500/8588

0.17466231951560315

In [28]:
# clean up text

def preprocess_text(text):
  text = re.sub('[^A-Za-z]', ' ', text)

  # Convert all to lowercase
  text = text.lower()

  return text


new_data['chq'] = new_data.apply(lambda x: preprocess_text(x['chq']), axis=1)
new_data['faq'] = new_data.apply(lambda x: preprocess_text(x['faq']), axis=1)

new_data.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,how should i treat polymenorrhea in a y...,how should i treat polymenorrhea in a y...
1,2,originalQ-shortQ,True,have there been any studies with low molec...,can i use low molecular weight heparin in ...
2,3,originalQ-shortRandQ,False,have there been any studies with low molec...,what are the side effects of florinef co...
3,4,originalQ-shortQ,True,let s give these immunizations that s ri...,let s give these immunizations that s ri...
4,5,originalQ-shortRandQ,False,let s give these immunizations that s ri...,is there more support we can provide patie...


In [29]:
# Binarize outcome variable
y = pd.DataFrame()
y['outcome'] = np.where(new_data['value'] == 'true', 1, 0)
y.head()

Unnamed: 0,outcome
0,1
1,1
2,0
3,1
4,0


In [0]:
# Divide train data into train and test (validation)
# 80-20 split
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(new_data, y, test_size=0.2, random_state=42)

In [31]:
x_train.shape

(6870, 5)

In [32]:
x_train['outcome'] = np.where(x_train['value'] == 'true', 1, 0)
x_test['outcome'] = np.where(x_test['value'] == 'true', 1, 0)
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,pid,type,value,chq,faq,outcome
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1


In [33]:
# majority classifier - training
x_train['outcome'].value_counts()

1    3739
0    3131
Name: outcome, dtype: int64

In [34]:
print('Majority Classifier train accuracy:', 3739/(3739+3131))

Majority Classifier train accuracy: 0.5442503639010189


In [35]:
x_test.shape

(1718, 6)

In [36]:
# majority classifier
y_test['outcome'].value_counts()

1    916
0    802
Name: outcome, dtype: int64

In [37]:
print('Majority Classifier test accuracy:', 916/(916+802))

Majority Classifier test accuracy: 0.5331781140861467


# Rule 1

Rule #1: Find words that exist in both the chq and faq. If this number is greater than certain threshold, then predict 1. Else, predict 0.

In [38]:
x_train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1


In [39]:
x_train['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in x_train.values]
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{a, with, influenza}"
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0,{to}
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1,"{old, previous, with, negative, protein, deriv..."
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0,{do}
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1,"{with, in, ischemic, patient, no, i, do, echoc..."


In [40]:
x_train['exists_both_len'] = x_train['exists_both'].apply(len)
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{a, with, influenza}",3
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0,{to},1
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1,"{old, previous, with, negative, protein, deriv...",20
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0,{do},1
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1,"{with, in, ischemic, patient, no, i, do, echoc...",16


In [41]:
x_train['exists_both_len'].describe()

count    6870.000000
mean        5.736827
std         5.117859
min         0.000000
25%         1.000000
50%         4.500000
75%        10.000000
max        30.000000
Name: exists_both_len, dtype: float64

In [42]:
x_train['exist_feature'] = np.where(x_train['exists_both_len'] > 3, 1, 0)
x_train['exist_feature'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


1    3738
0    3132
Name: exist_feature, dtype: int64

In [43]:
train_pred = x_train[x_train['exist_feature'] == x_train['outcome']]
train_pred.shape

(6451, 9)

In [44]:
train_pred.head()

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0,{to},1,0
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1,"{old, previous, with, negative, protein, deriv...",20,1
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0,{do},1,0
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1,"{with, in, ischemic, patient, no, i, do, echoc...",16,1
2303,2304,originalQ-shortQ,True,what is the reference for the article on a...,what is the reference to the article on ar...,1,"{technique, what, the, is, article, on, refere...",8,1


In [45]:
incorrect_train_pred = x_train[x_train['exist_feature'] != x_train['outcome']]
incorrect_train_pred.shape

(419, 9)

In [46]:
incorrect_train_pred.head()

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{a, with, influenza}",3,0
156,157,originalQ-shortRandQ,False,how soon should you ambulate a patient wit...,can i use low molecular weight heparin in ...,0,"{with, thrombosis, patient, vein, deep}",5,1
2057,2058,originalQ-shortRandQ,False,what are the causes of and how do you work...,what is the incubation period of influenza...,0,"{a, the, of, what}",4,1
6885,6886,originalQ-shortRandQ,False,year old woman complains that her heart...,how should i treat polymenorrhea in a y...,0,"{old, in, i, year, a}",5,1
6400,6401,originalQ-shortRandQ,False,i had a guy year old man with subclav...,what is the cause and treatment of this ol...,0,"{old, the, man, and}",4,1


In [47]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
print('Exact Match > 3 Classifier Train Accuracy:',6451/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact Match > 3 Classifier Train Accuracy: 0.9390101892285299


In [48]:
# Test classifier
x_test['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in x_test.values]
x_test['exists_both_len'] = x_test['exists_both'].apply(len)
x_test['exist_feature'] = np.where(x_test['exists_both_len'] > 3, 1, 0)
pred = x_test[x_test['exist_feature'] == x_test['outcome']]
pred.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(1609, 9)

In [49]:
incorrect_pred = x_test[x_test['exist_feature'] != x_test['outcome']]
incorrect_pred.shape

(109, 9)

In [50]:
x_test['outcome'].value_counts()

1    916
0    802
Name: outcome, dtype: int64

In [51]:
print('Majority Classifier Test Accuracy:',916/(916+802))
print('Exact Match > 3 Classifier Test Accuracy:',1609/(916+802))

Majority Classifier Test Accuracy: 0.5331781140861467
Exact Match > 3 Classifier Test Accuracy: 0.9365541327124564


Accuracy improved from ~53% -> ~93% by applying the rule:

(number of exact words matching > 3) == 1, else 0

# Rule 2

Let's look at our incorrect predictions to then improve our classifier with our next rule.

In [52]:
incorrect_train_pred.head(20)

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{a, with, influenza}",3,0
156,157,originalQ-shortRandQ,False,how soon should you ambulate a patient wit...,can i use low molecular weight heparin in ...,0,"{with, thrombosis, patient, vein, deep}",5,1
2057,2058,originalQ-shortRandQ,False,what are the causes of and how do you work...,what is the incubation period of influenza...,0,"{a, the, of, what}",4,1
6885,6886,originalQ-shortRandQ,False,year old woman complains that her heart...,how should i treat polymenorrhea in a y...,0,"{old, in, i, year, a}",5,1
6400,6401,originalQ-shortRandQ,False,i had a guy year old man with subclav...,what is the cause and treatment of this ol...,0,"{old, the, man, and}",4,1
333,334,originalQ-shortRandQ,False,it s not crystal clear what s going on wit...,how should i treat polymenorrhea in a y...,0,"{old, in, i, year, a, treat}",6,1
752,753,originalQ-shortRandQ,False,what is the significance of haemophilus ae...,what is the dose of sporanox,0,"{the, is, what, of}",4,1
351,352,originalQ-shortQ,True,what is legatrin,what is legatrin,1,"{is, what, legatrin}",3,0
2996,2997,originalQ-shortRandQ,False,what is the upper limit of normal of small...,what is that new drug like prilosec it h...,0,"{the, is, of, what}",4,1
1412,1413,originalQ-shortRandQ,False,is keflex the drug of choice for this pati...,is serzone okay to give to with a partial ...,0,"{a, with, to, is}",4,1


In [53]:
incorrect_train_pred['outcome'].value_counts()

1    210
0    209
Name: outcome, dtype: int64

It seems that a lot of our incorrect predictions are due to the fact that non-relevant words are being matched. 

We can try to improve this by removing stopwords and changing our threshold to be > 1 exact match.

In [54]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize


def tokenize_and_remove_stopwords(text):
  # Tokenize
  tokenized_text = word_tokenize(text)

  # Remove stopwords
  for word in tokenized_text:
      if word in stopwords.words('english'):
          tokenized_text.remove(word)
  return tokenized_text

x_train['chq'] = x_train.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
x_train['faq'] = x_train.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

x_train.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,"[year, old, woman, presented, yesterday, a, pe...","[a, patient, influenza, symptoms, treated, the...",1,"{a, with, influenza}",3,0
4205,4206,originalQ-shortRandQ,False,"[history, pulmonary, embolus, a, contraindicat...","[we, switch, ancef, this, diabetic, foot, ulce...",0,{to},1,0
3889,3890,originalQ-shortQ,True,"[you, treat, year, old, a, positive, ppd, puri...","[you, treat, year, old, a, positive, ppd, puri...",1,"{old, previous, with, negative, protein, deriv...",20,1
2357,2358,originalQ-shortRandQ,False,"[we, need, worry, thrombocytopenia, other, sid...","[do, inject, bicipital, tendon]",0,{do},1,0
58,59,originalQ-shortQ,True,"[this, patient, tias, transient, ischemic, att...","[this, patient, transient, ischemic, attacks, ...",1,"{with, in, ischemic, patient, no, i, do, echoc...",16,1


In [55]:
# Re-calculate common text feature
x_train['exists_both'] = [set(x[3]) & set(x[4]) for x in x_train.values]
x_train['exists_both_len'] = x_train['exists_both'].apply(len)
x_train['exist_feature'] = np.where(x_train['exists_both_len'] > 1, 1, 0)

train_pred_2 = x_train[x_train['exist_feature'] == x_train['outcome']]
train_pred_2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(6582, 9)

In [56]:
incorrect_train_pred_2 = x_train[x_train['exist_feature'] != x_train['outcome']]
incorrect_train_pred_2.shape

(288, 9)

In [57]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
print('Exact Match w/no stopwords > 1 Classifier Train Accuracy:',6582/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact Match w/no stopwords > 1 Classifier Train Accuracy: 0.9580786026200874


In [58]:
# clean test data
x_test['chq'] = x_test.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
x_test['faq'] = x_test.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

# Test classifier
x_test['exists_both'] = [set(x[3]) & set(x[4]) for x in x_test.values]
x_test['exists_both_len'] = x_test['exists_both'].apply(len)
x_test['exist_feature'] = np.where(x_test['exists_both_len'] > 1, 1, 0)
pred_2 = x_test[x_test['exist_feature'] == x_test['outcome']]
pred_2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the document

(1648, 9)

In [59]:
incorrect_pred_2 = x_test[x_test['exist_feature'] != x_test['outcome']]
incorrect_pred_2.shape

(70, 9)

In [60]:
print('Majority Classifier Test Accuracy:',916/(916+802))
print('Exact Match w/no stopwords > 1 Classifier Test Accuracy:',1648/(916+802))

Majority Classifier Test Accuracy: 0.5331781140861467
Exact Match w/no stopwords > 1 Classifier Test Accuracy: 0.959254947613504


Rule #2 (removing stopwords,tokenizing, and setting threshold to 1 exact match) further improved our test accuracy from ~93% -> ~95%

Look at incorrect predictions to further improve rules

In [61]:
incorrect_train_pred_2.head(20)

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
96,97,originalQ-shortRandQ,False,"[do, do, a, lead, level, in, month, old, is, b...","[should, treat, polymenorrhea, a, year, old, g...",0,"{old, a}",2,1
156,157,originalQ-shortRandQ,False,"[soon, you, ambulate, patient, a, deep, vein, ...","[i, use, low, molecular, weight, heparin, preg...",0,"{deep, thrombosis, patient, vein}",4,1
6885,6886,originalQ-shortRandQ,False,"[year, old, woman, complains, heart, feels, li...","[should, treat, polymenorrhea, a, year, old, g...",0,"{old, year}",2,1
6400,6401,originalQ-shortRandQ,False,"[guy, year, old, man, subclavian, steal, syndr...","[is, cause, treatment, this, old, man, stomati...",0,"{old, man}",2,1
8230,8231,originalQ-shortRandQ,False,"[is, differential, diagnosis, a, patient, cons...","[wonder, this, patient, could, a, rotator, cuf...",0,"{a, patient}",2,1
6395,6396,originalQ-shortQ,True,"[have, home, problem, children, one, the, kids...","[is, treatment, a, human, bite]",1,{human},1,0
333,334,originalQ-shortRandQ,False,"[s, crystal, clear, s, going, with, may, treat...","[should, treat, polymenorrhea, a, year, old, g...",0,"{old, treat, a, year}",4,1
7288,7289,originalQ-shortQ,True,"[month, old, intoeing, had, look, intoeing, we...","[is, approach, intoeing, children]",1,{intoeing},1,0
378,379,originalQ-shortRandQ,False,"[is, dose, imipramine, a, year, old, boy]","[should, treat, polymenorrhea, a, year, old, g...",0,"{old, a, year}",3,1
6715,6716,originalQ-shortQ,True,"[year, old, woman, complaining, excess, sweati...","[is, sweating]",1,{sweating},1,0


In [62]:
incorrect_train_pred_2['outcome'].value_counts()

0    183
1    105
Name: outcome, dtype: int64

In [63]:
incorrect_train_pred_2.exists_both = incorrect_train_pred_2.exists_both.apply(list)
incorrect_train_pred_2.exists_both = incorrect_train_pred_2.exists_both.apply(', '.join)
incorrect_train_pred_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
96,97,originalQ-shortRandQ,False,"[do, do, a, lead, level, in, month, old, is, b...","[should, treat, polymenorrhea, a, year, old, g...",0,"old, a",2,1
156,157,originalQ-shortRandQ,False,"[soon, you, ambulate, patient, a, deep, vein, ...","[i, use, low, molecular, weight, heparin, preg...",0,"deep, thrombosis, patient, vein",4,1
6885,6886,originalQ-shortRandQ,False,"[year, old, woman, complains, heart, feels, li...","[should, treat, polymenorrhea, a, year, old, g...",0,"old, year",2,1
6400,6401,originalQ-shortRandQ,False,"[guy, year, old, man, subclavian, steal, syndr...","[is, cause, treatment, this, old, man, stomati...",0,"old, man",2,1
8230,8231,originalQ-shortRandQ,False,"[is, differential, diagnosis, a, patient, cons...","[wonder, this, patient, could, a, rotator, cuf...",0,"a, patient",2,1


Now it seems we are hitting a point where the sentences are extremely similar, so we need to develop a more nuanced rule.

Most of our incorrect predictions for this classifier were false positives (predicted 1 but is actually 0), so we'll build our next rule to guard against that.

There seems to be words that are frequently found in both chq and faq, but don't have anything to do with the meaning (ex. a, old). Let's build a brief dictionary of these terms and see if we can exclude that in our classifier.


In [64]:
from collections import Counter
Counter(incorrect_train_pred_2.exists_both).most_common(100)

[('old, year', 40),
 ('old, a, year', 18),
 ('', 13),
 ('a, patient', 8),
 ('old, a', 6),
 ('is, dose', 6),
 ('old, girl, year', 6),
 ('patient, i', 5),
 ('use, i', 4),
 ('is, patient', 4),
 ('old, man', 3),
 ('a', 3),
 ('a, should', 3),
 ('the', 3),
 ('old, treat, a, year', 2),
 ('this, patient', 2),
 ('rash', 2),
 ('a, treat', 2),
 ('enuresis', 2),
 ('is, good', 2),
 ('herpes', 2),
 ('treat, the', 2),
 ('is', 2),
 ('t, s', 2),
 ('thyroid', 2),
 ('old, should, year', 2),
 ('old, girl', 2),
 ('a, weeks', 2),
 ('the, is', 2),
 ('did, the', 2),
 ('woman', 2),
 ('prostate', 2),
 ('a, do, patient', 2),
 ('a, girl', 2),
 ('immunizations', 2),
 ('dose', 2),
 ('deep, thrombosis, patient, vein', 1),
 ('human', 1),
 ('intoeing', 1),
 ('sweating', 1),
 ('acne', 1),
 ('test, you', 1),
 ('a, test', 1),
 ('old, is', 1),
 ('this, i', 1),
 ('do', 1),
 ('barrett', 1),
 ('hyperthyroidism', 1),
 ('liver', 1),
 ('stachybotrys', 1),
 ('hours', 1),
 ('zantac', 1),
 ('hemoglobinopathy', 1),
 ('give, s', 1),

In [0]:
nonrel_words = ['year', 'old', 'a', 'years', 'patient', 'girl', 'i', 'the']

In [66]:
# Re-calculate common text feature
x_train['exists_both'] = [set(x[3]) & set(x[4]) for x in x_train.values]
# iterate over the dataframe row by row
for index_label, row_series in x_train.iterrows():
   # For each row update the exists_both variable
   for word in nonrel_words:
    if word in x_train.at[index_label, 'exists_both']:
      x_train.at[index_label , 'exists_both'].remove(word)
x_train['exists_both_len'] = x_train['exists_both'].apply(len)
x_train['exist_feature'] = np.where(x_train['exists_both_len'] > 1, 1, 0)

train_pred_3 = x_train[x_train['exist_feature'] == x_train['outcome']]
train_pred_3.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


(6707, 9)

In [67]:
incorrect_train_pred_3 = x_train[x_train['exist_feature'] != x_train['outcome']]
incorrect_train_pred_3.shape

(163, 9)

In [68]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
print('Exact Match w/no stopwords or nonrelevant words > 1 Classifier Train Accuracy:',6707/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact Match w/no stopwords or nonrelevant words > 1 Classifier Train Accuracy: 0.97627365356623


In [69]:
# Test classifier
# Re-calculate common text feature
x_test['exists_both'] = [set(x[3]) & set(x[4]) for x in x_test.values]
# iterate over the dataframe row by row
for index_label, row_series in x_test.iterrows():
   # For each row update the exists_both variable
   for word in nonrel_words:
    if word in x_test.at[index_label, 'exists_both']:
      x_test.at[index_label , 'exists_both'].remove(word)
x_test['exists_both'] = [set(x[3]) & set(x[4]) for x in x_test.values]
x_test['exists_both_len'] = x_test['exists_both'].apply(len)
x_test['exist_feature'] = np.where(x_test['exists_both_len'] > 1, 1, 0)
pred_3 = x_test[x_test['exist_feature'] == x_test['outcome']]
pred_3.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the 

(1648, 9)

In [70]:
incorrect_pred_3 = x_test[x_test['exist_feature'] != x_test['outcome']]
incorrect_pred_3.shape

(70, 9)

In [71]:
print('Majority Classifier Test Accuracy:',916/(916+802))
print('Exact Match w/no stopwords or nonrelevant words > 1 Classifier Test Accuracy:',1648/(916+802))

Majority Classifier Test Accuracy: 0.5331781140861467
Exact Match w/no stopwords or nonrelevant words > 1 Classifier Test Accuracy: 0.959254947613504


Train accuracy improved from ~95% -> ~97%, but test accuracy was exact same. 

In [72]:
incorrect_train_pred_3

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,true,"[year, old, woman, presented, yesterday, a, pe...","[a, patient, influenza, symptoms, treated, the...",1,{influenza},1,0
156,157,originalQ-shortRandQ,false,"[soon, you, ambulate, patient, a, deep, vein, ...","[i, use, low, molecular, weight, heparin, preg...",0,"{deep, thrombosis, vein}",3,1
6395,6396,originalQ-shortQ,true,"[have, home, problem, children, one, the, kids...","[is, treatment, a, human, bite]",1,{human},1,0
7288,7289,originalQ-shortQ,true,"[month, old, intoeing, had, look, intoeing, we...","[is, approach, intoeing, children]",1,{intoeing},1,0
6715,6716,originalQ-shortQ,true,"[year, old, woman, complaining, excess, sweati...","[is, sweating]",1,{sweating},1,0
...,...,...,...,...,...,...,...,...,...
4548,4549,originalQ-shortRandQ,false,"[are, indications, growth, hormone, a, small, ...","[do, use, redux, are, indications]",0,"{indications, are}",2,1
6546,6547,originalQ-shortQ,true,"[year, old, man, rash, face, thought, secondar...","[would, approach, apparent, photodermatitis, n...",1,{},0,0
6528,6529,originalQ-shortQ,true,"[year, old, exposed, chickenpox, had, first, i...","[an, adult, one, dose, chickenpox, vaccine, ar...",1,{chickenpox},1,0
7385,7386,originalQ-shortQ,true,"[year, old, woman, multiple, antibiotic, aller...","[quinolones, effective, streptococcal, sore, t...",1,{quinolones},1,0


## Notes from observing final incorrect predictions:

- Rules based methods like this will always miss out on cases where there are no words in common, but diseases/health concepts are similar (6547, 6296). This can be attempted to be resolved through word embeddings that give us vector representations of the similarity between medical concepts.

- Can use n-gram representations with the same rule to pick up on other similarities that exist. Often times,even after removing a word, just having one word in common is not restrictive enough to identify true positives. 

- Related to the first point, often times we are missing cases where the only word in common is the health-related term, so doing a version of NER to pick up these terms and give more importance/weight to them could be worth doing.

# Test rules-based classifiers on actual MedACL RQE Task

In [73]:
# performance on test set
train = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
val = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Test_302_pairs_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
test = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/MEDIQA2019-Task2-RQE-TestSet-wLabels.xml', ['pid', 'type', 'value', 'chq', 'faq'])

train['chq'] = train.apply(lambda x: preprocess_text(x['chq']), axis=1)
train['faq'] = train.apply(lambda x: preprocess_text(x['faq']), axis=1)

val['chq'] = val.apply(lambda x: preprocess_text(x['chq']), axis=1)
val['faq'] = val.apply(lambda x: preprocess_text(x['faq']), axis=1)

test['chq'] = test.apply(lambda x: preprocess_text(x['chq']), axis=1)
test['faq'] = test.apply(lambda x: preprocess_text(x['faq']), axis=1)

train['outcome'] = np.where(train['value'] == 'true', 1, 0)
val['outcome'] = np.where(val['value'] == 'true', 1, 0)
test['outcome'] = np.where(test['value'] == 'true', 1, 0)

train['outcome'].value_counts()

1    4655
0    3933
Name: outcome, dtype: int64

In [74]:
print('Majority Classifier train accuracy:', 4655/(3933+4655))

Majority Classifier train accuracy: 0.5420353982300885


In [75]:
# Train classifier performance
train['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in train.values]
train['exists_both_len'] = train['exists_both'].apply(len)
train['exist_feature'] = np.where(train['exists_both_len'] > 3, 1, 0)
pred_train = train[train['exist_feature'] == train['outcome']]
pred_train.shape

(8060, 9)

In [76]:
print('Rule 1 Classifier train accuracy:', 8060/(3933+4655))

Rule 1 Classifier train accuracy: 0.9385188635305077


In [77]:
val['outcome'].value_counts()

0    173
1    129
Name: outcome, dtype: int64

In [78]:
print('Majority Classifier val accuracy:', 173/(173+129))

Majority Classifier val accuracy: 0.5728476821192053


In [79]:
test['outcome'].value_counts()

1    115
0    115
Name: outcome, dtype: int64

In [80]:
print('Majority Classifier test accuracy:', 115/(115+115))

Majority Classifier test accuracy: 0.5


In [81]:
# Val classifier performance
val['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in val.values]
val['exists_both_len'] = val['exists_both'].apply(len)
val['exist_feature'] = np.where(val['exists_both_len'] > 3, 1, 0)
pred_val= val[val['exist_feature'] == val['outcome']]
pred_val.shape

(203, 9)

In [82]:
print('Rule 1 Classifier val accuracy:', 182/(173+129))

Rule 1 Classifier val accuracy: 0.6026490066225165


In [83]:
# Test classifier performance
test['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in test.values]
test['exists_both_len'] = test['exists_both'].apply(len)
test['exist_feature'] = np.where(test['exists_both_len'] > 3, 1, 0)
pred_test= test[test['exist_feature'] == test['outcome']]
pred_test.shape

(122, 9)

In [84]:
print('Rule 1 Classifier test accuracy:', 120/(115+115))

Rule 1 Classifier test accuracy: 0.5217391304347826


In [85]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize


def tokenize_and_remove_stopwords(text):
  # Tokenize
  tokenized_text = word_tokenize(text)

  # Remove stopwords
  for word in tokenized_text:
      if word in stopwords.words('english'):
          tokenized_text.remove(word)
  return tokenized_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
# Train classifier performance
# clean test data
train['chq'] = train.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
train['faq'] = train.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

# Test classifier
train['exists_both'] = [set(x[3]) & set(x[4]) for x in train.values]
train['exists_both_len'] = train['exists_both'].apply(len)
train['exist_feature'] = np.where(train['exists_both_len'] > 1, 1, 0)
pred_train_2 = train[train['exist_feature'] == train['outcome']]
pred_train_2.shape

(8230, 9)

In [87]:
print('Rule 2 Classifier train accuracy:', 8230/(3933+4655))

Rule 2 Classifier train accuracy: 0.9583139264089428


In [88]:
# Val classifier performance
# clean test data
val['chq'] = val.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
val['faq'] = val.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

# Test classifier
val['exists_both'] = [set(x[3]) & set(x[4]) for x in val.values]
val['exists_both_len'] = val['exists_both'].apply(len)
val['exist_feature'] = np.where(val['exists_both_len'] > 1, 1, 0)
pred_val_2 = val[val['exist_feature'] == val['outcome']]
pred_val_2.shape

(224, 9)

In [89]:
print('Rule 2 Classifier val accuracy:', 228/(173+129))

Rule 2 Classifier val accuracy: 0.7549668874172185


In [90]:
# Test classifier performance
# clean test data
test['chq'] = test.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
test['faq'] = test.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

# Test classifier
test['exists_both'] = [set(x[3]) & set(x[4]) for x in test.values]
test['exists_both_len'] = test['exists_both'].apply(len)
test['exist_feature'] = np.where(test['exists_both_len'] > 1, 1, 0)
pred_test_2 = test[test['exist_feature'] == test['outcome']]
pred_test_2.shape

(107, 9)

In [91]:
print('Rule 2 Classifier test accuracy:', 114/(115+115))

Rule 2 Classifier test accuracy: 0.4956521739130435


In [0]:
# output train and test csv's
# train.to_csv('rqe_train.csv')
# test.to_csv('rqe_test.csv')

In [0]:
# output original train and test 
# train = parse_XML('/content/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
# test = parse_XML('/content/RQE_Test_302_pairs_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])

# train.to_csv('rqe_train_orig.csv')
# test.to_csv('rqe_test_orig.csv')