In [1]:
### 
# DEPENDENCIES 
###
import itertools
import nltk
import re
import json
import os
import pandas as pd
import numpy as np
# sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tree import Tree
# gensim
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
# plotting
from matplotlib import pyplot
from matplotlib.widgets import CheckButtons
from wordcloud import WordCloud
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
# stanford corenlp
from stanfordcorenlp import StanfordCoreNLP
### 
# GLOBALS 
###
# Path of the training-set JSON file read by json_to_df().
TRAINING_DATA_DIR="./datasets/BioASQ-trainingDataset6b.json"
# TFIDF CONFIG
# TOP_N: how many top-weighted terms are reported per n-gram range.
TOP_N=5
# MIN_DF / MAX_DF: document-frequency bounds handed to TfidfVectorizer.
MIN_DF=0.01
MAX_DF=1.00
# N_GRAMS: the (min_n, max_n) ranges analysed one after another.
N_GRAMS=[(1,1),(2,2),(3,3),(1,3)]
# BUILD_WORD_CLOUD: when True, a word-cloud image is written per label/n-gram.
BUILD_WORD_CLOUD=False
# Stanford core-nlp config
# The CoreNLP install directory can be overridden with the CORENLP_HOME
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the previous hard-coded location is used unchanged.
CORE_NLP_PATH = os.environ.get(
    'CORENLP_HOME',
    r'/Users/jalexander/Projects/stanford-corenlp-full-2018-10-05',
)
CORE_NLP_CLIENT = StanfordCoreNLP(CORE_NLP_PATH)
### 
# FUNCTIONS 
###
def build_vector_model(questions):
    """Embed each question as the sum of its Word2Vec word vectors.

    A Word2Vec model (``sg=0``, ``min_count=0``) is trained on the tokenized
    questions themselves, so every token that appears receives a vector.

    Args:
        questions: Iterable of raw question strings.

    Returns:
        A list holding one 1-D numpy array per question, in input order.
        A question that produces no tokens is represented by an all-zero
        vector of the model's vector size, so every entry has the same length
        and the result can be fed to PCA / scikit-learn directly.
    """
    tokenized = [tokenize(q) for q in questions]
    model = Word2Vec(tokenized, min_count=0, sg=0)
    # Look vectors up through ``model.wv``: indexing the model object itself
    # (``model[word]``) is deprecated in gensim 3.x and removed in gensim 4.
    word_vectors = model.wv
    question_vectors = []
    for tokens in tokenized:
        if tokens:
            question_vector = np.sum([word_vectors[word] for word in tokens], axis=0)
        else:
            # No usable tokens: keep the row instead of emitting an empty list.
            question_vector = np.zeros(word_vectors.vector_size, dtype=np.float32)
        question_vectors.append(question_vector)
    return question_vectors

def parse_questions_types(data):
    """Split the dataset into parallel sequences of question bodies and types.

    Args:
        data: Object whose ``'questions'`` entry iterates over records that
            expose ``'body'`` and ``'type'`` keys.

    Returns:
        A ``zip`` iterator yielding two tuples: first every body, then every
        type, both in record order. It yields nothing when there are no
        records.
    """
    body_type_pairs = []
    for record in data['questions']:
        body_type_pairs.append([record['body'], record['type']])
    # Transpose the list of [body, type] pairs into (bodies, types).
    return zip(*body_type_pairs)

def label_to_class(str_labels, label):
    """Translate a label into its integer class id.

    The class id is the position of the first occurrence of ``label`` in
    ``str_labels``, as reported by ``str_labels.index``.

    Args:
        str_labels: Sequence of known labels.
        label: The label to look up.

    Returns:
        The zero-based index of ``label`` within ``str_labels``.
    """
    class_id = str_labels.index(label)
    return class_id

def build_tfidf_weights(sent_list, min_df, max_df, ngram, top_n=25):
    """Compute the highest mean TF-IDF terms for a collection of sentences.

    Args:
        sent_list: Iterable of raw text documents to vectorize.
        min_df: Lower document-frequency bound passed to ``TfidfVectorizer``.
        max_df: Upper document-frequency bound passed to ``TfidfVectorizer``.
        ngram: ``(min_n, max_n)`` tuple used as ``ngram_range``.
        top_n: Number of terms to keep.

    Returns:
        The dict produced by ``top_mean_feats``: term -> mean TF-IDF weight
        for the ``top_n`` highest-scoring terms.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=max_df,
        min_df=min_df,
        tokenizer=tokenize,
        ngram_range=ngram,
    )
    tfidf = tfidf_vectorizer.fit_transform(sent_list)
    # ``get_feature_names`` was removed in scikit-learn 1.2; use its
    # replacement when the installed version has it and fall back otherwise so
    # older environments keep working.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        terms = list(tfidf_vectorizer.get_feature_names_out())
    else:
        terms = tfidf_vectorizer.get_feature_names()
    return top_mean_feats(tfidf, terms, top_n=top_n)

def build_word_cloud(tfidf_weights, output_file):
    """Render a word cloud from term weights and save it to ``output_file``.

    Args:
        tfidf_weights: Term -> weight mapping handed to
            ``WordCloud.generate_from_frequencies``.
        output_file: Destination path passed to ``WordCloud.to_file``.
    """
    # Canvas settings: white background, 1024x720, at most 1000 words.
    cloud_options = dict(
        background_color="white",
        max_words=1000,
        width=1024,
        height=720,
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(tfidf_weights)
    cloud.to_file(output_file)

def json_to_df(json_file_path):
    """Load a JSON file and wrap its parsed content in a pandas DataFrame.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        ``pd.DataFrame`` built from the object returned by ``json.load``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system and would garble or reject
    # non-ASCII characters in the text.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))

def top_mean_feats(Xtr, features, grp_ids=None, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

        Xtr is a matrix exposing ``toarray()`` (one row per document), features holds the
        feature name of each column. When grp_ids is None or empty, all rows are used.
        The result is the dict built by top_tfidf_feats: feature name -> mean weight. '''
    # Test for "no selection" explicitly: a plain truthiness check raises
    # ValueError for numpy index arrays with more than one element and would
    # also treat the single row index 0 as "use every row".
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # Column-wise mean over the selected documents.
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_tfidf_feats(row, features, top_n):
    ''' Pick the top_n largest values of row and pair each with its feature name.

        Returns a dict mapping feature name -> value, inserted from the largest
        value downwards. '''
    # argsort is ascending, so reverse it to walk from the largest weight down.
    descending_ids = np.argsort(row)[::-1]
    return {features[i]: row[i] for i in descending_ids[:top_n]}

def tokenize(text):
    """Split ``text`` into lower-cased word tokens, dropping 1-character tokens.

    Tokens come from ``word_tokenize``; anything shorter than two characters
    (which includes single punctuation marks) is discarded.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of lower-cased tokens in their original order.
    """
    # No part-of-speech tagging here: the tags play no role in the filtering,
    # and tagging every token list is slow and needs an extra NLTK model.
    return [word.lower() for word in word_tokenize(text) if len(word) >= 2]

def build_parse_tree(text):
    """Parse the trailing sentence of ``text`` with CoreNLP and return it on one line.

    Args:
        text: Raw text; only the part after the last ``'. '`` separator (once
            trailing whitespace is stripped) is sent to the parser.

    Returns:
        The string returned by ``CORE_NLP_CLIENT.parse`` with every newline
        character removed.
    """
    last_sentence = text.rstrip().split('. ')[-1]
    parse_text = CORE_NLP_CLIENT.parse(last_sentence)
    return parse_text.replace('\n','')


In [11]:
# Hand-crafted features: one binary vector per question, derived from the text
# of its CoreNLP parse.
features = []
[questions, q_types] = parse_questions_types(json_to_df(TRAINING_DATA_DIR))
str_labels = list(np.unique(q_types))
# Only the first few parses are echoed as a sanity check. Features are built
# for EVERY question so that `features` stays aligned with `q_types`; stopping
# the loop early leaves the two with different lengths and breaks the
# train/test split in the classification cell.
preview_count = 2

for position, q in enumerate(questions):
    tree = build_parse_tree(q)
    if position < preview_count:
        # Collapse runs of spaces so the bracketed tree prints compactly.
        print(re.sub(r' +', ' ', tree))
    # Plain substring tests against the bracketed parse string.
    feature_set = [
        1 if 'List' in tree else 0,  # contains "List"
        1 if 'Is' in tree else 0,    # contains "Is"
        1 if 'WH' in tree else 0,    # contains "WH" (e.g. a WHNP tag)
        1 if '?' in tree else 0,     # contains a question mark
    ]
    features.append(feature_set)

(ROOT (SQ (VBZ Is) (NP (NNP Hirschsprung) (NN disease)) (NP (NP (DT a) (JJ mendelian)) (CC or) (NP (DT a) (JJ multifactorial) (NN disorder))) (. ?)))
(ROOT (NP (NP (NP (NN List) (NN signaling) (NNS molecules)) (PRN (-LRB- -LRB-) (NP (NNS ligands)) (-RRB- -RRB-)) (SBAR (WHNP (WDT that)) (S (VP (VBP interact) (PP (IN with) (NP (DT the) (NN receptor) (NN EGFR))))))) (. ?)))


In [9]:
# Prints the built-in documentation of the `re` module.
# NOTE(review): looks like a leftover interactive lookup that nothing else in
# the notebook depends on — confirm whether this cell is still needed.
help(re)

Help on module re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.6/library/re
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.
    
    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last mat

In [57]:
# Classification
# Encode every question type as its index in str_labels. The loop variable is
# `q_type`; the identifier had been split by stray whitespace, which is a
# syntax error.
y = [label_to_class(str_labels, q_type) for q_type in q_types]
# Hold out 10% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.1)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
print(classification_report(y_test, y_hat, target_names=np.unique(q_types)))

              precision    recall  f1-score   support

     factoid       0.44      0.93      0.60        67
        list       1.00      0.26      0.41        47
     summary       0.67      0.12      0.20        51
       yesno       0.92      0.98      0.95        61

   micro avg       0.62      0.62      0.62       226
   macro avg       0.76      0.57      0.54       226
weighted avg       0.74      0.62      0.56       226



In [12]:
# TFIDF DATA ANALYSIS
# For each question type: number of questions, mean question length in
# characters, and the TOP_N highest mean TF-IDF terms per configured n-gram range.
[questions, q_types] = parse_questions_types(json_to_df(TRAINING_DATA_DIR))
str_labels = list(np.unique(q_types))
print("Labels")
print(str_labels)
print()
y = [label_to_class(str_labels, q_type) for q_type in q_types]

for label in str_labels:
    # Questions whose type equals the current label.
    label_questions = [q for q, q_type in zip(questions, q_types) if q_type == label]
    count = len(label_questions)
    average_question_length = np.average([len(q) for q in label_questions])

    # One weight dict per entry of N_GRAMS, in the same order.
    tfidf_weights = []
    for ngram in N_GRAMS:
        weights = build_tfidf_weights(label_questions, MIN_DF, MAX_DF, ngram, TOP_N)
        tfidf_weights.append(weights)

        if BUILD_WORD_CLOUD:
            build_word_cloud(weights, f"{label}_{ngram}_world_cloud.png")

    # Logging
    print(f"Label Analysis: {label}")
    print(f"Count: {count}")
    print(f"Average question length: {average_question_length}")
    for ngram, ngram_weights in zip(N_GRAMS, tfidf_weights):
        print(f"TFIDF NGRAM={ngram}: {ngram_weights}")
    print()

Labels
['factoid', 'list', 'summary', 'yesno']

Label Analysis: factoid
Count: 619
Average question length: 62.19386106623586
TFIDF NGRAM=(1, 1): {'the': 0.16654194013860546, 'is': 0.15025829977958918, 'of': 0.1338216003532015, 'which': 0.12809479531982085, 'what': 0.11914779216782946}
TFIDF NGRAM=(2, 2): {'is the': 0.15156092242943828, 'what is': 0.13299239392559714, 'which is': 0.07966706660730298, 'of the': 0.06791421706890893, 'in the': 0.0445911791973339}
TFIDF NGRAM=(3, 3): {'what is the': 0.16171374753014955, 'which is the': 0.10893336835082734, 'is used for': 0.030694668820678513, 'is associated with': 0.025095454518804256, 'which gene is': 0.022363120649383113}
TFIDF NGRAM=(1, 3): {'the': 0.12029039191838244, 'is': 0.10700374616311188, 'which': 0.09631260876232565, 'of': 0.0949517547717784, 'what': 0.08765256725121949}

Label Analysis: list
Count: 485
Average question length: 66.31958762886597
TFIDF NGRAM=(1, 1): {'the': 0.1442447772351292, 'which': 0.12510783941073375, 'are':

In [None]:
# MODEL BUILDING
# Embed every question, project the embeddings with PCA and show the first
# three components in an interactive 3-D scatter plot, one colour per type.
[questions, q_types] = parse_questions_types(json_to_df(TRAINING_DATA_DIR))
str_labels = list(np.unique(q_types))
question_vectors = build_vector_model(questions)

# PCA
pca = PCA(n_components=4)
pca.fit(question_vectors)
pca_res = pca.transform(question_vectors)

# Plotting result
# Close a previous figure 1 (e.g. from re-running this cell) before creating
# it: pyplot.figure() returns an already existing numbered figure as-is, in
# which case the requested figsize would be ignored.
pyplot.close(1)
fig = pyplot.figure(1, figsize=(10, 10))
# Create the 3-D axes through the figure so they are attached to it on every
# matplotlib version; the '3d' projection is registered by the
# mpl_toolkits.mplot3d import at the top of the notebook.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
colors = {
    'factoid': 'red',
    'list': 'blue',
    'summary': 'green',
    'yesno': 'black'
}

# Row positions inside pca_res for each question type.
indexes = {label: [] for label in colors}
for idx, label in enumerate(q_types):
    indexes[label].append(idx)

# label -> scatter artist, filled in below and toggled by the check boxes.
series = {}

def check_func(label):
    """Toggle the visibility of the scatter series belonging to ``label``."""
    l_series = series[label]
    l_series.set_visible(not l_series.get_visible())

for label in str_labels:
    idx = indexes[label]
    series[label] = ax.scatter(pca_res[idx, 0], pca_res[idx, 1], pca_res[idx, 2], color=colors[label], s=8, alpha=0.4)

rax = pyplot.axes([0.05, 0.4, 0.15, 0.15])
check = CheckButtons(rax, ('factoid', 'list', 'summary', 'yesno'), (True, True, True, True))
# Colour each check box like its series.
# NOTE(review): `CheckButtons.rectangles` looks version-dependent in
# matplotlib — confirm it exists in the installed release.
for label, rec in zip(str_labels, check.rectangles):
    rec.set_facecolor(colors[label])
check.on_clicked(check_func)
pyplot.show()

In [None]:
# MODEL TESTING
[questions, q_types] = parse_questions_types(json_to_df(TRAINING_DATA_DIR))
X = build_vector_model(questions)
str_labels = list(np.unique(q_types))
y = [label_to_class(str_labels, q_type) for q_type in q_types]

# Hold out 10% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# PCA
# Fit the projection on the training split only, then apply it to both splits.
pca = PCA(n_components=4)
pca.fit(X_train)
X_train_tr = pca.transform(X_train)
X_test_tr = pca.transform(X_test)

# search for optimal SVM parameters using grid search with 3-fold cross validation
Cs = np.logspace(0, 4, 4)
gammas = np.logspace(0, 3, 4)
# gamma has no effect on the linear kernel, so it is only searched for rbf;
# this avoids refitting identical linear models once per gamma value.
param_grid = [
    {'kernel': ['linear'], 'C': Cs},
    {'kernel': ['rbf'], 'C': Cs, 'gamma': gammas},
]
# Pin cv=3 so the search really is 3-fold on every scikit-learn version
# (the library default is version-dependent).
clf = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3)
clf.fit(X_train_tr, y_train)
y_hat = clf.predict(X_test_tr)
print(clf.best_estimator_)
print(classification_report(y_test, y_hat, target_names=np.unique(q_types)))

In [None]:
# KNN()
# 4-nearest-neighbour classifier on the PCA-projected train/test split
# (X_train_tr, X_test_tr, y_train, y_test must already be defined).
clf = KNeighborsClassifier(n_neighbors=4).fit(X_train_tr, y_train)
y_hat = clf.predict(X_test_tr)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_hat,
        target_names=np.unique(q_types),
    )
)

In [None]:
# GaussianNB()
# Gaussian naive Bayes classifier on the PCA-projected train/test split
# (X_train_tr, X_test_tr, y_train, y_test must already be defined).
clf = GaussianNB().fit(X_train_tr, y_train)
y_hat = clf.predict(X_test_tr)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_hat,
        target_names=np.unique(q_types),
    )
)

In [None]:
# DecisionTree
# Decision-tree classifier on the PCA-projected train/test split
# (X_train_tr, X_test_tr, y_train, y_test must already be defined).
clf = DecisionTreeClassifier().fit(X_train_tr, y_train)
y_hat = clf.predict(X_test_tr)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_hat,
        target_names=np.unique(q_types),
    )
)