### **CoreNLP SETUP**

#### Download Stanford CoreNLP (https://stanfordnlp.github.io/CoreNLP/), then start the server from its directory:
```cd stanford-corenlp-full-2018-02-27```   
```java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000```

#### Optionally, to preload the listed annotators when the server starts:
```java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -timeout 15000```

#### If successful, you will see:
```[main] INFO CoreNLP - StanfordCoreNLPServer listening at /0:0:0:0:0:0:0:0:9000```

#### To stop the server:  
#### https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#stopping-the-server

### **LIBRARIES**

In [26]:
import os               # os.path.join
import pandas as pd

import nltk             #for POS tagging
from nltk.tree import *

from pycorenlp import StanfordCoreNLP
# Client for the CoreNLP server described in the setup section (port 9000);
# parse_tree() sends its annotation requests through this object.
parser = StanfordCoreNLP('http://localhost:9000')

### **DATA PATHS (DOCUMENTS and QUERIES)**

In [27]:
# Directory (relative path) that holds the input CSV files and receives the feature CSV files.
DATA_DIR = "data/"

# Input files: one CSV for the documents and one per query set.
DATA_DOC = os.path.join(DATA_DIR, "data_doc.csv")
DATA_QUERY_TRAIN = os.path.join(DATA_DIR, "data_query_train.csv")
DATA_QUERY_VALIDATE_TEST = os.path.join(DATA_DIR, "data_query_validate_test.csv")

# Output files: one feature CSV per input file above.
FEATURE_DOC = os.path.join(DATA_DIR, "feature_doc.csv")
FEATURE_QUERY_TRAIN = os.path.join(DATA_DIR, "feature_query_train.csv")
FEATURE_QUERY_VALIDATE_TEST = os.path.join(DATA_DIR, "feature_query_validate_test.csv")


### **HELPER FUNCTIONS**

In [28]:
def csv_to_df(filepath):
    """
    Load a CSV file into a pandas dataframe using pandas' default parsing options.

    argument: csv file path to read from
    return: pandas dataframe holding the contents of the file
    """
    return pd.read_csv(filepath)

def df_to_csv(df, filepath):
    """
    Write a dataframe to a UTF-8 encoded CSV file, leaving out the index column.

    arguments: pandas dataframe, csv file path to write to
    return: None (the result is the file written at filepath)
    """
    df.to_csv(path_or_buf=filepath, index=False, encoding='utf-8')

In [29]:
def convert_list_tuple_to_string(x):
    """
    Join a sequence of words into one string.

    argument: list or tuple of words (strings)
    return: string of words, separated by a single space; '' for an empty sequence

    Empty strings at the start of the sequence are skipped, so [''] (which is
    what ''.split(' ') produces) gives '' and ['', 'a'] gives 'a'.
    """
    words = list(x)

    # Skip leading empty strings so the result never starts with a separator.
    start = 0
    while start < len(words) and words[start] == '':
        start += 1

    # str.join builds the result in one pass instead of repeated concatenation.
    return ' '.join(words[start:])

In [30]:
def concat_string_with_underscore(s):
    """
    Replace the whitespace between the words of a string with underscores.

    argument: string of words separated by whitespace
    return: the words joined by the underscore character
        (leading, trailing and repeated whitespace is ignored; '' gives '')
    """
    return '_'.join(s.split())

### **PARSE TREE**

In [31]:
def parse_tree(s):
    """
    Build the constituency parse tree of a string with the CoreNLP server.

    argument: string (only the first sentence reported by the server is used)
    return: parse tree from Tree.fromstring(), or '' when there is nothing to parse
    raises: RuntimeError when the server response is not the expected JSON document
    """
    # Nothing to parse: answer locally instead of spending a server round trip.
    if not s or not s.strip():
        return ''

    output = parser.annotate(
        s,
        properties={'annotators': 'parse', 'outputFormat': 'json', 'timeout': 1000},
    )

    # NOTE(review): pycorenlp appears to hand back the raw response text when the
    # body is not valid JSON (e.g. a server-side error or timeout message) - confirm.
    # Fail with a readable message instead of "string indices must be integers".
    if not isinstance(output, dict):
        raise RuntimeError('CoreNLP did not return a JSON parse: %r' % (output,))

    sentences = output.get('sentences')
    if not sentences:
        # the server found no sentence in the input
        return ''

    bracketed = sentences[0].get('parse', '')
    if bracketed:
        return Tree.fromstring(bracketed)
    return ''

In [32]:
def parse_tree_productions(s):
    """
    List the grammar productions of the constituency parse tree of a string.

    argument: string
    return: a string of all productions (excluding those with the root or a leaf node),
        separated by whitespace; inside each production the arrow is dropped and the
        symbols are concatenated by the underscore character.
        '' is returned when parse_tree() gives no tree.
    """
    tree = parse_tree(s)
    if not tree:
        return ''

    rules = []
    for production in tree.productions():
        text = str(production)
        # leave out productions with root and leaf nodes
        # (a quote character marks a production that contains a leaf)
        if "'" in text or "ROOT" in text:
            continue
        rules.append(text.replace(" ->", "").replace(" ", "_"))

    return ' '.join(rules)

### **PARSE TREE PRINTING**

In [33]:
def print_parse_tree(s):
    """
    Pretty-print the constituency parse tree of a string.

    arguments: s (string)
    return: None; nothing is printed when parse_tree() has no tree for s
    """
    t = parse_tree(s)
    # parse_tree() returns '' (not a Tree) when there is nothing to parse,
    # and a plain string has no pretty_print() method.
    if t:
        t.pretty_print()

In [34]:
def print_parse_tree_with_answer(q,a):
    """
    Print the constituency parse tree of a question with its answer filled in.

    arguments: question containing *, answer (strings)
    return: whatever print_parse_tree() returns for the completed question
    call: print_parse_tree()
    """
    completed = q.replace("*", a)    # every * is replaced by the answer
    return print_parse_tree(completed)

### **POS TAGS**

In [35]:
def words_to_pos_tags(s):
    """
    Tag the whitespace-separated words of a string with nltk.pos_tag().

    argument: string
    return: POS tags (string) separated by whitespace, one tag per word
    """
    tagged = nltk.pos_tag(s.split())    # [('He', 'PRP'), ...]
    return ' '.join(pair[1] for pair in tagged)

### **NGRAMS**

In [36]:
def ngram_list_to_string(ngram_list):
    """
    Flatten a list of n-grams into a single string.

    argument: list of bigrams or trigrams of words (tuples of strings)
    return: string of bigrams or trigrams separated by whitespace,
        the words of each n-gram concatenated by the underscore character
    """
    pieces = []
    for gram in ngram_list:
        token = concat_string_with_underscore(convert_list_tuple_to_string(gram))
        # an empty token is only kept once something non-empty precedes it
        if pieces or token != '':
            pieces.append(token)
    return ' '.join(pieces)

In [37]:
def string_to_bigrams(s):
    """
    Turn a string of words into its sequence of bigrams.

    argument: string of words
    return: string of bigrams of words, words concatenated by the underscore in each bigram
        ('' when the string has fewer than two words)
    """
    tokens = s.split()
    # pair every word with its successor: [('xxx','yyy'),...]
    pairs = list(zip(tokens, tokens[1:]))
    return ngram_list_to_string(pairs)


def string_to_trigrams(s):
    """
    Turn a string of words into its sequence of trigrams.

    argument: string of words
    return: string of trigrams of words, words concatenated by the underscore in each trigram
        ('' when the string has fewer than three words)
    """
    tokens = s.split()
    # group every word with its two successors: [('xxx','yyy','zzz'),...]
    triples = list(zip(tokens, tokens[1:], tokens[2:]))
    return ngram_list_to_string(triples)

### **FIELD: SENTENCE (Model 1)**

In [38]:
def question_complete(q,a):
    """
    Fill the answer into a question.

    argument: question containing *, answer (strings)
    return: the complete question in which every * is replaced by the answer
    """
    completed = q.replace("*", a)
    return completed

### **FIELD: WORDS BEFORE AND AFTER ANSWER (Model 2)**

In [39]:
def n_words_before_answer(q,n):
    """
    Return up to n words that immediately precede the answer placeholder.

    arguments: question (string) containing the * character, n (maximum number of words)
    return: string with the last n words found before the first *, separated by single spaces
        If fewer than n words precede *, all of them are returned.
        If * is the first word in the sentence, or n is not positive, return empty string.
        If the question contains no *, the last n words of the whole question are returned.
    """
    if n <= 0:
        # words[-0:] would slice the whole list, so "no words" is handled explicitly
        return ''

    string_before = q.split('*')[0]    # text in front of the first *
    # split() without arguments ignores leading, trailing and repeated whitespace,
    # so every item counted towards n is a real word
    return ' '.join(string_before.split()[-n:])

In [40]:
# IMPROVE THIS CODE

def n_words_after_answer(q,n):
    """
    Return up to n words that immediately follow the answer placeholder.

    arguments: question (string) containing the * character, n (maximum number of words)
    return: string with the first n words found after the first * (and before any
        further *), separated by single spaces
        One trailing punctuation character (. ? ! , ;) is removed from the result.
        If fewer than n words follow *, all of them are returned.
        If * is the last word, nothing follows it, the question contains no *,
        or n is not positive, return empty string.
    """
    if not q or q.endswith('*') or n <= 0:
        return ''

    pieces = q.split('*')
    if len(pieces) < 2:
        # no placeholder in the question, so there is nothing "after" it
        return ''

    # split() without arguments ignores leading, trailing and repeated whitespace,
    # so every item counted towards n is a real word
    words = pieces[1].split()[:n]
    if not words:
        return ''

    text = ' '.join(words)
    # drop a single trailing punctuation mark
    if text.endswith(('.', '?', '!', ',', ';')):
        text = text[:-1]
    return text

### **FIELD: SUBSTRING (Model 2)**

In [41]:
def question_substring(q,a,n):
    """
    Cut the answer and its immediate context out of a question.

    argument: question containing *, answer (strings), integer n (number of words before and after answer)
    return: the string 'ppp aaa xxx',
        where
        ppp are the n words before the answer
        aaa is the answer
        xxx are the n words after the answer
        (white space at the start and end of the result is removed)
    """
    words_before = n_words_before_answer(q,n)
    words_after = n_words_after_answer(q,n)
    return ' '.join((words_before, a, words_after)).strip()

### **FIELD: WORDS BEFORE ANSWER (Model 2)**

In [42]:
def last_word_of_string(s):
    """
    Return the last whitespace-separated word of a string.

    argument: string
    returns: last word in the string, or '' when the string is empty or contains
        only whitespace
    """
    words = s.split() if s else []
    # a whitespace-only string splits into no words at all, so guard the index
    return words[-1] if words else ''

### **FIELD: WORDS AFTER ANSWER (Model 2)**

In [43]:
def first_word_of_string(s):
    """
    Return the first whitespace-separated word of a string.

    argument: string
    returns: first word in the string, or '' when the string is empty or contains
        only whitespace
    """
    words = s.split() if s else []
    # a whitespace-only string splits into no words at all, so guard the index
    return words[0] if words else ''

### **FIELD: ANSWER (Model 2)**

In [44]:
def answer_is_at_beginning_doc(q):
    """
    Flag whether the answer placeholder opens a document question.

    argument: question (string) containing the * character
    return: one-character marker (string)
        'b' if the text before the first space is exactly *
        'x' otherwise
    """
    first_token = q.partition(' ')[0]
    return 'b' if first_token == '*' else 'x'

def answer_is_at_end_doc(q):
    """
    Flag whether the answer placeholder closes a document question.

    argument: question(string) containing the * character
    return: one-character marker (string)
        'e' if nothing but optional whitespace and at most one closing
            punctuation mark (. ? !) follows the first *
        'x' otherwise, including when the question contains no *
    """
    _, star, tail = q.partition('*')
    if not star:
        # no placeholder at all, so the answer cannot be at the end
        return 'x'
    # '' covers a question that stops right at the placeholder
    return 'e' if tail.strip() in ('', '.', '?', '!') else 'x'

In [45]:
def answer_is_at_beginning_query(q):
    """
    Flag whether the answer placeholder opens a query question.

    argument: question (string) containing the * character
    return: one-character marker (string)
        'b' if the text before the first space is exactly *
        'y' otherwise
    """
    first_token = q.partition(' ')[0]
    return 'b' if first_token == '*' else 'y'
    
def answer_is_at_end_query(q):
    """
    Flag whether the answer placeholder closes a query question.

    argument: question(string) containing the * character
    return: one-character marker (string)
        'e' if nothing but optional whitespace and at most one closing
            punctuation mark (. ? !) follows the first *
        'y' otherwise, including when the question contains no *
    """
    _, star, tail = q.partition('*')
    if not star:
        # no placeholder at all, so the answer cannot be at the end
        return 'y'
    # '' covers a question that stops right at the placeholder
    return 'e' if tail.strip() in ('', '.', '?', '!') else 'y'

In [46]:
def string_length(s):
    """
    Count the whitespace-separated words of a sentence.

    argument: sentence (string)
    return: the number of words in the sentence (integer)
        * is counted as one word.
    """
    words = s.split()
    return len(words)

### **CREATE FEATURE DATAFRAMES**

In [47]:
# MODEL 1 FEATURES

def create_features_df_model1(df,doc):
    """
    Add the Model 1 (whole sentence) feature columns to a dataframe.

    arguments:
        df: pandas dataframe with the columns 'qb_question' (question containing *)
            and 'qb_answer'; the new columns are written into this dataframe
        doc: not used by this function
    return: the same dataframe, with these columns set for every row
        qa              question with * replaced by the answer
        qa_pos          POS tags of qa
        qa_pos_bigram   bigrams of the POS tags
        qa_pos_trigram  trigrams of the POS tags
        qa_parse_tree   parse tree productions of qa
    """
    for idx, record in df.iterrows():
        sentence = question_complete(record['qb_question'], record['qb_answer'])
        sentence_tags = words_to_pos_tags(sentence)

        features = (
            ('qa', sentence),    # for baseline BM25 ranking in Model 1
            ('qa_pos', sentence_tags),
            ('qa_pos_bigram', string_to_bigrams(sentence_tags)),
            ('qa_pos_trigram', string_to_trigrams(sentence_tags)),
            ('qa_parse_tree', parse_tree_productions(sentence)),
        )
        for column, value in features:
            df.loc[idx, column] = value

    return df

In [48]:
# MODEL 2 FEATURES

def create_features_df_model2(df, data_type):
    """
    Add the Model 2 (answer in its context) feature columns to a dataframe.

    arguments:
        df: pandas dataframe with the columns 'qb_question' (question containing *)
            and 'qb_answer'; the new columns are written into this dataframe
        data_type: 'doc' or 'query', selects which pair of answer-position functions
            fills 'ans_is_first' and 'ans_is_last'; for any other value those two
            columns are not written
    return: the same dataframe, with the ss*, before*, after* and ans* columns set
        for every row
    """
    window = 4    # number of words taken on each side of the answer

    def store(row_label, fields):
        # write (column, value) pairs into one row, in the given order
        for column, value in fields:
            df.loc[row_label, column] = value

    for idx, record in df.iterrows():
        question = record['qb_question']
        answer = record['qb_answer']

        # FIELD: substring
        context = question_substring(question, answer, window)
        context_tags = words_to_pos_tags(context)
        store(idx, (
            ('ss', context),    # for baseline BM25 ranking in Model 2
            ('ss_pos', context_tags),
            ('ss_pos_bigram', string_to_bigrams(context_tags)),
            ('ss_pos_trigram', string_to_trigrams(context_tags)),
            ('ss_parse_tree', parse_tree_productions(context)),
        ))

        # FIELD: words before answer
        left = n_words_before_answer(question, window)
        left_tags = words_to_pos_tags(left)
        left_last = last_word_of_string(left)
        store(idx, (
            ('before', left),
            ('before_last', left_last),
            ('before_last_pos', words_to_pos_tags(left_last)),
            ('before_pos', left_tags),
            ('before_pos_bigram', string_to_bigrams(left_tags)),
            ('before_pos_trigram', string_to_trigrams(left_tags)),
            ('before_parse_tree', parse_tree_productions(left)),
        ))

        # FIELD: words after answer
        right = n_words_after_answer(question, window)
        right_tags = words_to_pos_tags(right)
        right_first = first_word_of_string(right)
        store(idx, (
            ('after', right),
            ('after_first', right_first),
            ('after_first_pos', words_to_pos_tags(right_first)),
            ('after_pos', right_tags),
            ('after_pos_bigram', string_to_bigrams(right_tags)),
            ('after_pos_trigram', string_to_trigrams(right_tags)),
            ('after_parse_tree', parse_tree_productions(right)),
        ))

        # FIELD: answer
        answer_tags = words_to_pos_tags(answer)
        answer_first = first_word_of_string(answer)
        answer_last = last_word_of_string(answer)
        store(idx, (
            ('ans', answer),
            ('ans_first', answer_first),
            ('ans_last', answer_last),
            ('ans_pos', concat_string_with_underscore(answer_tags)),
            ('ans_first_pos', words_to_pos_tags(answer_first)),
            ('ans_last_pos', words_to_pos_tags(answer_last)),
        ))

        # position markers use different "not at the boundary" letters for
        # documents and queries, so pick the matching pair of functions
        if data_type == 'doc':
            at_start, at_end = answer_is_at_beginning_doc, answer_is_at_end_doc
        elif data_type == 'query':
            at_start, at_end = answer_is_at_beginning_query, answer_is_at_end_query
        else:
            at_start = at_end = None

        if at_start is not None:
            df.loc[idx, 'ans_is_first'] = at_start(question)
            df.loc[idx, 'ans_is_last'] = at_end(question)
        df.loc[idx, 'ans_length'] = string_length(answer)

    return df

### **RUN!**

### **CONVERT CSV DATA TO DATAFRAME**

In [49]:
# Load the raw document data; the feature functions read its 'qb_question'
# and 'qb_answer' columns.
df_data_doc = csv_to_df(DATA_DOC)

# Load the two raw query sets.
df_data_query_train = csv_to_df(DATA_QUERY_TRAIN)
df_data_query_validate_test = csv_to_df(DATA_QUERY_VALIDATE_TEST)

### **EXTRACT FEATURES**

In [50]:
# DOCUMENT FEATURES
# Both functions write their columns into the dataframe they receive and return
# that same object, so df_doc and df_features_doc are df_data_doc with the
# feature columns added.

df_doc = create_features_df_model1(df_data_doc, 'doc')
df_features_doc = create_features_df_model2(df_doc, 'doc')

In [51]:
# QUERY FEATURES
# Same two steps as for the documents, applied to each query set; 'query' makes
# create_features_df_model2 use the *_query answer-position functions.

df_query1 = create_features_df_model1(df_data_query_train, 'query')
df_features_query_train = create_features_df_model2(df_query1, 'query')

df_query2 = create_features_df_model1(df_data_query_validate_test, 'query')
df_features_query_validate_test = create_features_df_model2(df_query2, 'query')

### **CONVERT DATAFRAMES TO CSV FILES**

In [52]:
# Write each feature dataframe to its CSV file (UTF-8, without the index column).
df_to_csv(df_features_doc, FEATURE_DOC)

df_to_csv(df_features_query_train, FEATURE_QUERY_TRAIN)
df_to_csv(df_features_query_validate_test, FEATURE_QUERY_VALIDATE_TEST)