# Question Answering Models

In [1]:
# Import Dependencies
import pickle
from collections import Counter
import numpy as np

# Pandas
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# NLP
import gensim  # Word 2 Vec
from nltk.corpus import stopwords

In [2]:
data_path = "/Users/seanosier/data/Metis/Final/"

In [3]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """
    In:
    data = the data you want to pickle (save)
    filename = file name where you want to save the data
    python_version = the python version where you will be opening the pickle file
    
    Out:
    Saves a pickle file with your data to to the filename you specify
    """
    with open(filename, "wb") as picklefile:
        pickle.dump(data, picklefile, protocol=python_version)

def load_pickle(filename):
    """
    In:
    filename = name of the pickle file you want to open (e.g "my_pickle.pkl")
    
    Out:
    Opens and returns the content of the picklefile to a variable of your choice
    """
    with open(filename, "rb") as picklefile: 
        return pickle.load(picklefile)

In [5]:
# Load Raw Data Sets
raw_training_set = pd.DataFrame.from_csv(data_path + "training_set.tsv", sep="\t", index_col=False)
raw_validation_set = pd.DataFrame.from_csv(data_path + "validation_set.tsv", sep="\t", index_col=False)

### Model Training

In [6]:
print(len(raw_training_set))
raw_training_set.head(1)

2500


Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
0,100001,"When athletes begin to exercise, their heart r...",C,at the tissue level,at the organ level,at the system level,at the cellular level


#### Determing Default / Fallback Option

In [7]:
correct_answer_counter = Counter(raw_training_set["correctAnswer"])
correct_answer_counter

Counter({'A': 584, 'B': 672, 'C': 640, 'D': 604})

**"B" was most common in training set, however after tests of just picking A, B, C, and D, "A" proved to be the best default choice for the test set:**
1. All A's = 0.28125
2. All B's = 0.24125
3. All C's = 0.24750
4. All D's = 0.23000

#### "All of the Above" Exploration

In [8]:
# Top 10 "D" Answers
answer_D_counter = Counter(raw_training_set["answerD"])
sorted(answer_D_counter.items(), key=lambda a: a[1], reverse=True)[:10]

[('All of the above', 13),
 ('cell membrane', 6),
 ('decomposers', 5),
 ('transpiration', 4),
 ('temperature', 4),
 ('mechanical energy', 4),
 ('None of the above', 4),
 ('DNA', 4),
 ('carbon dioxide', 4),
 ('cells', 4)]

In [9]:
raw_training_set[(raw_training_set["answerD"] == "All of the above") | \
                 (raw_training_set["answerD"] == "both A and C")]

Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
35,100036,The number and sequence of amino acids in a pr...,D,chemical properties,shape,size,All of the above
185,100186,Which of the following changes can alter the f...,D,physical,temperature,pressure,All of the above
489,100490,Carbon is important to living organisms becaus...,D,have a variety of bonds,bond with other carbon atoms,bond with many other elements,All of the above
527,100528,The __________ of an element is determined by ...,D,thermal conductivity,melting temperature,electrical conductivity,both A and C
780,100781,The 6 key elements found in living organisms a...,D,water,carbon dioxide,proteins,All of the above
811,100812,When communicating the results of a scientific...,A,Any observations made during the investigation.,An overview of the procedure.,Your feelings as to how the investigation went.,All of the above
1092,101093,You make an observation that you hear crickets...,D,There are more crickets in the summer than in ...,Crickets chirp more often when the temperature...,"Crickets mate in the summer, and the chirping ...",All of the above
1179,101180,Changes in population size in an ecosystem are...,D,number of births and deaths,number of producers,movement of organisms in and out,both A and C
1217,101218,Water can be seen in living organisms by _____...,D,squeezing the juice out of an orange,perspiration in humans,saliva in dogs,All of the above
1249,101250,You make an observation that you hear crickets...,B,placing one group in a dark room kept at 77 de...,placing one group in a lighted room kept at 60...,placing one group in a room kept at 80 degrees...,All of the above


**"All of the Above" type answers are disproportionately correct. Always choose if present**

**Simple / Rules-based Model includes only the two rules above:**
1. If "All of the Above" type answer present, pick "D"
2. Else, always pick "A"

**This yields ~28% accuracy**

#### Word2Vec Training

In [9]:
train = raw_training_set

In [11]:
# Create "docments" with each question and it's correct answer as part of Word2Vec corpus
train["correct_answer_text"] = train.apply(lambda x: x["answer" + x["correctAnswer"]], axis=1)
train["word2vec_text"] = train["question"] + " " + train["correct_answer_text"]
question_answer_combos = train["word2vec_text"].tolist()

In [13]:
# Load pre-scraped and cleaned Wikipedia articles from Quikipedia project (https://github.com/sosier/Quikipedia)
english_articles = load_pickle(data_path + "english_articles.pkl")
simple_articles = load_pickle(data_path + "simple_articles.pkl")
print(len(english_articles), len(simple_articles))

wiki_articles = english_articles + simple_articles
len(wiki_articles)

113298 113298


226596

In [15]:
# Load in scraped 8th Grade Glossary terms and definitions
terms = load_pickle(data_path + "terms.pkl")
definitions = load_pickle(data_path + "definitions.pkl")

# Combine each term with its definition
vocab = [term + " " + definition for term, definition in zip(terms, definitions)]
len(vocab)

#vocab = [clean_corpus_text(term) for term in vocab]
#vocab = [[word for word in term.lower().split() if word not in stop] for term in vocab]

4444

In [22]:
word2vec_corpus = question_answer_combos + list(wiki_articles) + vocab
len(word2vec_corpus)

233540

In [23]:
def clean_corpus_text(text):
    """
    In:
    text = Text item including potential html break tags, punctuation, extra spaces, etc.
    
    Out:
    text = Text cleaned to remove any html break tags, punctuation, extra spaces, etc.
    """
    text = text.replace("<br><br>", " ")
    text = text.replace("<br>", " ")
    text = text.replace("<br/>", " ")
    text = text.replace("<br />", " ")
    text = text.replace("?", "")
    text = text.replace(".", "")
    text = text.replace(",", "")
    text = text.replace("!", "")
    text = text.replace("#", "")
    text = text.replace("`", "")
    text = text.replace("~", "")
    text = text.replace("^", "")
    text = text.replace("&", " and ")
    text = text.replace("=", " ")
    text = text.replace("|", " ")
    text = text.replace("\'", "")
    text = text.replace("\"", "")
    text = text.replace("\"", "")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("{", "")
    text = text.replace("}", "")
    text = text.replace("[", "")
    text = text.replace("]", "")
    text = text.replace("<", "")
    text = text.replace(">", "")
    text = text.replace("*", "")
    text = text.replace(":", "")
    text = text.replace(";", "")
    text = text.replace("-", " ")
    text = text.replace("_", " ")
    text = text.replace("+", " ")
    text = text.replace("/", " ")
    text = text.replace("\\", " ")
    text = text.replace("  ", " ")
    
    return text

In [24]:
# Clean corpus
word2vec_corpus = [clean_corpus_text(text) for text in word2vec_corpus]

In [25]:
# Split coprus into words, and remove stop words
stop = stopwords.words('english')
stop = set(stop) - set("not")  # Keep "not" (could be meaningful as a signal to negate what is in the string)

word2vec_corpus = [[word for word in text.lower().split() if word not in stop] for text in word2vec_corpus]
print(len(word2vec_corpus))
print(word2vec_corpus[0])

233540
['athletes', 'begin', 'exercise', 'heart', 'rates', 'respiration', 'rates', 'increase', 'level', 'organization', 'human', 'body', 'coordinate', 'functions', 'system', 'level']


In [26]:
# Create Word2Vec model
word2vec_model = gensim.models.Word2Vec(word2vec_corpus, size=200, window=10, min_count=1, workers=4, sg=1)

In [27]:
len(word2vec_model.vocab)

2209225

In [32]:
# Word2Vec Model on Example Question (gets it right):

# A
print(word2vec_model.n_similarity(['cellular', 'structure', 'allows', 'nutrients', 'pass', 'cells'], 
                                  ['mitochondrion']))
# B
print(word2vec_model.n_similarity(['cellular', 'structure', 'allows', 'nutrients', 'pass', 'cells'], 
                                  ['nucleus']))

# C - Identified as best answer (CORRECT)
print(word2vec_model.n_similarity(['cellular', 'structure', 'allows', 'nutrients', 'pass', 'cells'], 
                                  ['membrane']))

# D
print(word2vec_model.n_similarity(['cellular', 'structure', 'allows', 'nutrients', 'pass', 'cells'], 
                                  ['chloroplast']))

0.610329061933
0.553954475971
0.762058073418
0.620843493864


In [30]:
# Word2Vec Model on Example Question (gets it wrong):

# A
print(word2vec_model.n_similarity(['example', 'describes', 'learned', 'behavior', 'dog'], 
                                  ['smelling', 'air', 'odors']))
# B - Identified as best answer (WRONG)
print(word2vec_model.n_similarity(['example', 'describes', 'learned', 'behavior', 'dog'], 
                                  ['barking', 'disturbed']))

# C - Correct answer
print(word2vec_model.n_similarity(['example', 'describes', 'learned', 'behavior', 'dog'], 
                                  ['sitting', 'command']))

# D
print(word2vec_model.n_similarity(['example', 'describes', 'learned', 'behavior', 'dog'], 
                                  ['digging', 'soil']))

0.266588257075
0.302021376916
0.284964574964
0.260607964934


### Creating Submission

In [33]:
print(len(raw_validation_set))
raw_validation_set.head(1)

8132


Unnamed: 0,id,question,answerA,answerB,answerC,answerD
0,102501,A meter is industry by developing a process of...,cool foods during shipping,yard,nervous system,birds of prey


In [35]:
def generate_submission_csv(raw_validation_set, predictions):
    """
    In:
    raw_validation_set = Raw test data set provided by Kaggle as a pandas dataframe
    predictions = List of predicted correct answers, one for each row in the raw_validation_set
    
    Out:
    submission = Pandas dataframe / .csv for submission to Kaggle for scoring
    """
    submission = raw_validation_set["id"]
    predictions = pd.DataFrame(predictions, columns=["correctAnswer"])
    submission = pd.concat([submission, predictions], axis=1)
    
    submission.to_csv(data_path + "submission.csv", index=False)
    
    return submission

### Simple / Rules-Based Model

*Guess D for All of the above, None of the above, etc.*

In [36]:
def check_for_all_of_the_above(D_answers):
    """
    In:
    D_answers = List of all option "D" answers
    
    Out:
    is_all_of_the_above_type_answer = List of True / False as to whether "D" is an "All of the above" type answer
    """
    is_all_of_the_above_type_answer = []
    
    for answer in D_answers:
        
        if answer[-1] == ".":
            answer = answer[:-1]
            
        if answer.lower() == "all of the above" or answer.lower() == "none of the above" or \
                answer.lower() == "both a and b" or answer.lower() == "both a and c" or \
                answer.lower() == "both b and c":
            is_all_of_the_above_type_answer.append(True)
        else:
            is_all_of_the_above_type_answer.append(False)
    
    return is_all_of_the_above_type_answer

In [37]:
def simple_model_predict(raw_validation_set):
    """
    In:
    raw_validation_set = Raw test data set provided by Kaggle as a pandas dataframe
    
    Out:
    predictions = List of predicted correct answers, one for each row in the raw_validation_set, 
        using the Simple Model
    """
    raw_validation_set["predictions"] = "A"  # Always pick A...
    
    # ...unless the there is an "All of the Above" type answer, then select D
    raw_validation_set["is_all_of_the_above_type_answer"] = check_for_all_of_the_above(raw_validation_set["answerD"])
    raw_validation_set["predictions"] = raw_validation_set.apply(
        lambda x: "D" if x["is_all_of_the_above_type_answer"] else x["predictions"],
        axis = 1)
    
    return raw_validation_set["predictions"].tolist()

In [38]:
raw_validation_set = pd.DataFrame.from_csv(data_path + "validation_set.tsv", sep="\t", index_col=False)
submission = generate_submission_csv(raw_validation_set, simple_model_predict(raw_validation_set))
submission.head()

Unnamed: 0,id,correctAnswer
0,102501,A
1,102502,A
2,102503,A
3,102504,A
4,102505,A


In [39]:
def clean_column_for_word2vec(df_column):
    """
    In:
    df_column = Raw text column from pandas DataFrame
    
    Out:
    df_column = Raw text column cleaned for finding Word2Vec similarity scores
    """
    stop = stopwords.words('english')
    stop = set(stop) - set("not")
    word2vec_word_list = word2vec_model.vocab
    
    df_column = df_column.apply(lambda x: x.replace("?", ""))
    df_column = df_column.apply(lambda x: x.replace(".", ""))
    df_column = df_column.apply(lambda x: x.replace("!", ""))
    df_column = df_column.apply(lambda x: x.replace(",", ""))
    df_column = df_column.apply(lambda x: x.lower())
    df_column = df_column.apply(lambda x: x.split())
    
    df_column = df_column.apply(lambda words: [word for word in words if word not in stop])
    df_column = df_column.apply(lambda words: [word for word in words if word in word2vec_word_list])
    
    return df_column

In [40]:
def word2vec_model_predict(raw_validation_set):
    """
    In:
    raw_validation_set = Raw test data set provided by Kaggle as a pandas dataframe
    
    Out:
    predictions = List of predicted correct answers, one for each row in the raw_validation_set, 
        using the Word2Vec Model
    """
    # Process the question for Word2Vec
    raw_validation_set["question"] = clean_column_for_word2vec(raw_validation_set["question"])
    
    # Store original answerD's because we will be processing each question for the Word2Vec
    raw_validation_set["answerD_original"] = raw_validation_set["answerD"]
    
    # Process each answer and then get its Word2Vec similarity with the question
    for letter in ["A", "B", "C", "D"]:
        raw_validation_set["answer" + letter] = clean_column_for_word2vec(raw_validation_set["answer" + letter])
        raw_validation_set["similarity_" + letter] = raw_validation_set.apply(
            lambda x: word2vec_model.n_similarity(x["question"], x["answer" + letter]) \
            if x["question"] != [] and x["answer" + letter] != [] else -np.inf, axis=1)
    
    # Predict the answer with the highest Word2Vec similarity score
    raw_validation_set["predictions"] = raw_validation_set[
        ["similarity_A", "similarity_B", "similarity_C", "similarity_D"]].idxmax(axis=1)
    raw_validation_set["predictions"] = raw_validation_set["predictions"].apply(
        lambda x: x[-1] if type(x) == type("str") else "A")
    
    # Check for any "All of the above" type answers, and select D in any such questions
    raw_validation_set["is_all_of_the_above_type_answer"] = check_for_all_of_the_above(
        raw_validation_set["answerD_original"])
    raw_validation_set["predictions"] = raw_validation_set.apply(
        lambda x: "D" if x["is_all_of_the_above_type_answer"] else x["predictions"],
        axis = 1)
    
    return raw_validation_set["predictions"].tolist()

In [41]:
raw_validation_set = pd.DataFrame.from_csv(data_path + "validation_set.tsv", sep="\t", index_col=False)
submission = generate_submission_csv(raw_validation_set, word2vec_model_predict(raw_validation_set))
submission.head()

Unnamed: 0,id,correctAnswer
0,102501,C
1,102502,D
2,102503,A
3,102504,B
4,102505,D
