# In this part, we look at using machine learning models

Broadly, machine learning models help by figuring out how to weight and combine different measures of the text we have. They can also be used to create better representations/measures of text to be used in later downstream models, as in text embeddings.

## Preprocessing from part 1

This code is unchanged from part 1; it loads the labeled question-duplicates data.

In [None]:
quora_duplicates_train_filename = "data/train.csv"

import csv # for parsing data
import matplotlib.pyplot as plt # for plotting results
# so that graphs show up in the notebook
%matplotlib inline
import sklearn # Common machine learning functions
import keras # neural networks, used later

def read_question_input_file(filename):
    """Load question pairs and duplicate labels from a CSV file.

    The first record is treated as a header and skipped. For every remaining
    non-empty row, columns 3 and 4 are taken as the two question texts and
    column 5 is parsed as the integer duplicate label.

    Args:
        filename: Path to the CSV file.

    Returns:
        A tuple ``(pairs_list, labels_list)``: a list of
        ``(question1, question2)`` string tuples and the parallel list of
        integer labels.

    Raises:
        IndexError: If a non-empty row has fewer than six columns.
        ValueError: If the label column is not an integer.
    """
    pairs_list = []
    labels_list = []
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields that contain newlines are parsed as a single field.
    # NOTE(review): utf-8 is assumed for the data file -- confirm.
    with open(filename, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # consume the csv top row which has column names
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            pairs_list.append((row[3], row[4]))  # the two questions for each row
            labels_list.append(int(row[5]))  # whether they were marked as duplicates
    return pairs_list, labels_list
            
# Read every labeled question pair from disk
question_pairs, question_labels = read_question_input_file(quora_duplicates_train_filename)

# Shuffle pairs and labels in unison so the train/test split below is random.
# Skipping this on ordered data often shows up as train accuracy far above test accuracy.
question_pairs, question_labels = sklearn.utils.shuffle(question_pairs, question_labels)

# The first (1 - TEST_PERCENT) of the shuffled data is train, the rest is test.
# Test data is used later; initially we only work with train.
TEST_PERCENT = 0.3
train_cutoff = int(len(question_pairs)*(1-TEST_PERCENT))
train_question_pairs, test_question_pairs = (question_pairs[:train_cutoff],
                                             question_pairs[train_cutoff:])
train_question_labels, test_question_labels = (question_labels[:train_cutoff],
                                               question_labels[train_cutoff:])


# Flatten the pairs into plain lists of individual questions
all_train_questions = [question
                       for pair in train_question_pairs
                       for question in (pair[0], pair[1])]
all_test_questions = [question
                      for pair in test_question_pairs
                      for question in (pair[0], pair[1])]

print("num question pairs: ", len(question_pairs))
print("num train question pairs: ", len(train_question_pairs))
print("num test question pairs: ", len(test_question_pairs))
assert(len(question_pairs) == len(question_labels))

import nltk
import string
# Fetch the NLTK resources the preprocessing code below reads.
nltk.download('averaged_perceptron_tagger')  # needed by nltk.pos_tag
nltk.download('wordnet')  # needed by WordNetLemmatizer
nltk.download('stopwords')  # needed by nltk.corpus.stopwords below
nltk.download('punkt')  # needed by nltk.word_tokenize
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version.

# Shared, module-level helpers used by preprocess_question
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

def preprocess_question(question,
                        split_method="spaces", # either "spaces" or "tokenization"
                        use_lowercase=False,
                        stem_method=None, # None, "stemming" or "lemmatize"
                        use_remove_stopwords=False,
                        use_remove_punctuation=False,
                        verbose=False):
    """Takes as input a question text string, and produces a list of tokens.

    The steps run in a fixed order: split, lowercase, stem/lemmatize,
    stopword removal, punctuation removal.

    split_method: "spaces" splits on single space characters; any other value
        (conventionally "tokenization") uses nltk.word_tokenize.
    use_lowercase: if True, will lower case all tokens
    stem_method: None, "stemming" or "lemmatize". Determines method used to find base word.
        Any other value leaves the tokens unchanged.
    use_remove_stopwords: If True, will remove tokens found in the module-level
        `stopwords` set (exact, case-sensitive match).
    use_remove_punctuation: If True, will remove every token made up only of
        punctuation characters (including empty tokens).
    verbose: If True, will print the results of each step.

    Returns the list of tokens after all requested steps.
    """

    def vprint(*args):
        # Unpack so the pieces print space-separated instead of as a tuple repr
        if verbose:
            print(*args)

    def get_wordnet_pos(treebank_tag):
        # Map a Penn Treebank tag (as returned by nltk.pos_tag) to the
        # single-letter part-of-speech code the WordNet lemmatizer accepts.
        if treebank_tag.startswith('J'):
            return 'a'  # adjective
        if treebank_tag.startswith('V'):
            return 'v'  # verb
        if treebank_tag.startswith('R'):
            return 'r'  # adverb
        return 'n'  # noun; also the fallback for every other tag

    vprint("input question:", question)

    def tokenize(question):
        # Take the text and break it into words
        # Handles punctuation better than text.split()
        tokens = nltk.word_tokenize(question)
        vprint("tokenized:", tokens)
        return tokens

    def basic_split(question):
        tokens = question.split(' ')
        vprint("split:", tokens)
        return tokens

    def lowercase(tokens):
        tokens = [t.lower() for t in tokens]
        vprint("lowercased:", tokens)
        return tokens

    def stem(tokens):
        # For non-acronyms
        tokens = [stemmer.stem(t) for t in tokens]
        vprint("stemmed:", tokens)
        return tokens

    def lemmatize(tokens):
        token_pos_pairs = nltk.pos_tag(tokens)
        vprint("part of speech tagged: ", token_pos_pairs)
        tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag))
                  for token, tag in token_pos_pairs]
        vprint("lemmatized:", tokens)
        return tokens

    def remove_stopwords(tokens):
        tokens = [t for t in tokens if t not in stopwords]
        vprint("stopwords removed:", tokens)
        return tokens

    def remove_punctuation(tokens):
        # Character-wise check so multi-character tokens such as "..." or "''"
        # are dropped too; all() is True for "", so empty tokens are dropped as well.
        punctuation_chars = set(string.punctuation)
        tokens = [t for t in tokens
                  if not all(ch in punctuation_chars for ch in t)]
        vprint('punctuation removed', tokens)
        return tokens

    if split_method == "spaces":
        tokens = basic_split(question)
    else:
        tokens = tokenize(question)

    if use_lowercase:
        tokens = lowercase(tokens)

    if stem_method == "stemming":
        tokens = stem(tokens)
    elif stem_method == "lemmatize":
        tokens = lemmatize(tokens)

    if use_remove_stopwords:
        tokens = remove_stopwords(tokens)

    if use_remove_punctuation:
        tokens = remove_punctuation(tokens)

    return tokens

## Feature Extraction

In the functions below, we extract data about the questions for use in a machine learning model. This is mostly similar to part 1, but instead of picking a single method of text treatment, we try them all and let the model decide what's important.

In [None]:
class QuestionData:
    """This is some data that will be precomputed for each question so feature functions
       don't duplicate work.

    Every attribute except original_text is a list of tokens produced by
    preprocess_question:
      original_text: the raw question string
      basic_split: the text split on single spaces
      tokenized: the text split with the NLTK tokenizer
      stemmed: the tokenized text with each token stemmed
      without_stopwords: the tokenized text with stopwords removed
    """
    def __init__(self, original_text):
        self.original_text = original_text
        self.basic_split = preprocess_question(original_text)
        # "tokenization" is the value preprocess_question documents for the
        # NLTK tokenizer path.
        self.tokenized = preprocess_question(original_text,
                                             split_method="tokenization")
        self.stemmed = preprocess_question(original_text,
                                           split_method="tokenization",
                                           stem_method="stemming")
        # NOTE(review): tokens are not lowercased before the stopword check, so
        # capitalised stopwords presumably survive -- confirm this is intended.
        self.without_stopwords = preprocess_question(original_text,
                                                     split_method="tokenization",
                                                     use_remove_stopwords=True)

In [None]:
def add_word_length_features(X, q1_data, q2_data):
    """Append three question-length features to X, in place.

    In order: the absolute difference in space-split token counts, the
    absolute difference in tokenized token counts, and the larger of the two
    space-split token counts.
    """
    basic_lengths = (len(q1_data.basic_split), len(q2_data.basic_split))
    token_lengths = (len(q1_data.tokenized), len(q2_data.tokenized))

    for feature in (abs(basic_lengths[0] - basic_lengths[1]),
                    abs(token_lengths[0] - token_lengths[1]),
                    max(basic_lengths)):
        X.append(feature)

In [None]:
def add_bag_of_words_features(X, q1_data, q2_data):
    """Append bag-of-words overlap features to X, in place.

    For each of four token views (basic_split, tokenized, stemmed,
    without_stopwords, in that order) three values are appended: the size of
    the intersection of the two questions' token sets, the size of their
    union, and the Jaccard similarity (intersection / union). Twelve values
    are appended in total.

    When both token lists are empty the union is empty and the Jaccard
    similarity is reported as 0.0 instead of dividing by zero.
    """

    def overlap(tokens_a, tokens_b):
        # Returns (intersection size, union size, jaccard) for two token lists
        set_a, set_b = set(tokens_a), set(tokens_b)
        intersection = len(set_a & set_b)
        union = len(set_a | set_b)
        jaccard = intersection / union if union else 0.0
        return intersection, union, jaccard

    X.extend(overlap(q1_data.basic_split, q2_data.basic_split))
    X.extend(overlap(q1_data.tokenized, q2_data.tokenized))
    X.extend(overlap(q1_data.stemmed, q2_data.stemmed))
    # Uses the stopword-free tokens; previously this repeated the stemmed
    # features, so the last three columns duplicated the three before them.
    X.extend(overlap(q1_data.without_stopwords, q2_data.without_stopwords))

Additional features, such as n-grams, could be added as well, but for now we'll only use these.

In [None]:
import multiprocessing
import time

def compute_single_basic(pair):
    """Build the feature row for one question pair.

    pair: a sequence whose first two items are the question strings.

    Returns a list holding the word-length features followed by the
    bag-of-words features.
    """
    first_data = QuestionData(pair[0])
    second_data = QuestionData(pair[1])

    features = []
    for add_features in (add_word_length_features, add_bag_of_words_features):
        add_features(features, first_data, second_data)
    return features

def compute_X_train_test(train_questions, test_questions, single_func):
    """Returns X_train, X_test with computed features.

    Args:
        train_questions: Iterable of inputs for the training set.
        test_questions: Iterable of inputs for the test set.
        single_func: Function applied to each input to produce its feature
            row. It is sent to worker processes, so it must be picklable.

    Returns:
        A tuple ``(X_train, X_test)`` of lists, each holding the results of
        ``single_func`` in input order.
    """
    start_time = time.time()
    # One pool serves both splits; the context manager shuts the worker
    # processes down on exit instead of leaving them running.
    with multiprocessing.Pool() as pool:
        X_train = pool.map(single_func, train_questions)
        X_test = pool.map(single_func, test_questions)
    print("Computed features in", time.time() - start_time, "seconds")
    return (X_train, X_test)

In [None]:
def train_logistic_classifier(X, y, balance_weights=False):
    """Fit and return a scikit-learn LogisticRegression classifier.

    Args:
        X: Feature rows, one per sample.
        y: Labels, parallel to X.
        balance_weights: If True, passes class_weight="balanced" to the
            classifier; otherwise class_weight is None.

    Returns:
        The fitted classifier.
    """
    # Imported here because `import sklearn` on its own is not guaranteed to
    # load the linear_model submodule.
    import sklearn.linear_model

    classifier = sklearn.linear_model.LogisticRegression(
        class_weight="balanced" if balance_weights else None)
    classifier.fit(X, y)
    return classifier

In [None]:
# Work on a small slice of the data first so each experiment runs quickly.
# The test slice is TEST_PERCENT times the size of the train slice.
cutoff_train = 10000
cutoff_test = int(cutoff_train * TEST_PERCENT)
X_train, X_test = compute_X_train_test(
    train_questions=train_question_pairs[:cutoff_train],
    test_questions=test_question_pairs[:cutoff_test],
    single_func=compute_single_basic)

In [None]:
# Fit a logistic regression on the hand-built features
classifier = train_logistic_classifier(X=X_train,
                                       y=train_question_labels[:cutoff_train])

In [None]:
# Accuracy on the data the model was fit on, and on the held-out slice
train_accuracy = classifier.score(X_train, train_question_labels[:cutoff_train])
test_accuracy = classifier.score(X_test, test_question_labels[:cutoff_test])

print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

In [None]:
# How much do our features help? Build a baseline where every sample has
# the same single constant feature.
X_train_constant = [[1] for _ in range(cutoff_train)]
X_test_constant = [[1] for _ in range(cutoff_test)]

In [None]:
# Fit the same kind of classifier on the constant-feature baseline
classifier_constant = train_logistic_classifier(
    X=X_train_constant,
    y=train_question_labels[:cutoff_train])

In [None]:
# Score the baseline on the same train and test slices as before
train_accuracy = classifier_constant.score(X_train_constant, train_question_labels[:cutoff_train])
test_accuracy = classifier_constant.score(X_test_constant, test_question_labels[:cutoff_test])

# Original note: the feature-based classifier is only moderately better than this
print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

In [None]:
def evaluate_classifier(classifier, eval_X, labels, name=""):
    """Plot precision, recall and F1 diagnostics for a binary classifier.

    Draws one figure with three panels: precision and recall against the
    decision threshold, precision against recall, and F1 score against the
    decision threshold.

    Args:
        classifier: Either a Keras Model, whose predict() output is used as
            the positive-class score, or an object with predict_proba(), whose
            second column is used.
        eval_X: Feature rows to score.
        labels: True binary labels, parallel to eval_X.
        name: Title shown above the figure.

    Returns:
        None. The figure is created through matplotlib.
    """
    # Local imports: this function can be called before the notebook's
    # `import numpy as np` cell has run, and `import sklearn` on its own is
    # not guaranteed to load the metrics submodule.
    import numpy as np
    import sklearn.metrics

    # isinstance (rather than an exact type comparison) also accepts Model
    # subclasses, and keras.models.Model is the public location of the class.
    if isinstance(classifier, keras.models.Model):
        # presumably shaped (n_samples, 1) for a single output unit; flatten
        # so the metric receives one score per sample
        pred = np.asarray(classifier.predict(np.asarray(eval_X))).ravel()
    else:
        pred = np.asarray(classifier.predict_proba(eval_X))[:, 1]

    # Scores are passed positionally because the keyword name of the second
    # argument differs between scikit-learn releases.
    precision, recall, threshold = sklearn.metrics.precision_recall_curve(
        labels, pred)

    # f1_score is the harmonic average between precision and recall;
    # it is defined as 0 where precision + recall is 0.
    denominator = precision + recall
    f1_score = np.divide(2 * precision * recall,
                         denominator,
                         out=np.zeros_like(denominator),
                         where=denominator > 0)

    fig, plts = plt.subplots(1, 3, figsize=[12,4])
    fig.suptitle(name)
    fig.subplots_adjust(hspace=0.5)

    for subplot in plts:
        subplot.grid(True)
        # positional limits: the ymin/ymax keywords are not accepted by
        # current matplotlib releases
        subplot.set_ylim(0, 1)

    # precision and recall have one more entry than threshold, hence [:-1]
    plts[0].set(ylabel="Recall/Precision")
    plts[0].set(xlabel="Threshold")
    plts[0].plot(threshold, precision[:-1], 'b', label="precision")
    plts[0].plot(threshold, recall[:-1], 'r', label="recall")
    plts[0].legend()

    plts[1].set(ylabel="Precision")
    plts[1].set(xlabel="Recall")
    plts[1].plot(recall, precision)

    plts[2].set(ylabel="F1 Score")
    plts[2].set(xlabel="Threshold")
    plts[2].plot(threshold, f1_score[:-1])
    

In [None]:
# Compare the feature-based classifier with the constant-feature baseline
evaluate_classifier(classifier, X_test,
                    test_question_labels[:cutoff_test],
                    name="with features")
evaluate_classifier(classifier_constant, X_test_constant,
                    test_question_labels[:cutoff_test],
                    name="without features")

## Word Embeddings

Text embeddings are an extremely popular technique for modern text-based machine learning. First, we start by looking at the famous "king - man + woman = queen" result.

In [None]:
import numpy as np

In [None]:
# Path to the 300-dimension GloVe word embeddings.
# Higher-dimension embeddings generally give better results, but take more computation and memory.
# If loading crashes, use the 50-dimension vectors instead.
glove_embeddings = "data/glove.6B/glove.6B.300d.txt"

# Taken from https://fasttext.cc/docs/en/english-vectors.html
import io
def load_vectors(fname):
    """Load word vectors from a text file into a dict.

    Each line is expected to hold a word followed by its vector components,
    separated by single spaces. Lines with no components (for example blank
    lines) are skipped.

    Args:
        fname: Path to the embeddings file.

    Returns:
        A dict mapping each word to a 1-D numpy float array.
    """
    data = {}
    # The context manager closes the file even if parsing fails part-way.
    # Undecodable bytes are dropped (errors='ignore') rather than raising.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # No header line is consumed: every line is treated as a word vector.
        for line in fin:
            tokens = line.rstrip().split(' ')
            if len(tokens) < 2:
                continue  # nothing after the word, so there is no vector
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])
    return data

In [None]:
def nv(vector):
    """Return `vector` scaled to unit L2 norm.

    A vector whose norm is zero is returned unscaled, since it has no
    direction to normalize and dividing would fill it with NaNs.
    """
    vector = np.asarray(vector)
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector
    return vector/norm

In [None]:
# Parse the whole embeddings file into memory. This line takes a while.
glove_dict = load_vectors(fname=glove_embeddings)

In [None]:
# The "king - man + woman" analogy vector, built from unit-length word vectors
composition = nv(glove_dict['king']) - nv(glove_dict['man']) + nv(glove_dict['woman'])

In [None]:
# Each value is the dot product of two unit-normalized vectors, i.e. their
# cosine similarity. 'castle' is included for comparison with the others.
print('composition dot queen', np.dot(nv(composition), nv(glove_dict['queen'])))
print('composition dot woman', np.dot(nv(composition), nv(glove_dict['woman'])))
print('queen dot woman', np.dot(nv(glove_dict['woman']), nv(glove_dict['queen'])))
print('queen dot castle', np.dot(nv(glove_dict['castle']), nv(glove_dict['queen'])))
print('composition dot castle', np.dot(nv(glove_dict['castle']), nv(composition)))
print('woman dot castle', np.dot(nv(glove_dict['castle']), nv(glove_dict['woman'])))

### Using in a model

Next let's train a classifier using word embedding similarity as features.

In [None]:
def add_embedding_features(X, *q_datas):
    """Append the cosine similarity of two questions' embeddings to X, in place.

    Each question is embedded as the normalized average of the GloVe vectors
    of its tokens (read from the `tokenized` attribute). Tokens missing from
    glove_dict are retried in lower case and discarded if still missing.

    Exactly one value is appended: the cosine similarity, or 0.0 when either
    question has no known tokens or the similarity is not a finite number.

    Args:
        X: Feature list to append to.
        *q_datas: Exactly two question-data objects.

    Raises:
        ValueError: If the number of questions is not two.
    """
    if len(q_datas) != 2:
        raise ValueError("add_embedding_features expects exactly two questions")

    def lookup(token):
        # NOTE(review): the lower-case retry assumes the embedding vocabulary
        # is lower-cased, as the glove.6B files are understood to be -- confirm.
        vector = glove_dict.get(token)
        if vector is None:
            vector = glove_dict.get(token.lower())
        return vector

    word_embeddings = []
    for q_data in q_datas:
        vectors = [lookup(t) for t in q_data.tokenized]
        word_embeddings.append([v for v in vectors if v is not None])

    if any(not word_embeds for word_embeds in word_embeddings):
        X.append(0.0) # invalid
        return

    # No fixed dimension is asserted, so the 50-dimension vectors work as well
    # as the 300-dimension ones.
    first_embedding, second_embedding = (
        nv(np.average(word_embeds, axis=0)) for word_embeds in word_embeddings)
    cos_sim = np.dot(first_embedding, second_embedding)
    # An all-zero average has no direction, which can yield a non-finite value
    X.append(cos_sim if np.isfinite(cos_sim) else 0.0)

In [None]:
def compute_single_with_embeddings(pair):
    """Build the feature row for one question pair from embedding similarity only.

    pair: a sequence whose first two items are the question strings.

    Returns a list holding whatever add_embedding_features appends.
    """
    first_data, second_data = QuestionData(pair[0]), QuestionData(pair[1])

    features = []
    # The word-length and bag-of-words features are deliberately left out
    # here; call add_word_length_features / add_bag_of_words_features on
    # `features` before this line to include them.
    add_embedding_features(features, first_data, second_data)
    return features

In [None]:
# Same small slice as before, this time with the embedding-similarity feature
cutoff_train = 10000
cutoff_test = int(cutoff_train * TEST_PERCENT)
X_train_emb, X_test_emb = compute_X_train_test(
    train_questions=train_question_pairs[:cutoff_train],
    test_questions=test_question_pairs[:cutoff_test],
    single_func=compute_single_with_embeddings)

In [None]:
# Fit on the embedding feature, with class weights balanced
classifier_emb = train_logistic_classifier(
    X=X_train_emb,
    y=train_question_labels[:cutoff_train],
    balance_weights=True)

In [None]:
# Plot the diagnostics for the embedding-based classifier on the test slice
evaluate_classifier(classifier_emb, X_test_emb,
                    test_question_labels[:cutoff_test],
                    name="with embedding similarity")

### Neural Networks

This is a pretty broad topic, and we won't be able to cover much of it today. Neural networks are very powerful classifiers that have been the focus of much of the recent development in machine learning and NLP.

Here we look at only a simple neural network with a single hidden layer with 5 units.

In [None]:
import keras
# Dense is imported from keras.layers, the public location of the layer
# classes, rather than from the legacy keras.layers.core module.
from keras.layers import Dense
from keras.layers import Input
from keras.models import Model


def make_simple_dense_model(num_features, num_hidden_units=5):
    """Build and compile a small fully-connected binary classifier.

    The network has one hidden ReLU layer followed by a single sigmoid
    output unit, and is compiled with the Adam optimizer, binary
    cross-entropy loss and an accuracy metric.

    Args:
        num_features: Number of input features per sample.
        num_hidden_units: Width of the hidden layer. Defaults to 5.

    Returns:
        The compiled Keras Model.
    """
    features_input = Input(shape=(num_features,), dtype='float32')

    first_dense = Dense(num_hidden_units, activation='relu')(features_input)
    output = Dense(1, activation='sigmoid')(first_dense)

    model = Model(inputs=[features_input], outputs=[output])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Size the input layer from the width of the first feature row
model = make_simple_dense_model(num_features=len(X_train[0]))

In [None]:
# Hold out 10% of the training slice as validation data to detect overfitting
model.fit(x=np.asarray(X_train),
          y=np.asarray(train_question_labels[:cutoff_train]),
          epochs=5,
          validation_split=0.1)

In [None]:
# Plot the same diagnostics for the neural network on the test slice
evaluate_classifier(classifier=model,
                    eval_X=X_test,
                    labels=test_question_labels[:cutoff_test])