In [None]:
import pandas as pd
import numpy as np
import pickle
import itertools
import functools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.utils.fixes import signature
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.utils import simple_preprocess

import itertools

import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Load the dataset used by the rest of the notebook. sampleData() below reads
# the columns 'headline_x', 'summary_x', 'body_x' and 'label' from this frame.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# 'merged.pkl' that comes from a trusted source.
merged = pd.read_pickle('merged.pkl')

## Hyperparameters here

In [None]:
# Sampling
m_size = 10000  # number of label == 0 rows drawn by sampleData()

# Stratified Test Train Split
test_size = 0.2  # fraction of the sampled data held out as the test set

# Doc2Vec hyperparameters
vector_size = 400  # dimensionality of the learned document vectors
min_count = 2  # forwarded to Doc2Vec as min_count
epochs = 20  # number of Doc2Vec training epochs
window = 10  # Doc2Vec context window size

# Embedding inference
steps=20  # forwarded to embeddings() for Doc2Vec.infer_vector

## Functions

In [None]:
def sampleData(df, m_size):
    """
    Build the modelling sample: every positive row plus ``m_size`` randomly
    drawn negative rows, with headline, summary and body joined into one text.
    Hyperparameters: m_size

    :param df: Input DataFrame with string columns 'headline_x', 'summary_x',
        'body_x' and a 'label' column (1 = positive, 0 = negative)
    :type df: pd.DataFrame
    :param m_size: Number of negative (label == 0) rows to sample
    :type m_size: int

    :return: X, y -- the combined texts and their labels, negatives first
    :rtype: List*2
    :raises ValueError: (from pandas) if m_size exceeds the number of negative rows
    """

    pos = df[df['label'] == 1]
    # Fixed seed keeps the negative sample reproducible between runs.
    neg = df[df['label'] == 0].sample(m_size, random_state=42)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    # that works on both old and new pandas versions.
    data = pd.concat([neg, pos])

    X = [h + ' ' + s + ' ' + b
         for h, s, b in zip(data['headline_x'], data['summary_x'], data['body_x'])]
    y = list(data['label'])

    print('Sampling Done')
    # Returning the two lists explicitly (rather than zip(*pairs)) also keeps
    # ``X, y = sampleData(...)`` working when the sample is empty.
    return X, y



def stratSpl(X, y, test_size):
    """
    Make a single stratified, shuffled train/test split of the sampled data.
    Hyperparameters: None

    :param X: Input features of the combined (train and test) sampled set.
    :type X: List
    :param y: Input labels of the combined (train and test) sampled set
    :type y: List
    :param test_size: Test ratio to split up. Number between 0 and 1
    :type test_size: Float

    :return: 4 Lists in the order X_tr, y_tr, X_te, y_te
    :rtype: List*4
    """

    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)

    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(X, y))

    X_tr = [X[i] for i in train_index]
    y_tr = [y[i] for i in train_index]
    X_te = [X[i] for i in test_index]
    y_te = [y[i] for i in test_index]

    print('Stratified test/train split done')
    return X_tr, y_tr, X_te, y_te



def prepare(X_tr, X_te):
    """
    Tokenise the raw texts with gensim's simple_preprocess and wrap each one
    in a TaggedDocument tagged with its position in its own list.
    Hyperparameters: None

    :param X_tr: Training features (raw text strings)
    :type X_tr: List
    :param X_te: Testing features (raw text strings)
    :type X_te: List

    :return: Processed version of X_tr and X_te.
    :rtype: List*2

    """

    def read_corpus(data):
        # Use the names imported at the top of the notebook: the bare module
        # name ``gensim`` is never bound, so ``gensim.models...`` raised NameError.
        for i, line in enumerate(data):
            yield TaggedDocument(simple_preprocess(line), tags=[i])

    train_corpus = list(read_corpus(X_tr))
    test_corpus = list(read_corpus(X_te))

    print('Data preparation done')

    return train_corpus, test_corpus


def doc2vec_model_train(X_tr, vector_size, min_count, epochs, window):
    """
    Build a Doc2Vec model, construct its vocabulary from X_tr and train it.
    Hyperparameters: vector_size, min_count, epochs, window

    :param X_tr: Training corpus
    :type X_tr: List of TaggedDocuments
    :param vector_size: Dimensionality of the feature vectors
    :type vector_size: Int
    :param min_count: Minimum occurrences for which to still keep a word in the vocab.
    :type min_count: Int
    :param epochs: Number of epochs for which the model trains
    :type epochs: Int
    :param window: Window size of context to consider in a given instance.
    :type window: Int

    :return: Fully trained model
    :rtype: gensim Doc2Vec model
    """

    # ``Doc2Vec`` is imported directly at the top of the notebook; the bare
    # module name ``gensim`` is never bound, so ``gensim.models...`` raised NameError.
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, window=window, epochs=epochs)
    model.build_vocab(X_tr)
    model.train(X_tr, total_examples=model.corpus_count, epochs=model.epochs)

    print('Doc2Vec Model Trained')

    return model


def embeddings(model, X, steps):

    """
    Embed documents into vector space for classification in the next stage.
    Hyperparameters: steps

    :param model: Trained Doc2Vec model
    :type model: gensim Doc2Vec model
    :param X: Input corpus; each item must expose a ``words`` attribute
    :type X: List of TaggedDocuments
    :param steps: Number of inference iterations passed to ``infer_vector``
    :type steps: Int

    :return: Embedded feature vectors, one per document, in input order
    :rtype: List
    """
    import inspect

    # gensim renamed infer_vector's ``steps`` keyword to ``epochs`` and dropped
    # ``steps`` in 4.0, so use whichever keyword this model actually accepts.
    infer_params = inspect.signature(model.infer_vector).parameters
    iterations_kwarg = 'epochs' if 'epochs' in infer_params else 'steps'

    z = [model.infer_vector(doc.words, **{iterations_kwarg: steps}) for doc in X]

    print('Documents embedded into vector space')

    return z


def FinalClassifier(X_tr, y_tr):

    """
    Fit the final-stage classifier on the embedded documents.
    Hyperparameters: the choice of model and its settings (currently a
    LogisticRegression with random_state=42 and otherwise default settings).

    :param X_tr: Input document vectors
    :type X_tr: List
    :param y_tr: Labels
    :type y_tr: List

    :return: Trained logreg model
    :rtype: sklearn LogisticRegression
    """
    logreg = LogisticRegression(random_state=42)
    logreg.fit(X_tr, y_tr)
    print('Final Classifier Trained.')

    return logreg
    

def precision(conf):
    """
    Precision of the positive class from a 2x2 confusion matrix.

    The matrix is expected in scikit-learn's layout (rows = true class,
    columns = predicted class), so conf[1][1] holds the true positives and
    conf[0][1] the false positives.

    :param conf: 2x2 confusion matrix
    :type conf: array-like

    :return: TP / (TP + FP), or 0.0 when nothing was predicted positive
    :rtype: Float
    """
    true_pos = conf[1][1]
    false_pos = conf[0][1]
    den = true_pos + false_pos

    # No positive predictions: precision is undefined; report 0.0 instead of
    # dividing by zero.
    if den == 0:
        return 0.0

    return true_pos / den

def recall(conf):
    """
    Recall of the positive class from a 2x2 confusion matrix.

    The matrix is expected in scikit-learn's layout (rows = true class,
    columns = predicted class), so conf[1][1] holds the true positives and
    conf[1][0] the false negatives.

    :param conf: 2x2 confusion matrix
    :type conf: array-like

    :return: TP / (TP + FN), or 0.0 when there are no actual positives
    :rtype: Float
    """
    true_pos = conf[1][1]
    false_neg = conf[1][0]
    den = true_pos + false_neg

    # No actual positives: recall is undefined; report 0.0 instead of
    # dividing by zero.
    if den == 0:
        return 0.0

    return true_pos / den

def F1(P, R):
    """
    Harmonic mean of precision and recall.

    :param P: Precision
    :type P: Float
    :param R: Recall
    :type R: Float

    :return: 2*P*R / (P + R), or 0.0 when P + R is zero
    :rtype: Float
    """
    den = P + R

    # Both precision and recall are zero: avoid dividing by zero.
    if den == 0:
        return 0.0

    return 2 * P * R / den

def average(l):
    """
    Arithmetic mean of a non-empty sequence.

    :param l: Values to average; must support len()
    :type l: List

    :return: Left-to-right sum of the items divided by their count
    :rtype: Float
    """
    def _add(running_total, item):
        return running_total + item

    total = functools.reduce(_add, l)
    count = len(l)
    return total / count

## Full Training Pipeline

In [None]:
# 1. Sample the data and join headline/summary/body into one text per row.
X, y = sampleData(merged, m_size)

# 2. Stratified train/test split (returned as X_tr, y_tr, X_te, y_te).
X_tr, y_tr, X_te, y_te = stratSpl(X, y, test_size)

# 3. Tokenise and wrap both splits as TaggedDocuments.
X_tr, X_te = prepare(X_tr, X_te)

# 4. Train Doc2Vec on the training corpus only.
d2v = doc2vec_model_train(X_tr, vector_size, min_count, epochs, window)

# 5. Replace the training corpus with its inferred document vectors.
X_tr = embeddings(d2v, X_tr, steps)

# 6. Fit the classifier. The function is named FinalClassifier; the previous
#    call to ``FinalClassifiers`` raised NameError.
clf = FinalClassifier(X_tr, y_tr)

## Test Model

In [None]:
# Embed the held-out corpus with the trained Doc2Vec model.
# NOTE(review): X_te is rebound from tagged documents to vectors here, so
# re-running this cell on its own would pass vectors (not documents) to
# embeddings(); re-run the training pipeline cell first.
X_te = embeddings(d2v, X_te, steps)

# Hard class predictions for the test set.
y_pr = clf.predict(X_te)

# Continuous decision scores, used further down for average precision and
# the precision-recall curve.
y_sc = clf.decision_function(X_te)

# scikit-learn's confusion matrix: rows are true classes, columns are
# predicted classes.
conf = confusion_matrix(y_te, y_pr)

conf

In [None]:
# Values returned by the precision() and recall() helpers for the test-set
# confusion matrix.
P, R = precision(conf), recall(conf)

In [None]:
# Combine P and R with the F1() helper.
ef1 = F1(P,R)

In [None]:
# Average precision computed from the decision-function scores (y_sc), not
# from the hard predictions.
average_precision = average_precision_score(y_te, y_sc)

In [None]:
# Display all four metrics together as the cell output.
P, R, ef1, average_precision

In [None]:
## Plot precision vs recall

# Recomputed here so the plotting cell does not depend on the metrics cells.
average_precision = average_precision_score(y_te, y_sc)

# Use distinct names for the curve arrays: binding them to ``precision`` and
# ``recall`` would overwrite the helper functions of the same name, and
# binding ``_`` would clobber IPython's last-output variable.
pr_precision, pr_recall, pr_thresholds = precision_recall_curve(y_te, y_sc)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(pr_recall, pr_precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(pr_recall, pr_precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
          average_precision))