In [1]:
import pandas as pd
import numpy as np
import pickle
import itertools
import functools
import collections

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.utils.fixes import signature
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.utils import simple_preprocess

import itertools

import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
merged = pd.read_pickle('merged.pkl')

## Parameter grid

In [3]:
# params = {'m_size': [2000, 5000, 10000],
#               'test_size': [0.2],
#               'vector_size': [200, 400, 600],
#               'min_count': [2],
#               'epochs': [20],
#               'window': [5, 10, 15],
#               'steps': [1, 20]
#              }

params = {'m_size': [10000],
         'test_size': [0.2],
         'vector_size': [400],
         'min_count': [2],
         'epochs': [20],
         'window': [10], 
         'steps': [20]}

## Functions

In [4]:
def sampleData(df, m_size):
    """
    Sample the data with hyperparameter m_size specifying how many training examples we want
    Hyperparameters: m_size
    
    :param df: Input Pandas DataFrame
    :type df: pd.DataFrame
    :param m_size: Number of training examples desired
    :type m_size: int
    
    :return: X, y of combined sampled training data and labels for the model
    :rtype: List*2
    """
    
    pos = df[df['label'] == 1]
    neg = df[df['label'] == 0].sample(m_size, random_state=42)
    data = neg.append(pos)
    combined = [(h + ' ' + s + ' ' + b, l) for h, s, b, l in 
                    zip(list(data['headline_x']), list(data['summary_x']), list(data['body_x']), list(data['label']))]
    print('Sampling Done')
    return zip(*combined)



def stratSpl(X, y, test_size):
    """
    Make a stratified test/train split to use for training and testing.
    Hyperparameters: None
    
    :param X: Input features of the combined (train and test) sampled set. 
    :type X: List
    :param y: Input labels of the combined (train and test) sampled set
    :type y: List
    :param test_size: Test ratio to split up. Number between 0 and 1
    :type test_size: Float
    
    :return: 4 Lists corresponding to X_tr, X_te, y_tr, y_te
    :rtype: List*4
    """
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    for train_index, test_index in sss.split(X, y):
        X_tr, X_te = [X[i] for i in train_index], [X[i] for i in test_index]
        y_tr, y_te = [y[i] for i in train_index], [y[i] for i in test_index]
    
    print('Stratified test/train split done')
    return X_tr, y_tr, X_te, y_te



def read_corpus(data):
    """
    Prepare the data (using gensims simple_preprocess)
    Hyperparameters: None
    
    :param data: Document 
    :type X_tr: List
    
    :return: Processed version data.
    :rtype: Iterator
    
    """
    
    print('Tokenizing data')
    
    for i, line in enumerate(data):
        yield TaggedDocument(simple_preprocess(line), tags=[i])
    


def doc2vec_model_train(X_tr, vector_size, min_count, epochs, window):
    """
    Doc2Vec model defined and trained via this function. 
    Hyperparameters: size, min_count, epochs, window
    
    :param X_tr: Training data
    :type X_tr: List
    :param vector_size: Dimensionality of the feature vectors 
    :type vector_size: Int
    :param min_count: Minimum occurences for which to still keep a word in the vocab.
    :type min_count: Int
    :param epochs: Number of epochs for which the model trains
    :type epochs: Int
    :param window: Window size of context to consider in a given instance.
    :type window: Int
    
    :return: Fully trained model
    :rtype: gensim model
    """
    
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, window=window, epochs=epochs)
    model.build_vocab(X_tr)
    model.train(X_tr, total_examples=model.corpus_count, epochs=model.epochs)
    
    print('Doc2Vec Model Trained')
    
    return model


def embeddings(model, X, steps):
    
    """
    Embed documents into vector space for classification in the next stage.
    Hyperparameters: steps
    
    :param model: Trained Doc2Vec model
    :type model: gensim Doc2Vec model
    :param X: Input corpus
    :type X: List of TaggedDocuments
    :param steps: Hyperparameter to tune
    :type steps: Int
    
    :return: Embedded feature vector
    :rtype: List
    """
    
    z = [model.infer_vector(X[doc_id].words, steps=steps) for doc_id in range(len(X))]
    
    print('Documents embedded into vector space')
    
    return z


def FinalClassifier(X_tr, y_tr):
    
    """
    Models for final classifcation, will be hyperparameters
    Hyperparameters: The models themselves and their hyperparameters *Come back here for alteration
    
    :param X_tr: Input document vectors 
    :type X_tr: List
    :param y_tr: Labels
    :type: List
    
    :return: Trained logreg model
    :rtype: 
    """
    clf = LogisticRegression(random_state=42).fit(X_tr, y_tr)
    print('Final Classifier Trained.')
    
    return clf
    
    
def cross_val(clf, X_tr, y_tr):
    """
    Cross validation on training set. This will be used to score the grid search models and tune hyperparameters.
    Hyperparameters: None
    
    :param clf: Second stage classifier
    :type clf: Sklearn (or other) classifier
    :param X_tr: Training data emedded doc vectors
    :type X_tr: List
    :param y_tr: Training labels
    :type y_tr: 
    
    """
    scoring = ['f1', 'precision', 'recall', 'average_precision']
    
    scores = cross_validate(clf, X_tr, y_tr, cv=3, scoring=scoring)
    
    print('Cross val scores computed for this set of params.')
    
    return scores
    

def precision(conf):
    num = conf[0][1]
    den = num + conf[1][1]

    return num/den

def recall(conf):
    num = conf[1][1]
    den = num + conf[1][0]
    
    return num/den

def F1(P, R):
    return 2 * P*R/(P+R)

def average(l):
    return functools.reduce(lambda x, y: x + y, l) / len(l)

def flatten(x):
    if isinstance(x, collections.Iterable) and not isinstance(x, tuple) and not isinstance(x, str) and not isinstance(x, dict):
            return [a for i in x for a in flatten(i)]
    else:
        return [x]

In [5]:
def unpack_kwargs(**kwargs):
    m_size = kwargs.pop('m_size')
    test_size = kwargs.pop('test_size')
    vector_size = kwargs.pop('vector_size')
    min_count = kwargs.pop('min_count')
    epochs = kwargs.pop('epochs')
    window = kwargs.pop('window')
    steps = kwargs.pop('steps')
    
    return m_size, test_size, vector_size, min_count, epochs, window, steps

## Full Pipeline

In [6]:
def full_pipeline(scores, **kwargs):
    
    
    # Change in case more HP
    m_size, test_size, vector_size, min_count, epochs, window, steps = unpack_kwargs(**kwargs)
    
    X, y = sampleData(merged, m_size)

    X_tr, y_tr, _, _ = stratSpl(X, y, test_size)

    X_tr = list(read_corpus(X_tr))
    
    ## start K fold HERE!
    
    skf = StratifiedKFold(n_splits=5, random_state=42)
    
    temp = []
    print('Cross validation commencing...')
    i = 0
    for train_index, test_index in skf.split(X_tr, y_tr):
        
        print('Split %r...' % i)
        
        X_tr_cv, X_te_cv = [X_tr[i] for i in train_index], [X_tr[i] for i in test_index]
        y_tr_cv, y_te_cv = [y_tr[i] for i in train_index], [y_tr[i] for i in test_index]
        
        d2v = doc2vec_model_train(X_tr_cv, vector_size, min_count, epochs, window)
        
        X_tr_cv = embeddings(d2v, X_tr_cv, steps)
        
        clf = FinalClassifier(X_tr_cv, y_tr_cv)
        
        X_te_cv = embeddings(d2v, X_te_cv, steps)
        
        y_pr, y_sc = clf.predict(X_te_cv), clf.decision_function(X_te_cv)
        
        conf = confusion_matrix(y_te_cv, y_pr)
        print(conf)
        
        p, r = precision(conf), recall(conf)
        f1, ap = F1(p, r), average_precision_score(y_te_cv, y_sc)
        
        temp.append([p, r, f1, ap])
        
        i += 1
        
    scores.append(temp)
    
    print('---------------------------------------------')
    
    return scores

## Grid Search

In [7]:
def product_dict(**kwargs):
    keys = kwargs.keys()
    vals = kwargs.values()
    
    for instance in itertools.product(*vals):
        yield dict(zip(keys, instance))
        
results = {}
scores=[]

for i, param in enumerate(list(product_dict(**params))):
    print('Checking set %r of parameters...' % i)
    
    scores = full_pipeline(scores, **param)
    
    results[i] = flatten([param, list(zip(*scores[i]))])

Checking set 0 of parameters...
Sampling Done
Stratified test/train split done
Tokenizing data
Cross validation commencing...
Split 0...
Doc2Vec Model Trained
Documents embedded into vector space
Final Classifier Trained.
Documents embedded into vector space
[[1531   69]
 [  38   47]]
Split 1...
Doc2Vec Model Trained
Documents embedded into vector space
Final Classifier Trained.
Documents embedded into vector space
[[1536   64]
 [  29   56]]
Split 2...
Doc2Vec Model Trained
Documents embedded into vector space
Final Classifier Trained.
Documents embedded into vector space
[[1547   53]
 [  29   56]]
Split 3...
Doc2Vec Model Trained
Documents embedded into vector space
Final Classifier Trained.
Documents embedded into vector space
[[1535   65]
 [  31   53]]
Split 4...
Doc2Vec Model Trained
Documents embedded into vector space
Final Classifier Trained.
Documents embedded into vector space
[[1535   65]
 [  32   52]]
------------------------


## Print Cross Val Results (all splits)

In [68]:
data = [[key] + [val for val in vals] for key, vals in results.items()]
    

pr = pd.DataFrame(data, columns=['Model #', 'Parameters', 'Precision', 'Recall',
                                 'F1', 'Average Precision'])

pd.set_option('display.max_colwidth', -1)

pr

Unnamed: 0,Model #,Parameters,Precision,Recall,F1,Average Precision
0,0,"{'m_size': 10000, 'test_size': 0.2, 'vector_size': 400, 'min_count': 2, 'epochs': 20, 'window': 10, 'steps': 20}","(0.5948275862068966, 0.5333333333333333, 0.48623853211009177, 0.5508474576271186, 0.5555555555555556)","(0.5529411764705883, 0.6588235294117647, 0.6588235294117647, 0.6309523809523809, 0.6190476190476191)","(0.5731200848281347, 0.5894736842105264, 0.5595249316617965, 0.5881850776848216, 0.5855855855855856)","(0.5248189136573098, 0.6500365275103567, 0.671273841553746, 0.583868009795207, 0.5999321680941284)"


## Print Cross Val Results(average)

In [69]:
pr_av = pr.copy()

In [71]:
pr_av['Precision'] = pr_av['Precision'].apply(average)

In [73]:
pr_av['Recall'] = pr_av['Recall'].apply(average)
pr_av['F1'] = pr_av['F1'].apply(average)
pr_av['Average Precision'] = pr_av['Average Precision'].apply(average)

In [74]:
pr_av

Unnamed: 0,Model #,Parameters,Precision,Recall,F1,Average Precision
0,0,"{'m_size': 10000, 'test_size': 0.2, 'vector_size': 400, 'min_count': 2, 'epochs': 20, 'window': 10, 'steps': 20}",0.54416,0.624118,0.579178,0.605986
