In [None]:
import gc
import os
import pickle
import re
import string
import sys
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn

from nltk import TextTilingTokenizer

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [None]:
# Put the directory three levels above the working directory on the import path so
# that the project's `src.*` modules can be imported by the cells below.
_levels_up = (os.pardir, '../../')
src_dir = os.path.join(os.getcwd(), *_levels_up)
sys.path.append(src_dir)

In [None]:
# Register the project modules with the autoreload extension. The first cell sets
# `%autoreload 1`, a mode in which only modules listed through `%aimport` are reloaded.
%aimport src.data.delicious_t140
%aimport src.helpers.labels
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics, src.utils.distances, src.utils.plotting,src.helpers.embeddings

In [None]:
from src.features.delicious_t140 import clean_text_delicious
from src.data.delicious_t140 import get_sample_from_cache
from src.helpers.labels import truncate_labels
from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking
from src.utils.clusters import k_medoids
from src.utils.distances import hausdorff
from src.utils.plotting import plot_micro_f1_at_k

from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments
from src.helpers.embeddings import read_glove_wiki_weighted

In [None]:
# Filesystem locations. The relative ones are made absolute with respect to the
# notebook's current working directory.
# MODELS_ROOT: where the TF-IDF distance-matrix cache is stored further down.
MODELS_ROOT = os.path.abspath("../../../models/ranking/delicious-mimlsvm/")
# NOTE(review): absolute, machine-specific path; it is not referenced by any other
# cell visible in this notebook -- consider making it configurable or removing it.
DATA_ROOT = "/media/felipe/SAMSUNG/delicious/delicioust140"
# INTERIM_DATA_ROOT: where the cached documents and bag-of-embeddings pickles live.
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/delicious-t140/")

# Vocabulary cap for the TF-IDF vectorizer; also part of the docs_df cache file name.
MAX_NB_WORDS = 500
# Seed for numpy's global RNG; also part of the docs_df cache file name.
SEED= 42
# Threshold handed to truncate_labels (presumably the minimum number of documents a
# tag must appear in to be kept -- TODO confirm against src.helpers.labels).
MIN_TAG_DF=10
# Only used to build cache file names in this notebook (presumably the sampled
# percentage of the dataset -- TODO confirm).
SAMPLE_FRAC=10
# Embedding dimensionality requested from read_glove_wiki_weighted.
EMBEDDINGS_DIM=100
# NOTE(review): W and K are not referenced by any other cell visible in this notebook.
W=20 # Pseudosentence size (in words) - not specified in the paper, taken from TextTiling default values
K=10 # Size (in sentences) of the block used in the block comparison method - not specified in the paper, taken from TextTiling default values

In [None]:
# Seed numpy's global random state. train_test_split is called further down without
# a `random_state`, so it draws from this global state and the train/validation split
# depends on this call having been executed first.
np.random.seed(SEED)

In [None]:
%%time

# Pickled DataFrame providing the 'segments', 'contents' and 'tags' columns that the
# following cells read. This notebook does not build it; it only loads it.
cache_path = INTERIM_DATA_ROOT+"/docs_df_with_segments-{}-sample-{}.p".format(MAX_NB_WORDS,SEED)

if os.path.isfile(cache_path):
    print('cache hit')
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    with open(cache_path, "rb") as cache_file:
        docs_df = pickle.load(cache_file)
else:
    # Stop here with an actionable message; otherwise the next cell fails with an
    # unrelated-looking NameError because `docs_df` was never defined.
    raise FileNotFoundError(
        "cache miss. run again mimlsvm-tf-idf (expected file: {})".format(cache_path))

In [None]:
# Pull the per-document segments and the raw contents out of the cached frame.
segments, documents = (docs_df[column].values for column in ('segments', 'contents'))

# Each document's tags are stored as one comma-separated string: split them into a
# list per document and pass the result through truncate_labels with MIN_TAG_DF.
labels = truncate_labels(
    docs_df["tags"].map(lambda tagstring: tagstring.split(",")),
    MIN_TAG_DF)

In [None]:
# The split happens once, outside of the parameter-grid loops, because segments,
# documents and label sets are identical for every configuration.
(segments_train, segments_val,
 documents_train, documents_val,
 Y_train, Y_val) = train_test_split(segments, documents, labels, test_size=0.15)

# Learn the tag vocabulary from the complete label collection, then encode both label
# subsets with the fitted binarizer.
mlb = MultiLabelBinarizer().fit(labels)
Y_train, Y_val = [mlb.transform(label_sets) for label_sets in (Y_train, Y_val)]

for message, count in (
        ('total number of train documents: {}', len(documents_train)),
        ('total number of validation documents: {}', len(documents_val)),
        ("total number of unique tags: {} ", len(mlb.classes_))):
    print(message.format(count))

In [None]:
# Fit a TF-IDF vocabulary capped at MAX_NB_WORDS terms. Its IDF values are used below
# as the `weight_index` handed to read_glove_wiki_weighted.
# NOTE(review): the vectorizer is fit on *all* documents, so the validation documents
# also contribute to the vocabulary and the IDF statistics -- confirm this is intended.
vect = TfidfVectorizer(max_features=MAX_NB_WORDS)
vect.fit(documents)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `get_feature_names_out`; support both so the cell runs on old and new versions.
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

idf = vect.idf_
# token -> IDF weight, in vocabulary order
idf_index = dict(zip(feature_names, idf))

In [None]:
# The vectorizer's analyzer bundles preprocessing and tokenization in one callable.
tokenize_func = vect.build_analyzer()

def tokenize(string):
    """Return the tokens produced for ``string`` by the vectorizer's analyzer."""
    tokens = tokenize_func(string)
    return tokens

In [None]:
%%time
# One entry per training document; each entry holds one token list per segment.
X_train_tok = [
    [tokenize(segment) for segment in document_segments]
    for document_segments in segments_train
]

In [None]:
%%time
# One entry per validation document; each entry holds one token list per segment.
X_val_tok = [
    [tokenize(segment) for segment in document_segments]
    for document_segments in segments_val
]

### transform into embeddings

In [None]:
# Load the embeddings, passing the IDF table as the weight index. The result is
# looked up by token in build_bag_of_weighted_embeddings.
embeddings_index = read_glove_wiki_weighted(weight_index=idf_index, d=EMBEDDINGS_DIM)

In [None]:
def build_bag_of_weighted_embeddings(tokens):
    """
    Average the embedding vectors of the given tokens.

    Tokens without an entry in ``embeddings_index`` are skipped. When no token has an
    entry, the mean is taken over an empty array: numpy emits a RuntimeWarning for
    that case, which the calling cells turn into an exception to detect it.

    :param tokens: iterable of tokens belonging to one segment
    :return: element-wise mean (over axis 0) of the vectors that were found
    """
    known_vectors = []
    for token in tokens:
        if token in embeddings_index.keys():
            known_vectors.append(embeddings_index[token])

    stacked = np.array(known_vectors)
    return np.mean(stacked, axis=0)

In [None]:
# Bag-of-embeddings representation of the training documents: one list per document,
# holding one averaged embedding vector per usable segment. Cached on disk.
cache_path = INTERIM_DATA_ROOT+"/mimlsvm/mimlsvm-embeddings/X_train_boe_segments-sample-{}.p".format(SAMPLE_FRAC)

if os.path.isfile(cache_path):
    print('cache hit')
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    with open(cache_path, 'rb') as cache_file:
        X_train_boe_segments = pickle.load(cache_file)
else:
    X_train_boe_segments = list()

    # Averaging an empty list of vectors makes numpy emit a RuntimeWarning. Promote
    # only that category to an exception so it can be caught below; other warning
    # categories keep their normal behaviour instead of aborting the cell.
    with warnings.catch_warnings():
        warnings.simplefilter('error', category=RuntimeWarning)

        for tokenized_document_segments in X_train_tok:
            document_segments_boe = list()

            for seg in tokenized_document_segments:
                if len(seg) == 0:
                    print('empty segment')
                    continue

                try:
                    boe = build_bag_of_weighted_embeddings(seg)
                except RuntimeWarning:
                    print('segment entirely with OOV words')
                    continue

                document_segments_boe.append(boe)

            # Appended even when empty, so positions stay aligned with X_train_tok.
            X_train_boe_segments.append(document_segments_boe)

    # Create the cache directory on first use and close the file handle deterministically.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(X_train_boe_segments, cache_file)

In [None]:
# Bag-of-embeddings representation of the validation documents: one list per
# document, holding one averaged embedding vector per usable segment. Cached on disk.
cache_path = INTERIM_DATA_ROOT+"/mimlsvm/mimlsvm-embeddings/X_val_boe_segments-sample-{}.p".format(SAMPLE_FRAC)

if os.path.isfile(cache_path):
    print('cache hit')
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    with open(cache_path, 'rb') as cache_file:
        X_val_boe_segments = pickle.load(cache_file)
else:
    X_val_boe_segments = list()

    # Averaging an empty list of vectors makes numpy emit a RuntimeWarning. Promote
    # only that category to an exception so it can be caught below; other warning
    # categories keep their normal behaviour instead of aborting the cell.
    with warnings.catch_warnings():
        warnings.simplefilter('error', category=RuntimeWarning)

        for tokenized_document_segments in X_val_tok:
            document_segments_boe = list()

            for seg in tokenized_document_segments:
                if len(seg) == 0:
                    print('empty segment')
                    continue

                try:
                    boe = build_bag_of_weighted_embeddings(seg)
                except RuntimeWarning:
                    print('segment entirely with OOV words')
                    continue

                document_segments_boe.append(boe)

            # Appended even when empty, so positions stay aligned with X_val_tok.
            X_val_boe_segments.append(document_segments_boe)

    # Create the cache directory on first use and close the file handle deterministically.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(X_val_boe_segments, cache_file)

In [None]:
def make_distance_matrix_for_embedding_segments(vectorized_segments, distance='hausdorff'):
    """
    Build the pairwise distance matrix between documents represented as bags of segments.

    Every ordered pair (i, j) is evaluated, the diagonal included; no symmetry of the
    distance is assumed.

    :param vectorized_segments: sequence of length M; element i is the bag of segment
        vectors of document i
    :param distance: name of the bag-to-bag distance; only 'hausdorff' (case-insensitive,
        surrounding whitespace ignored) is accepted
    :return: MxM numpy array whose entry [i][j] is the distance from document i to document j
    :raises Exception: if any other distance name is requested
    """
    requested = distance.lower().strip()
    if not requested == 'hausdorff':
        raise Exception("Only 'hausdorff' distance supported right now.")

    num_samples = len(vectorized_segments)
    distance_matrix = np.zeros((num_samples, num_samples))

    for row, bag_a in enumerate(vectorized_segments):
        for col, bag_b in enumerate(vectorized_segments):
            distance_matrix[row][col] = hausdorff(bag_a, bag_b)

    return distance_matrix

In [None]:
def make_train_dataset(distance_matrix, medoid_indices):
    """
    Keep only the medoid columns of a pairwise distance matrix.

    The result has one row per sample and one column per medoid, so entry [i][j] is
    the distance from sample i to medoid j.

    :param distance_matrix: MxM matrix with pairwise distances
    :param medoid_indices: array of length N with the index of each cluster's medoid
    :return: distances to medoids (MxN matrix)
    """
    every_row = slice(None)
    return distance_matrix[every_row, medoid_indices]

In [None]:
def _to_dense_bag(bag):
    """Return a bag of segment vectors as a dense array (sparse matrices are densified)."""
    if hasattr(bag, 'toarray'):
        return bag.toarray()
    return np.asarray(bag)


def make_validation_dataset(source_vectorized_segments, medoid_vectorized_segments):
    """
    Calculates the distances from every source document (represented by its segments) to
    every medoid document (also represented by its segments) using the hausdorff distance.

    Returns a matrix where element Aij contains the distance from sample i to medoid j.

    Each bag may be either an object exposing ``toarray()`` (e.g. a sparse matrix of
    TF-IDF segment vectors) or a dense sequence of equally sized vectors (e.g. the
    lists of averaged embeddings built in this notebook). Previously only the former
    worked: a plain list raised AttributeError on ``.toarray()``.

    :param source_vectorized_segments: sequence of length M, where each element holds one
        row/vector for every segment in a source document
    :param medoid_vectorized_segments: sequence of length N, where each element holds one
        row/vector for every segment in a medoid document
    :return: distances to medoids (MxN numpy array)
    """
    # Densify every medoid once, rather than once per (source, medoid) pair.
    dense_medoids = [_to_dense_bag(medoid) for medoid in medoid_vectorized_segments]

    num_test_samples = len(source_vectorized_segments)
    num_medoids = len(dense_medoids)

    test_dataset = np.zeros((num_test_samples, num_medoids))

    for i, source_segments in enumerate(source_vectorized_segments):
        dense_source = _to_dense_bag(source_segments)
        for j, dense_medoid in enumerate(dense_medoids):
            test_dataset[i][j] = hausdorff(dense_source, dense_medoid)

    return test_dataset

### Final hyperparameters (these were found by grid search)

In [None]:
# Single-point parameter grid: every option lists exactly one value, so ParameterGrid
# yields one configuration. Each key is read by name in the training loops below.
final_parameters = [
    dict(
        medoid_normalization=[None],
        svm_kernel=['poly'],
        svm_c=[1.0],
        svm_degree=[3],
        svm_gamma=['auto'],
        vectorizer_norm=[None],
        nb_medoids_ratio=[0.2],
        max_features=[500],
    )
]

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Pairwise distances between the training documents' bags of embeddings.
# This matrix used to be picked up from whatever `dist_matrix_train` happened to exist
# in the kernel (it is only assigned by the TF-IDF cell further down), so on a fresh
# kernel the cell failed with a NameError, and otherwise it clustered the wrong
# matrix. It is now computed here, once, and cached because it needs M*M comparisons.
path_to_cache = MODELS_ROOT.rstrip('/') + "/distance-matrix-train-embeddings-{}-{}.p".format(
    EMBEDDINGS_DIM,
    SAMPLE_FRAC)

if os.path.isfile(path_to_cache):
    print('cache hit')
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    with open(path_to_cache, "rb") as cache_file:
        dist_matrix_train = pickle.load(cache_file)
else:
    print('Fitting distance matrix for embeddings')
    dist_matrix_train = make_distance_matrix_for_embedding_segments(X_train_boe_segments)
    os.makedirs(os.path.dirname(path_to_cache), exist_ok=True)
    with open(path_to_cache, "wb") as cache_file:
        pickle.dump(dist_matrix_train, cache_file)

# A cached matrix or cached embeddings from a different split would silently misalign
# samples and labels; fail early instead.
assert dist_matrix_train.shape[0] == len(X_train_boe_segments) == Y_train.shape[0], \
    "distance matrix, training bags and training labels disagree on the number of documents"

# Maps the 'medoid_normalization' option to a scaler class; any other value (e.g. None)
# means the distances are used unscaled.
scalers = {'standard': StandardScaler, 'minmax': MinMaxScaler}

ks = [1,2,3,4,5,6,7,8,9,10]

# tqdm wraps the grid itself (which has a length) so the progress bar knows its total.
for (i,configuration) in enumerate(tqdm(ParameterGrid(final_parameters))):

    # nb_medoids depends upon the dataset length
    ratio = configuration['nb_medoids_ratio']
    nb_medoids = int(len(documents_train) * ratio)

    # indices of the training documents chosen as medoids
    medoids_indices_train = k_medoids(dist_matrix_train,nb_medoids)[0]

    # a matrix where element Aij contains the distance from sample i to medoid j.
    X_train = make_train_dataset(dist_matrix_train,medoids_indices_train)

    # VALIDATION SET: distances from every validation document to the same medoids
    fitted_medoids = [X_train_boe_segments[medoid_index] for medoid_index in medoids_indices_train]
    X_val = make_validation_dataset(X_val_boe_segments,fitted_medoids)

    svm = SVC(kernel=configuration['svm_kernel'],
            gamma=configuration['svm_gamma'],
            C=configuration['svm_c'],
            degree=configuration['svm_degree'])

    # CalibratedClassifierCV provides the predict_proba used for ranking the tags.
    clf = OneVsRestClassifier(CalibratedClassifierCV(svm,cv=2),n_jobs=-1)

    scaler_class = scalers.get(configuration['medoid_normalization'])
    if scaler_class is None:
        X_train_final = X_train
        X_val_final = X_val
    else:
        # fit the scaler on the training distances only, then apply it to validation
        scaler = scaler_class()
        X_train_final = scaler.fit_transform(X_train)
        X_val_final = scaler.transform(X_val)

    # Y_train was defined outside the loop.
    # Train and predict on the (optionally) normalized matrices: the previous code
    # computed them and then used the raw X_train / X_val, ignoring the option.
    clf.fit(X_train_final,Y_train)

    # validation score
    Y_pred_val = clf.predict_proba(X_val_final)

    print("iter: {}, configuration: {}\n".format(i,configuration))

    for k in ks:
        print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k,normalize=True)))

In [None]:
# TF-IDF variant of the experiment: each segment is represented by its TF-IDF vector
# instead of an averaged embedding.
# Fixes compared to the previous version of this cell:
#   * CountVectorizer does not accept `norm` (TypeError); the TF-IDF vectorizer the
#     variable names refer to is used instead.
#   * `segments_test`, `make_test_dataset` and `Y_test` do not exist in this notebook;
#     the validation split and make_validation_dataset are used.
#   * the normalized matrices are actually used for fitting and predicting.

# Maps the 'medoid_normalization' option to a scaler class; any other value (e.g. None)
# means the distances are used unscaled.
scalers = {'standard': StandardScaler, 'minmax': MinMaxScaler}

ks = [1,2,3,4,5,6,7,8,9,10]

for (i,configuration) in enumerate(ParameterGrid(final_parameters)):

    tfidf_vectorizer = TfidfVectorizer(
        max_features=configuration['max_features'],
        norm=configuration['vectorizer_norm'])

    # TRAINING SET
    tfidf_vectorizer.fit(documents_train)
    tfidf_segments_train = vectorize_segments(segments_train, tfidf_vectorizer)

    # THE FOLLOWING BLOCK TAKES SOME TIME, BUT IT WILL ONLY RUN ONCE

    path_to_cache = MODELS_ROOT.rstrip('/') + "/distance-matrix-train-{}-{}-{}.p".format(
        configuration['max_features'],
        configuration['vectorizer_norm'],
        SAMPLE_FRAC)

    if os.path.isfile(path_to_cache):
        print('cache hit')
        # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
        with open(path_to_cache,"rb") as cache_file:
            dist_matrix_train = pickle.load(cache_file)
    else:
        print('Fitting distance matrix for norm={}'.format(configuration['vectorizer_norm']))

        dist_matrix_train = make_distance_matrix_for_segments(tfidf_segments_train)
        os.makedirs(os.path.dirname(path_to_cache), exist_ok=True)
        with open(path_to_cache, "wb") as cache_file:
            pickle.dump(dist_matrix_train, cache_file)

    # nb_medoids depends upon the dataset length
    ratio = configuration['nb_medoids_ratio']
    nb_medoids = int(len(tfidf_segments_train) * ratio)

    medoids_indices_train = k_medoids(dist_matrix_train,nb_medoids)[0]

    # a matrix where element Aij contains the distance from sample i to medoid j.
    X_train = make_train_dataset(dist_matrix_train,medoids_indices_train)

    # VALIDATION SET

    tfidf_segments_val = vectorize_segments(segments_val, tfidf_vectorizer)

    # medoids trained on the training set; element-wise lookup works whether
    # vectorize_segments returns a list or an array
    fitted_medoids = [tfidf_segments_train[medoid_index] for medoid_index in medoids_indices_train]
    # NOTE(review): assumes each vectorized bag is accepted by make_validation_dataset
    # (e.g. a sparse matrix exposing .toarray()) -- confirm against vectorize_segments.
    X_val = make_validation_dataset(tfidf_segments_val,fitted_medoids)

    svm = SVC(kernel=configuration['svm_kernel'],
            gamma=configuration['svm_gamma'],
            C=configuration['svm_c'],
            degree=configuration['svm_degree'])

    # CalibratedClassifierCV provides the predict_proba used for ranking the tags.
    clf = OneVsRestClassifier(CalibratedClassifierCV(svm,cv=2),n_jobs=-1)

    scaler_class = scalers.get(configuration['medoid_normalization'])
    if scaler_class is None:
        X_train_final = X_train
        X_val_final = X_val
    else:
        # fit the scaler on the training distances only, then apply it to validation
        scaler = scaler_class()
        X_train_final = scaler.fit_transform(X_train)
        X_val_final = scaler.transform(X_val)

    # Y_train was defined outside the loop
    clf.fit(X_train_final,Y_train)

    # train score
    Y_pred_train = clf.predict_proba(X_train_final)

    # validation score
    Y_pred_val = clf.predict_proba(X_val_final)

    print("iter: {}, configuration: {}\n".format(i,configuration))

    for k in ks:
        print("train micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_train,Y_pred_train,k=k,normalize=True)))
        print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k,normalize=True)))

In [None]:
# Draw the micro-F1@k curve on an explicit 7x5 inch figure instead of relying on
# pyplot's implicit "current figure/axes" state.
fig, ax = plt.subplots(figsize=(7, 5))

# Scores handed to plot_micro_f1_at_k. Left as an empty placeholder: fill in the
# validation micro-F1@k values printed by the training cell.
validation_scores = [

]
plot_micro_f1_at_k(validation_scores,ax)

# Remove the legend only if one was actually created; `legend_` is None otherwise
# and calling .remove() on it raised AttributeError.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.show()