# Classification  - Testing K Nearest Neighbors
## Try this model with word vectors of candidates generated via word2vec
## First test on candidates from the 50-document test set
## Then test the KNN classifier on polyNER's candidates from the 100 ground truth documents 
## Finally test on classifying a test set of manually extracted polymers from the ground truth documents (only true positives)

In [None]:
# -*- coding: utf-8 -*-
import os
import re
import sys
import csv
import math
import pickle
import numpy as np
import pandas as pd
import sqlite3
from   __future__ import division
import matplotlib.pyplot as plt
from   matplotlib.colors import ListedColormap
from   sklearn import neighbors, datasets
import gensim, logging

from   sklearn import svm
from   sklearn.svm import SVR
from   sklearn.svm import SVC
from   sklearn.metrics import f1_score, precision_score, recall_score
import sklearn
import spacy

from   sklearn.model_selection import train_test_split
from   sklearn.model_selection import GridSearchCV
from   sklearn.metrics import classification_report
from   sklearn.neural_network import MLPClassifier

from   sklearn.manifold import TSNE

from   sklearn.ensemble import RandomForestRegressor
from   sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
import scipy as sp
import cPickle as pkl
import numpy as np
import matplotlib.pyplot as plt

from   sklearn.ensemble import RandomForestClassifier

# This log shows progress and is very useful
# (INFO-level messages are printed with a timestamp and level prefix)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load spacy for candidate processing
# refine_candidate() later tests candidates for membership in nlp.vocab
nlp = spacy.load("en_core_web_sm")
# Adding these into the vocabulary
# and explicitly marking them as non-stop-words.
# NOTE(review): presumably indexing nlp.vocab with an unseen string creates
# its lexeme entry as a side effect -- confirm against the spaCy Vocab docs.
nlp.vocab[u"diblock"].is_stop = False
nlp.vocab[u"g/mol"].is_stop   = False
nlp.vocab[u"kg/mol"].is_stop  = False

In [None]:
# Connect to db
def connect_to_db():
    """Open the sentences database and return the connection.

    The path is relative to the notebook's working directory. The result is
    whatever create_connection() returns for that path.
    """
    return create_connection("../../db/sentences.db")

def create_connection(db_file):
    """Open a SQLite connection to db_file.

    Returns the sqlite3 connection, or None (after printing the error) when
    the database cannot be opened.
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # The handler used to name a bare `Error`, which is not imported
        # anywhere, so a failed connect surfaced as NameError instead of
        # being reported here.
        print(e)
    return None

In [None]:
# Get candidate labels
def get_candidate_labels_from_db(conn, process_full_document_flag):
    """Return a {candidate: is_poly} dict of labelled candidates.

    conn: open connection to a database holding a polymer_ner_evaluation
        table with polymer and is_poly columns.
    process_full_document_flag: when true every labelled candidate is kept;
        otherwise only candidates that also appear in the pipeline
        candidate CSV are kept.
    """
    cur = conn.cursor()
    cur.execute("select distinct polymer, is_poly from polymer_ner_evaluation")
    rows = cur.fetchall()

    if process_full_document_flag:
        # The pipeline CSV plays no part in this mode, so it is not read.
        return {row[0]: row[1] for row in rows}

    # read candidates from just fasttext-classified sentences; a set makes
    # the per-row membership test O(1) instead of a list scan
    pipeline_polymers = set(readcsv('../candidates/perdocformat/classifier_pipeline_candidates.csv'))
    return {row[0]: row[1] for row in rows if row[0] in pipeline_polymers}

# Read candidate input files
def readcsv(ifile):
    """Return the distinct candidate strings found in a UTF-8 CSV file.

    The first cell of every row is skipped (presumably a document
    identifier -- confirm against the candidate files); all remaining cells
    are collected. The result is a list of unique unicode strings in no
    particular order.
    """
    if str is bytes:
        # Python 2: the csv module only handles byte strings, so read raw
        # bytes and decode each cell afterwards.
        with open(ifile, 'rb') as csvfile:
            rows = [[cell.decode('utf-8') for cell in row]
                    for row in csv.reader(csvfile)]
    else:
        # Python 3: csv wants text; newline='' leaves line-break handling
        # inside quoted cells to the csv module.
        with open(ifile, newline='', encoding='utf-8') as csvfile:
            rows = list(csv.reader(csvfile))

    polymer_candidates = set()
    for row in rows:
        polymer_candidates.update(row[1:])
    return list(polymer_candidates)

In [None]:
def get_training_testing_data(features, target):
    """Split features/target into train and test arrays.

    Both inputs are converted to numpy arrays, then split with 10% held out
    for testing and a fixed random_state of 0.

    Returns (Xtrain, Xtest, ytrain, ytest).
    """
    feature_matrix = np.asarray(features)
    target_vector = np.asarray(target)
    return tuple(train_test_split(
        feature_matrix, target_vector, test_size=0.1, random_state=0))

In [None]:
# Get metrics - I think I can do that with scikit learn
def metrics(predicted, actual):
    """Print a confusion summary for binary labels and return the F-1 score.

    predicted, actual: sequences of 0/1 labels, expected to have the same
        length. A pair containing any other value is counted in none of
        the four cells.

    Returns the F-1 score. It is 0 when there are positive predictions but
    no true positives, and the sentinel -1 when nothing was predicted
    positive (precision and recall are then printed as 0 and 1).
    """
    true_pos = false_pos = true_neg = false_neg = 0
    for guess, truth in zip(predicted, actual):
        if guess == 1 and truth == 1:
            true_pos += 1
        elif guess == 1 and truth == 0:
            false_pos += 1
        elif guess == 0 and truth == 0:
            true_neg += 1
        elif guess == 0 and truth == 1:
            false_neg += 1

    print('    Test points:     %d' % len(predicted))
    print('    True positives:  %d' % true_pos)
    print('    False positive:  %d' % false_pos)
    print('    True negatives:  %d' % true_neg)
    print('    False negatives: %d' % false_neg)
    if true_pos > 0:
        # float() keeps these true divisions even where the file-level
        # `from __future__ import division` is not in effect.
        precision = true_pos / float(true_pos + false_pos)
        recall = true_pos / float(true_pos + false_neg)
        f1score = 2 / ((1 / recall) + (1 / precision))
    elif false_pos > 0:
        # Positive predictions but none correct: precision and recall are
        # both zero, and the harmonic-mean formula above would divide by
        # zero, so the score is set directly.
        precision = 0
        recall = 0
        f1score = 0
    else: #FIXME
        # Nothing was predicted positive; placeholder values kept as-is.
        precision = 0
        recall = 1
        f1score = -1 #FIXME: check
    print('    Precision:       %.3f' % precision)
    print('    Recall:          %.3f' % recall)
    print('    F-1 score:       %.3f' %f1score)
    return f1score

In [None]:
# Check if a string is a number
def is_number(n):
    """Return True when float() accepts n, False when it raises ValueError."""
    try:
        float(n)
    except ValueError:
        return False
    else:
        return True

# Filter out candidates that look like ordinary words rather than polymer names
def refine_candidate(candidate):
    """Return candidate unchanged, or the string "ignore" if it looks like junk.

    A candidate is ignored when:
      * it is in the spaCy vocabulary and its lower-cased form is not a
        common polymer name,
      * it is one of the frequent context words, or
      * any piece obtained by splitting on space, ':', ';' or '-' (other
        than "poly" or a number) contains "standard" or is a spaCy
        vocabulary entry that is not in the common-polymer set.

    The candidate is no longer looked up in model.wv.vocab: the result of
    that lookup was never used, and it raised KeyError for candidates
    missing from the word2vec vocabulary.
    """
    # Top context words in DB
    frequent_context_words = frozenset(["weight","material","system","chains","samples", "systems","Tg","weights","comparison","chromatography","Mn","THF","toluene","GPC","chloroform","index","Column","columns","standards","reference","segments","polydispersity","substrate","block","components","permeation","component","Mw","bulk","standard","calibration","dynamics","cross-linked","cells","domains","segment","mixtures","densities","substrates","well-defined","silica","SEC","particles","compositions","surfaces","linear"])

    # These are polymers that could appear within spacy vocab (stored lower-cased)
    common_polys = frozenset(
        polymer.lower() for polymer in
        ['polyethylene', 'polyurethane', 'polypropylene', 'polyester', 'PS', 'polystyrene', 'PLA', 'PI', 'PET', 'PVP', 'PEG', 'cellulose', 'PAN', 'methyl'])

    if (candidate in nlp.vocab) and candidate.lower() not in common_polys:
        return "ignore"

    if candidate in frequent_context_words:
        return "ignore"

    # Removing items that are sentences within parenthesis
    for item in re.split(' |:|;|-', candidate):
        if item == "poly" or is_number(item):
            continue
        # NOTE(review): unlike the whole-candidate check above, `item` is
        # compared case-sensitively against the lower-cased common_polys,
        # so e.g. an upper-case acronym piece never matches. Kept as-is
        # because changing it would alter which candidates are classified.
        if "standard" in item or (item in nlp.vocab and item not in common_polys):
            return "ignore"

    return candidate

In [None]:
# First use vectors as features
def get_word_vectors_as_features(polymer_candidates, refined_candidates_only_flag):
    """Build parallel feature and target lists from a {candidate: label} dict.

    polymer_candidates: dict mapping candidate string to its label.
    refined_candidates_only_flag: when true, candidates for which
        refine_candidate() returns "ignore" are skipped.

    Returns (Xl, yl): the candidates' entries in the module-level
    word_vectors, and the matching labels, in the same order.
    """
    Xl = []  # vectors
    yl = []  # target (is_poly=0:1?)
    for name, label in polymer_candidates.items():
        if refined_candidates_only_flag and refine_candidate(name) == "ignore":
            continue
        Xl.append(word_vectors[name])
        yl.append(label)
    return Xl, yl


In [None]:
# Select data to work with, subject to these flags:
#   process_full_document_flag:   Work with all sentences from a document (True) or only fasttext selected sentences (False)
#   refined_candidates_only_flag: Refine the candidates (remove english words, remove frequent context words etc): True or False
#   use_word_vector_flag:         Use word vectors (True) or similarity to PS scores (False)
def select_fulldoc_refined_featuretype(conn, process_full_document_flag, refined_candidates_only_flag, use_word_vector_flag):
    """Fetch the labelled candidates and turn them into features and targets.

    Candidates come from get_candidate_labels_from_db(); features are built
    by get_word_vectors_as_features() when use_word_vector_flag is true and
    by get_similarity_scores_as_features() otherwise.

    Returns (X, y).
    """
    labelled_candidates = get_candidate_labels_from_db(conn, process_full_document_flag)

    # Only the chosen extractor's name is looked up
    build_features = (get_word_vectors_as_features if use_word_vector_flag
                      else get_similarity_scores_as_features)
    X, y = build_features(labelled_candidates, refined_candidates_only_flag)
    return X, y

In [None]:
# Run all classifiers on a particular type of input
def run_knn_classifiers(X,y):
    """Split X/y into train and test sets, then fit and score the KNN model.

    Returns (model name, F-1 score, fitted classifier).
    """
    split = get_training_testing_data(X, y)

    print('K Nearest Neighbor:')
    score, fitted = knn(*split)

    return ('K Nearest Neighbor', score, fitted)

In [None]:
# Load the pre trained gensim model
# (path is relative to the notebook's working directory; later cells read
# the result through the global name `model`)
model = gensim.models.Word2Vec.load('../../models/gensim_cbow.bin')

In [None]:
# Get word vectors
# (indexed by candidate string in the feature-building functions)
word_vectors = model.wv
# Get vocab
# Materialised as a list of the vocabulary keys;
# get_external_candidates_vectors() tests membership against it.
vocabulary = list(model.wv.vocab)

In [None]:
# Define KNN, SVC, and RF models

def knn(Xtrain,Xtest, ytrain, ytest):
    """Fit a 5-neighbour, uniformly weighted KNN classifier and score it.

    The classifier is trained on Xtrain/ytrain; its predictions for Xtest
    are compared with ytest by metrics().

    Returns (F-1 score from metrics(), fitted classifier).
    """
    # Number of neighbors 5 seems to work best
    clf = neighbors.KNeighborsClassifier(5, weights='uniform')
    clf.fit(Xtrain, ytrain)

    return metrics(clf.predict(Xtest), ytest), clf


def run_all_knn_models(connection, process_full_document_flag, refined_candidates_only_flag, use_word_vector_flag):
    """Run the KNN classifier for one configuration and record the outcome.

    The [model name, F-1 score, fitted classifier] list is stored in the
    module-level `results` dict under a key built from the three flags.
    Nothing is returned.
    """
    refinement = 'refined' if refined_candidates_only_flag else 'unrefined'

    print('Running all classifiers, with %s candidates; %s; %s' % (
        refinement,
        'word vectors' if use_word_vector_flag else 'score vectors',
        'full documents' if process_full_document_flag else 'classified sentences'))

    X, y = select_fulldoc_refined_featuretype(
        connection, process_full_document_flag,
        refined_candidates_only_flag, use_word_vector_flag)
    best_model_name, max_f1_score, best_model_bin = run_knn_classifiers(X, y)

    key = '_'.join([
        'fulldoc' if process_full_document_flag else 'classified_sentences',
        refinement,
        'words' if use_word_vector_flag else 'scores'])
    results[key] = [best_model_name, max_f1_score, best_model_bin]

## Generate and run KNN models four times

Run the KNN classifier for the following 2 x 2 = 4 configurations (word vectors are used as features in all of them):
1. (all docs vs. fasttext-selected)
1. (refined vs. unrefined)

In [None]:
# Keep a list of best models and best F scores
# Maps a configuration key to [model name, f-score, fitted classifier];
# filled in by run_all_knn_models()
results = {}

In [None]:
# Connects to db
# (the same connection is passed to every run_all_knn_models() call)
connection = connect_to_db()

In [None]:
# Configuration: full documents, unrefined candidates, word vectors
process_full_document_flag   = True
refined_candidates_only_flag = False
use_word_vector_flag         = True

run_all_knn_models(connection, process_full_document_flag, refined_candidates_only_flag, use_word_vector_flag)

In [None]:
# Configuration: classified sentences only, unrefined candidates, word vectors
process_full_document_flag   = False
refined_candidates_only_flag = False
use_word_vector_flag         = True

run_all_knn_models(connection, process_full_document_flag, refined_candidates_only_flag, use_word_vector_flag)

In [None]:
# Configuration: full documents, refined candidates, word vectors
process_full_document_flag   = True
refined_candidates_only_flag = True
use_word_vector_flag         = True

run_all_knn_models(connection, process_full_document_flag, refined_candidates_only_flag, use_word_vector_flag)

In [None]:
# Configuration: classified sentences only, refined candidates, word vectors
process_full_document_flag   = False
refined_candidates_only_flag = True
use_word_vector_flag         = True

run_all_knn_models(connection, process_full_document_flag, refined_candidates_only_flag, use_word_vector_flag)

In [None]:
# Report every configuration's f-score, then pickle the best classifier.
max_score = 0.0
# Pre-bind the "best so far" names so that an empty `results`, or one with
# no score above zero, is reported below instead of raising NameError.
max_method = max_config = max_clf = None

for r in results:
    (method, f_score, clf) = results[r]
    if f_score > max_score:
        max_method = method
        max_config = r
        max_score = f_score
        max_clf = clf
    print('%s: KNN classifier achieves the best f-score of %.3f' % (r, f_score))

filename = 'best_knn_model.clf'
if max_clf is None:
    print('\nNo KNN configuration scored above %.3f; %s was not written' % (max_score, filename))
else:
    print('\nBest overall KNN score was %.3f for %s ' %(max_score, max_config))
    # `with` closes the file even if pickling fails part-way
    with open(filename, 'wb') as model_file:
        pickle.dump(max_clf, model_file)

In [None]:
# Now that the classifier has been tested on labeled candidates, 
# it needs to be tested using candidates from the ground truth
# I had the option to get candidates from full doc, but polyNER
# downsamples the sentences, to evaluate polyNER, we should really
# only consider pipeline candidates
def get_ground_truth_candidates(full_doc):
    """Return a {candidate: -1} dict for the ground-truth documents.

    full_doc: 1 selects the candidates extracted from the full documents;
        any other value selects the pipeline candidates.

    Every value is -1. Only the CSV for the selected source is read.
    """
    if full_doc == 1:
        # read candidates from the full documents
        source = '../candidates/perdocformat/groundtruth_fulldocument_candidates.csv'
    else:
        # read candidates from just fasttext sentences
        source = '../candidates/perdocformat/groundtruth_pipeline_candidates.csv'

    # Store candidates in dict
    return dict.fromkeys(readcsv(source), -1)

In [None]:
# Get vectors for polymer candidates (configurations are fulldoc vs pipeline, refined vs. unrefined)
def get_ground_truth_candidates_word_vectors_as_features(polymer_candidates,refined):
    """Look up word vectors for ground-truth candidates.

    polymer_candidates: dict whose keys are candidate strings (values are
        not used).
    refined: 0 keeps every candidate; any other value skips candidates for
        which refine_candidate() returns "ignore".

    Returns (Xl, ll): the candidates' entries in the module-level
    word_vectors, and the matching candidate strings, in the same order.
    """
    Xl = []  # vectors
    ll = []  # candidate strings
    for name in polymer_candidates:
        if refined != 0 and refine_candidate(name) == "ignore":
            continue
        ll.append(name)
        Xl.append(word_vectors[name])
    return Xl, ll

In [None]:
def select_ground_truth_candidates_fulldocument_refined(full_document,refined_candidates):
    """Return word vectors and names for one ground-truth candidate set.

    full_document: 1 uses candidates from entire documents; any other value
        uses the fasttext-selected (pipeline) candidates.
    refined_candidates: 0 for unrefined candidates, 1 for refined ones.

    Returns (X, l): the vectors and the matching candidate strings.
    Raises ValueError if refined_candidates is neither 0 nor 1.
    """
    # Checked first: any other value used to fall through both branches and
    # fail at the return with an unbound-variable error.
    if refined_candidates not in (0, 1):
        raise ValueError('refined_candidates must be 0 or 1, got %r' % (refined_candidates,))

    # Use entire document as opposed to fasttext selected sentences
    poly_candidates = get_ground_truth_candidates(1 if full_document == 1 else 0)
    return get_ground_truth_candidates_word_vectors_as_features(poly_candidates, refined_candidates)

In [None]:
# Get external candidates = Get vectors for strings provided in a list that may or may not have a vector in the word2vec model
# Save a file with words that are out of vocabulary 
def get_external_candidates_vectors(input_file):
    """Look up word2vec vectors for the strings listed in a UTF-8 text file.

    Each line of input_file is cleaned with cleanup_token(). Strings found
    in the module-level `vocabulary` are returned with their vectors from
    `model.wv`; the rest are written once each, in order of first
    appearance, to "<input stem>_oovocab.txt".

    Returns (candidate_vectors, candidates) as parallel lists. Repeated
    in-vocabulary strings are kept.
    """
    import io  # function-scope import: encoding-aware text files on Python 2 and 3

    output_file = input_file.split(".txt")[0]+"_oovocab.txt"

    # Set copy of the vocabulary list, built once: O(1) membership per line
    known_words = set(vocabulary)

    candidates = []
    candidate_vectors = []
    oovocab = set()
    # `with` closes both files even if a lookup fails part-way
    with io.open(input_file, encoding='utf-8') as ifl, \
            io.open(output_file, 'w', encoding='utf-8') as ofl:
        for line in ifl:
            candidate = cleanup_token(line.strip())
            if candidate in known_words:
                candidates.append(candidate)
                candidate_vectors.append(model.wv[candidate])
            elif candidate not in oovocab:
                oovocab.add(candidate)
                ofl.write(candidate + u"\n")

    # %-formatting reproduces the old print-statement output on both Python 2 and 3
    print('Number of words out of vocab:  %d' % len(oovocab))    # 1   in FastText 500 candidates
    print('Number of word in vocabulary: %d' % len(candidates))  # 499 in FastText 500 candidates
    return candidate_vectors, candidates

# Extend candidate word vectors with a second list of vectors and strings
# This function extend the current list of candidates with vectors from FastText CG2 word embedding model candidates
def extend_candidates(vectors1, names1, vectors2, names2):
    """Append vectors2 to vectors1 and names2 to names1, in place.

    The first and second lists are modified; nothing is returned.
    """
    for target, extra in ((vectors1, vectors2), (names1, names2)):
        target.extend(extra)

# Strip tokens
def cleanup_token(tkn):
    """Trim whitespace, commas and periods from a token, then unwrap parentheses.

    Surrounding whitespace is removed first, then leading/trailing commas,
    then leading/trailing periods. If what remains starts with '(' and ends
    with ')', every leading '(' and every trailing ')' is removed.
    """
    trimmed = tkn.strip().strip(',').strip('.')
    wrapped = len(trimmed) >= 2 and trimmed.startswith('(') and trimmed.endswith(')')
    if not wrapped:
        return trimmed
    # NOTE(review): this removes runs of parentheses, not just one outer
    # pair, so "(poly(styrene))" becomes "poly(styrene" -- confirm intended.
    return trimmed.rstrip(')').lstrip('(')

In [None]:
# Get predicted polymers from classifying ground truth polymers
def get_ground_truth_predicted_polymers(predicted_labels,names,filename):
    """Write the names predicted as polymers to filename, one per line.

    predicted_labels: sequence of predicted labels; entries equal to 1 are
        treated as polymers.
    names: candidate strings parallel to predicted_labels.
    filename: output path; the file is overwritten and UTF-8 encoded.
    """
    import io  # function-scope import: encoding-aware text file on Python 2 and 3

    # `with` closes the file even if a write fails
    with io.open(filename, 'w', encoding='utf-8') as f:
        for label, name in zip(predicted_labels, names):
            if label == 1:
                f.write(name + u"\n")

In [None]:
# Select unrefined candidates from the pipeline (generated the best classifier according to knn performance )
# fulldoc = 0, refined = 0
# X2 holds the candidates' word vectors and labels2 the matching candidate strings
X2, labels2 = select_ground_truth_candidates_fulldocument_refined(0,0)

In [None]:
# Get vectors for FastText (string) candidates
# It turns out this doesn't help results (likely because vectors aren't computed in the same manner,
# otherwise, gensim would have also found more of these candidates)
# FTX, FTlabels = get_external_candidates_vectors('../candidates/listformat/FT_candidates_500.txt')
#extend_candidates(X2, labels2, FTX, FTlabels)

In [None]:
# Predict a label for every ground-truth pipeline candidate with the best-scoring classifier
predicted_polys2 = max_clf.predict(X2)

In [None]:
# Used below when we extended candidates with FastText candidates
# get_GT_predicted_polymers(predicted_polys2,labels2,"classifierFT_pipeline_unrefined.txt") 
# Write the candidates predicted as polymers (label 1) to a text file, one per line
get_ground_truth_predicted_polymers(predicted_polys2,labels2,"knn_pipeline_unrefined.txt")
# FIXME: Import evaluation script here so that it can be done in this notebook as well
# instead of one notebook that evaluates several candidate files.

In [None]:
# Note that in the paper we used classifier 4 (pipeline refined) because in general the last step
# yielded the best precision and recall (see draft knn classifier notebook)

In [None]:
# Get the ground truth polymer names and try to classify them with the best classifier
# Only names present in the word2vec vocabulary get a vector; the rest are
# written to the *_oovocab.txt file by get_external_candidates_vectors()
GTX, GTlabels = get_external_candidates_vectors('../ground_truth/ground_truth_list_format.txt')
predicted_ground_truth_polys = max_clf.predict(GTX)
# The expected label is 1 for every entry, so only true positives and
# false negatives can be counted here
actual = [1 for count in range(len(predicted_ground_truth_polys))]
metrics(predicted_ground_truth_polys,actual)

Number of words out of vocab:  104
Number of word in vocabulary: 501
    Test points:     501
    True positives:  473
    False positive:  0
    True negatives:  0
    False negatives: 28
    Precision:       1.000
    Recall:          0.944
    F-1 score:       0.971