# Test the KNN classifier by classifying every word in all 100 ground-truth documents (documents from which experts manually extracted polymer names)


In [None]:
# -*- coding: utf-8 -*-
import os
import re
import sys
import csv
import math
import time
import spacy
import sklearn
import pickle
import numpy as np
import pandas as pd
import sqlite3
import scipy as sp
import cPickle as pkl
from __future__ import division
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import gensim, logging
# Configure the root logger to print timestamped records at INFO level and above.
# The original author notes this log shows progress and is very useful
# (presumably the progress messages come from gensim — confirm).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#reload(sys)
#sys.setdefaultencoding("utf-8")

from sklearn import svm
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn import neighbors, datasets
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit

from sklearn.ensemble import RandomForestClassifier

# Load the spaCy English pipeline used for candidate processing
nlp = spacy.load("en_core_web_sm")
# Look these domain terms up in the vocabulary and make sure none of them is
# flagged as a stop word
for domain_term in (u"diblock", u"g/mol", u"kg/mol"):
    nlp.vocab[domain_term].is_stop = False

In [None]:
# Connect to db
def connect_to_db():
    """Open the sentences database and return the connection.

    Returns:
        Whatever ``create_connection`` returns for the hard-coded path
        ``../../db/sentences.db``.
    """
    return create_connection("../../db/sentences.db")

In [None]:
# Connect to DB
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Args:
        db_file: Path of the SQLite database file to open.

    Returns:
        The ``sqlite3.Connection`` on success, or ``None`` if SQLite raised
        an error while connecting (the error is printed).
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # The exception class must be qualified: a bare ``Error`` is not
        # defined in this file, so a failed connect would raise NameError
        # instead of being reported here.
        print(e)

    return None

In [None]:
# Check if a string is a number
def is_number(n):
    """Return True if ``n`` looks like a numeric value, False otherwise.

    After trimming whitespace and any trailing run of the characters
    ``" g"`` and then ``" K"``, the string counts as a number when it:

    * contains any of the known unit strings anywhere inside it,
    * parses with ``float()``,
    * is longer than one character and starts with a sign/comparison symbol, or
    * ends with ``%``.
    """
    # rstrip() removes trailing *characters* from the given set, not a suffix
    n = n.strip().rstrip(" g").rstrip(" K")

    unit_markers = (u'S/cm',u'mC/cm2',u'cm2/C',u'°C',u'°C/min',u'mL',u'mol',u'mmol',u'mg/mL',u"mL",
                    u'eV',u'°',u'g/mol',u'mL/min',u'μL',u'μm',u'cm–1',u'cm–2',u'mol−1',u"MPa1/2",u"cm2", u"V−1", u"s−1",u"MHz")

    # NOTE(review): this is a substring test, so any token merely containing
    # a unit string (e.g. "mol" inside a longer word) is reported as a number
    # -- confirm this is intended.
    text = u'%s' % (n)
    for marker in unit_markers:
        if marker in text:
            return True

    try:
        float(n)
    except ValueError:
        pass
    else:
        return True

    # Not a plain float: accept a leading sign/comparison symbol or a
    # trailing percent sign
    leading_signs = (u"+",u"<",u">",u"∼",u"~",u"%",u"±",u"≥",u"≤",u"≈",u"−",u"-")
    if len(n) > 1 and n[0] in leading_signs:
        return True
    return len(n) > 0 and n[-1] == "%"
    
# Check if a string is a range
def is_range(n):
    """Return True if ``n`` is a dash-separated sequence of numbers.

    The string is trimmed the same way ``is_number`` trims its input, then
    split on the first separator found among en dash, minus sign and hyphen
    (checked in that order). It is a range only if every resulting piece
    passes ``is_number``; a string without any separator is not a range.
    """
    n = n.strip().rstrip(" g").rstrip(" K")
    for dash in (u"–", u"−", u"-"):
        if n.find(dash) != -1:
            bits = n.split(dash)
            break
    else:
        # No separator present at all
        return False
    return all(is_number(bit) for bit in bits)

# Check if the token is made up of numbers separated by punctuation
def is_all_numbers(n):
    """Return True if every piece of ``n`` between separators is a number.

    ``n`` is split on space, ``:``, ``;``, ``-``, ``,`` and ``.``, and each
    piece is checked with ``is_number``. Empty pieces (produced by adjacent
    separators or an empty input) are checked like any other piece.

    Args:
        n: Token text to inspect.

    Returns:
        True if all pieces pass ``is_number``, otherwise False.
    """
    # Raw string: "\." inside a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about. The pattern itself
    # is unchanged.
    bits = re.split(r' |:|;|-|,|;|\.', n)
    return all(is_number(bit) for bit in bits)


# First use vectors as features
def refine_candidate(candidate):
    """Return ``candidate`` unchanged, or the string ``"ignore"`` if it is junk.

    A candidate is rejected when any of the following holds:

    * it is a single character;
    * it is present in the spaCy vocabulary (``nlp.vocab``) and is not one of
      the protected polymer names;
    * it is one of the frequent context words (compared case-insensitively);
    * it is a number, a numeric range, or numbers separated by punctuation;
    * after splitting on space, ``:``, ``;`` and ``-``, any piece other than
      ``"poly"`` and numbers either contains ``"standard"`` or is a
      non-protected vocabulary word (this catches long parenthesised phrases
      that the tokenizer did not break up).

    Args:
        candidate: Token text to evaluate.

    Returns:
        ``candidate`` itself if it survives every filter, else ``"ignore"``.
    """
    # Top context words in DB
    frequent_context_words = ["weight","material","system","chains","samples", "systems","Tg","weights","comparison",
                              "chromatography","Mn","THF","toluene","GPC","chloroform","index","Column","columns",
                              "standards","reference","segments","polydispersity","substrate","block","components",
                              "permeation","component","Mw","bulk","standard","calibration","dynamics","cross-linked",
                              "cells","domains","segment","mixtures","densities","substrates","well-defined","silica",
                              "SEC","particles","compositions","surfaces","linear","blend","blends"]
    # The candidate is lowercased before the lookup, so the list has to be
    # lowercased too; otherwise entries such as "Tg", "Mn", "THF", "GPC",
    # "Column", "Mw" and "SEC" could never match.
    frequent_context_words = set(word.lower() for word in frequent_context_words)

    # Polymers that exist in the English dictionary and should not be removed as unrelated English words
    common_polys = ['polyethylene', 'polyurethane', 'polypropylene', 'polyester', 'PS', 'polystyrene', 'PLA', 'PI',
                    'PET', 'PVP', 'PEG', 'cellulose', 'PAN', 'methyl'] #These are polymers that could appear within spacy vocab
    common_polys = set(polymer.lower() for polymer in common_polys)

    if len(candidate) == 1:
        return "ignore"

    lowered = candidate.lower()

    # Remove words that are in the English vocabulary
    if candidate in nlp.vocab and lowered not in common_polys:
        return "ignore"

    # Remove the most frequent context words found in the annotated data
    if lowered in frequent_context_words:
        return "ignore"

    # Remove numbers
    if is_number(candidate) or is_range(candidate) or is_all_numbers(candidate):
        return "ignore"

    # Remove long sentences within parenthesis
    for item in re.split(' |:|;|-', candidate):
        if item == "poly" or is_number(item):
            continue
        # Pieces are lowercased before the protected-polymer lookup, matching
        # the whole-candidate check above (common_polys holds lowercase names).
        if "standard" in item or (item in nlp.vocab and item.lower() not in common_polys):
            return "ignore"

    return candidate

In [None]:
# Clean up token
def cleanup_token(tkn):
    """Trim punctuation from a raw token.

    Leading/trailing spaces, periods, commas and semicolons are removed. If
    what remains both starts with ``(`` and ends with ``)``, every trailing
    ``)`` and every leading ``(`` is stripped as well.
    """
    trimmed = tkn.strip(' .,.;')
    is_wrapped = len(trimmed) >= 2 and trimmed[0] == '(' and trimmed[-1] == ')'
    if not is_wrapped:
        return trimmed
    return trimmed.rstrip(')').lstrip('(')

 # Filter using annotated data from pipeline or full documents
def get_sentences_from_db(conn, doi):
    """Return every sentence stored for the document ``doi``.

    Args:
        conn: Open database connection; the query reads the ``sentence``
            column of the ``sentences`` table.
        doi: Document identifier matched against the ``docid`` column.

    Returns:
        List with the ``sentence`` value of each matching row, in the order
        the database returns them (empty if nothing matches).
    """
    cur = conn.cursor()
    # Bind the DOI as a query parameter instead of formatting it into the SQL
    # text: a DOI containing a quote no longer breaks the statement, and the
    # value can never be interpreted as SQL or as a column name.
    cur.execute('select sentence from sentences where docid=?', (doi,))
    return [row[0] for row in cur.fetchall()]


def read_documents(conn,ground_truth_dois):
    """Collect and classify candidate tokens for every listed document.

    For each DOI in ``ground_truth_dois`` the document's sentences are read
    from the database, split into tokens and filtered. A token is kept as a
    candidate only if it has not been seen before, is tagged NOUN/PROPN by
    spaCy (when spaCy produced a token with the same text), is in the
    word-vector ``vocabulary``, survives ``refine_candidate`` and is not a
    number once an edge character is dropped. The vectors of the kept tokens
    are passed to ``best_knn_clf.predict``.

    Module-level state used: ``nlp``, ``vocabulary``, ``model`` and
    ``best_knn_clf`` are read; ``seen_words``, ``full_list_of_candidates``
    and ``full_list_of_probabilities`` are extended in place.

    Args:
        conn: Open database connection passed to ``get_sentences_from_db``.
        ground_truth_dois: Path of a text file with one DOI per line.

    Returns:
        Dict mapping each DOI to the list of its candidate tokens. Every
        candidate is included regardless of the predicted label; the labels
        are only recorded in ``full_list_of_probabilities``.
    """
    global full_list_of_candidates
    global seen_words
    global full_list_of_probabilities

    # The context manager closes the DOI list even if reading fails
    with open(ground_truth_dois) as f:
        dois = [line.strip() for line in f]

    # Set copies make the per-token membership tests constant-time instead of
    # a linear scan over the whole vocabulary / history for every token.
    known_vocabulary = set(vocabulary)
    already_seen = set(seen_words)
    already_seen.update(full_list_of_candidates)

    polymers = {}
    count = 0
    for doi in dois:
        if not doi:
            # Blank line in the DOI file: nothing to look up
            continue
        polymers[doi] = []
        words = []
        word_vectors = []
        for sentence in get_sentences_from_db(conn, doi):
            text = sentence.rstrip(".")

            # Map spaCy token text -> part of speech so that only nouns are kept
            check_pos = {}
            for token in nlp(text):
                check_pos[token.text] = token.pos_

            # Split on whitespace, except whitespace inside parentheses
            for raw_token in re.split(r"\s+(?=[^()]*(?:\(|$))", text):
                new_token = cleanup_token(raw_token)
                if new_token == "" or new_token in already_seen:
                    continue

                # Tokens spaCy split differently have no entry and are not
                # filtered by part of speech
                pos = check_pos.get(new_token)
                if pos is not None and pos not in ("NOUN", "PROPN"):
                    continue

                if new_token not in known_vocabulary:
                    continue
                # refine_candidate already rejects plain numbers, ranges and
                # punctuation-separated numbers, so those are not re-checked
                if refine_candidate(new_token) == "ignore":
                    continue
                if _is_number_after_trimming(new_token):
                    continue

                # Remember the token so its vector is only looked up once
                words.append(new_token)
                seen_words.append(new_token)
                already_seen.add(new_token)
                word_vectors.append(model.wv[new_token])

        # A document without candidates yields no vectors; skip the
        # classifier call instead of handing it an empty input
        if word_vectors:
            predicted_labels = best_knn_clf.predict(word_vectors)
        else:
            predicted_labels = []

        doc_labels = []
        for word, label in zip(words, predicted_labels):
            polymers[doi].append(word)
            doc_labels.append(label)
        print("Doi #%s: %s and number of candidates: %s" % (count, doi, len(polymers[doi])))
        count = count + 1

        # Extend full list of candidates by polymers
        full_list_of_candidates.extend(polymers[doi])
        full_list_of_probabilities.extend(doc_labels)

    return polymers


def _is_number_after_trimming(token):
    """Return True if ``token`` is a number once its first and/or last character is dropped."""
    if is_number(token[:-1].strip()) or is_number(token[1:].strip()):
        return True
    return len(token) > 2 and is_number(token[1:-1].strip())

In [None]:
# Keep a list of unique candidates and a list of seen words because words have the same vectors
full_list_of_candidates = []
full_list_of_probabilities = []
seen_words = []
# Load model
# NOTE: unpickling can execute arbitrary code, so only load classifier files
# that come from a trusted source.
# The with-block closes the file handle as soon as the classifier is loaded.
with open('pipeline_refined_model.clf', 'rb') as clf_file:
    best_knn_clf = pickle.load(clf_file) # used for paper
#best_knn_clf = pickle.load(open('best_knn_model.clf')) # Trying this classifier instead (pipeline unrefined =  best fscore)

In [None]:
# Load the pre-trained gensim Word2Vec model from disk
# (the file name suggests a CBOW model -- TODO confirm)
model = gensim.models.Word2Vec.load('../../models/gensim_cbow.bin')

In [None]:
# Get the word vectors of the loaded model
word_vectors = model.wv
# Get the vocabulary as a plain list (presumably one entry per token -- confirm).
# NOTE(review): membership tests against a list are linear in its length;
# a set would be constant-time -- confirm nothing indexes this list first.
vocabulary = list(model.wv.vocab)

In [None]:
# Open the database connection; connect_to_db() takes no arguments and
# returns whatever create_connection() gives back for its hard-coded path
connection = connect_to_db()

In [None]:
# Measure the wall-clock time of the whole run
start_time = time.time()

# Collect and classify the candidates of every ground-truth document
candidates = read_documents(connection, '../../data/ground_truth_dois.txt')

# Despite its name, end_time holds the elapsed time (time.time() difference)
end_time = time.time() - start_time
print("Read all documents in %.2f" % (end_time))

Read all documents in 5761.04

In [None]:
#reload(sys)
#sys.setdefaultencoding('utf8')
#f = open('results.txt','w+')
#for candidate in full_list_of_candidates:
#    line = u'%s\n' % candidate
#    f.write(line)
#f.close()

In [None]:
#f = open('prediction_probabilities.txt','w+')
# Write one "<label>\t<candidate>" line per classified candidate.
# The with-block closes the file even if a write fails part-way, and zip()
# pairs each candidate with its label without manual indexing.
with open('noun_prediction.txt','w+') as f:
    for polymer, label in zip(full_list_of_candidates, full_list_of_probabilities):
        polymer = polymer.encode('utf-8')
        line = '%s\t%s\n' % (label, polymer)
        # For probability output instead of labels, each entry would hold two
        # values and the line format would be '%s\t%s\t%s\n'
        # (prob_no, prob_yes, polymer).
        f.write(line)

In [None]:
def write_results_to_csv(dictionary, csv_basename):
    """Write one CSV row per dictionary entry: the key followed by its values.

    Args:
        dictionary: Mapping from a key (the first cell of the row) to an
            iterable of text values (the remaining cells). Entries with no
            values still produce a row containing only the key.
        csv_basename: Path of the CSV file to create or overwrite.

    Returns:
        None.
    """
    is_py2 = sys.version_info[0] < 3
    # The csv module needs a binary-mode file on Python 2 and a text-mode
    # file opened with newline='' on Python 3; any other mode can corrupt
    # line endings on some platforms.
    if is_py2:
        result_file = open(csv_basename, 'wb')
    else:
        result_file = open(csv_basename, 'w', newline='', encoding='utf-8')

    with result_file:
        wr = csv.writer(result_file, dialect='excel')
        # items() exists on both Python 2 and 3 (iteritems() is Python 2 only)
        for key, values in dictionary.items():
            polymers = [key]
            for value in values:
                # Python 2's csv writer cannot handle non-ASCII unicode, so
                # values are encoded there; on Python 3 the file object does
                # the encoding.
                polymers.append(value.encode('utf-8') if is_py2 else value)
            wr.writerow(polymers)
    return

In [None]:
#write_results_to_csv(candidates,'../candidates/perdocformat/classifier_candidates.csv')