# Model Evaluation

This notebook includes code to evaluate models and conduct error analysis.

If some of the objects don't load properly, it is because they have since been moved into a more organized directory structure (e.g., the raw data lives in `/data`, the pickled error dictionaries live in `/evaluation/errors/dicts`, etc.).

In [5]:
import gensim
import pandas as pd
import codecs
import re
import pickle
import wikipedia as wik

from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from collections import defaultdict

import itertools as it

# Black magic
import spacy 
from spacy.matcher import Matcher 
from spacy.attrs import *
nlp = spacy.load('en')

## Get Sentences from Wikipedia

In [2]:
# Read one term per line from disk, stripping surrounding whitespace.
with open('clear_terms.txt', 'r') as terms_file:
    wikipedia_skills = [raw_line.strip() for raw_line in terms_file]
print(len(wikipedia_skills))

593


In [14]:
%%time 
# Fetch the Wikipedia summary of every term in wikipedia_skills and bucket
# the terms whose lookup fails so they can be reviewed separately.
ambiguous_terms = []   # terms for which wik.summary raised DisambiguationError
clear_terms = {}       # term -> summary text
page_error_terms = []  # terms for which wik.summary raised PageError

for term in wikipedia_skills:
    try:
        # The summary is stored exactly as returned. The earlier
        # split('. ') / '. '.join(...) round trip rebuilt the same string.
        clear_terms[term] = wik.summary(term)
    except wik.exceptions.DisambiguationError:
        ambiguous_terms.append(term)
    except wik.exceptions.PageError:
        page_error_terms.append(term)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


CPU times: user 14.9 s, sys: 1.22 s, total: 16.1 s
Wall time: 4min 43s


In [15]:
# print(clear_terms["machine learning"][:150])
# Report each failure bucket: a heading with its size, then one term per line.
for heading, failed_terms in (("Ambiguous terms:", ambiguous_terms),
                              ("Page errors", page_error_terms)):
    print(heading, len(failed_terms))
    for failed_term in failed_terms:
        print("\t", failed_term)

Ambiguous terms: 3
	 ai
	 rdf
	 wix
Page errors 1
	 pypi


In [16]:
# Number of terms for which a summary was stored in clear_terms.
print(len(clear_terms.keys()))

589


In [17]:
# Cache the term -> summary mapping on disk so the slow fetch loop
# does not have to be repeated.
with open('clear_terms_dict.pkl', 'wb') as pickle_out:
    pickle.dump(clear_terms, pickle_out)

In [3]:
# Read the cached term -> summary mapping back in.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('clear_terms_dict.pkl', 'rb') as pickle_in:
    clear_terms = pickle.load(pickle_in)

In [4]:
# Explore some sentences: show the first four (term, summary) pairs.
for term_name, summary_text in it.islice(clear_terms.items(), 4):
    print('*'*10, term_name, '\t', '*'*30)
    print(summary_text, '\n\n')

********** .net framework 	 ******************************
.NET Framework (pronounced dot net) is a software framework developed by Microsoft that runs primarily on Microsoft Windows. It includes a large class library named Framework Class Library (FCL) and provides language interoperability (each language can use code written in other languages) across several programming languages. Programs written for .NET Framework execute in a software environment (in contrast to a hardware environment) named Common Language Runtime (CLR), an application virtual machine that provides services such as security, memory management, and exception handling. (As such, computer code written using .NET Framework is called "managed code".) FCL and CLR together constitute .NET Framework.
FCL provides user interface, data access, database connectivity, cryptography, web application development, numeric algorithms, and network communications. Programmers produce software by combining their source code with .N

## Replace skill bigrams with properly underscored skill bigrams

This is necessary for the model to correctly understand terms like "machine learning." For bigram skills, I replace spaces separating words with underscores.

In [5]:
import re 
import pickle

def multiple_replace(dict, text):
    """
    Lower-case ``text`` and replace every occurrence of a key of ``dict``
    with its mapped value, in a single pass over the text.

    dict: mapping of search string -> replacement string. Keys are matched
          literally (regex metacharacters are escaped) against the
          lower-cased text, so a key containing upper-case letters never
          matches. (The parameter name shadows the builtin; it is kept so
          existing callers are unaffected.)
    text: the string to process.

    Returns the lower-cased text with all replacements applied. An empty
    mapping returns the lower-cased text unchanged.
    """
    text = text.lower()
    # An empty mapping would compile to "()", which matches the empty string
    # at every position and then fails the lookup with KeyError.
    if not dict:
        return text
    # Try longer keys first: regex alternation takes the first alternative
    # that matches, so a short key must not shadow a longer key it prefixes.
    keys = sorted(dict.keys(), key=len, reverse=True)
    # Create a regular expression from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, keys)))
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: dict[mo.group(0)], text)

# print(multiple_replace(dict, text))

In [6]:
# Map every skill line to the same text with spaces turned into underscores
# (single-word skills therefore map to themselves).
with open('skills_original.txt', 'r') as skills_file:
    bigram_dict = {
        phrase: phrase.replace(' ', '_')
        for phrase in (raw_line.strip() for raw_line in skills_file)
    }

In [7]:
# Smoke test for multiple_replace: mixed-case text with some multi-word
# phrases; the result is printed lower-cased with the known phrases underscored.
input_text = '''
I like to do lots of Machine learning and data science.
I enjoy training neural networks and neural nets.
I fill my code repository with sandbags.
I like creative cloud, creative suite, destructive testing, and go lang.'''

print(multiple_replace(bigram_dict, input_text))


i like to do lots of machine_learning and data_science.
i enjoy training neural_networks and neural_nets.
i fill my code repository with sandbags.
i like creative cloud, creative suite, destructive_testing, and go_lang.


In [8]:
# Split into sentences that contain the skills in which we're interested
import pickle
import random
# Load up skills dictionary
with open('skill_dict.pkl', 'rb') as f:
    skill_dict = pickle.load(f)
    
skills_list = []
# Load list of skills to match
with open('skills_original.txt', 'r') as infile:
    for line in infile:
        line = line.strip()
        if line == '.net':
            continue # added 20APR to avoid confusion with .net items
        if line == 'vb.net':
            continue
        if line == 'ado.net':
            continue
        skills_list.append(line)

In [9]:
# Number of skills kept after the .net-family entries were excluded.
print(len(skills_list))

915


In [10]:
%%time
# Keep every summary sentence in which at least one skill appears as a
# whitespace-delimited token.
# NOTE: because sentences are compared token by token, a skill that contains
# a space can never match here.
correct_sentences = []
# Set membership replaces the per-skill scan of each sentence's token list.
skill_tokens = set(skills_list)
for chunk in clear_terms.values():
    chunk = chunk.replace('.\n', '. ').lower().strip()
    for s in chunk.split('. '):
        # Tokenise once per sentence instead of once per (sentence, skill) pair.
        if skill_tokens.isdisjoint(s.split()):
            continue
        # Strip .net-family tokens (20APR18). The chain, including the
        # repeated 'asp.net' step, is kept as written so output is unchanged.
        correct_sentences.append(s.replace('asp.net','')
                                  .replace(' ado.net','').replace('asp.net','')
                                  .replace('.net',''))
print("Number of sentences that contain hard skills:", len(correct_sentences))
print("\n\nSamples:\n",correct_sentences[145:149])

Number of sentences that contain hard skills: 2182


Samples:
 ['bootstrap aggregating, also called bagging, is a machine learning ensemble meta-algorithm designed to improve the stability and accuracy of machine learning algorithms used in statistical classification and regression', 'bagging is a special case of the model averaging approach.', 'balsamiq studios is an isv founded in march 2008 by peldi guilizzoni, a former adobe senior software engineer', 'the web-based balsamiq mockup tool was launched in june 2008']
CPU times: user 3.36 s, sys: 4.69 ms, total: 3.37 s
Wall time: 3.37 s


In [11]:
# Drop duplicate sentences while keeping first-seen order.
# list(set(...)) returned the sentences in an order that can differ between
# kernel sessions, which made sentence indices and the pickled lists
# irreproducible. dict.fromkeys keeps insertion order (Python 3.7+).
correct_sentences = list(dict.fromkeys(correct_sentences))
len(correct_sentences)

2114

## Format sentences and return proper bigrams

In [86]:
def punct_space(token):
    """
    Tell whether a token should be discarded: truthy when the token is
    punctuation, whitespace, or (since 12APR) a digit token.
    """
    discard = token.is_punct
    if not discard:
        discard = token.is_space
    if not discard:
        discard = token.is_digit
    return discard

def format_text(text, bigram_dict):
    """
    Underscore the known skill phrases in ``text`` (via multiple_replace,
    which also lower-cases it), run the result through the spaCy pipeline,
    and return the lemmas of all tokens not rejected by punct_space,
    joined by single spaces.
    """
    underscored = multiple_replace(bigram_dict, text)
    kept_lemmas = []
    for token in nlp(underscored):
        if punct_space(token):
            continue
        kept_lemmas.append(token.lemma_)
    return u' '.join(kept_lemmas)

def lemmatize_text(text):
    """
    Run ``text`` through the spaCy pipeline and return the lemmas of all
    tokens not rejected by punct_space, joined by single spaces.
    Unlike format_text, no phrase underscoring or lower-casing is applied.
    """
    lemmas = (token.lemma_ for token in nlp(text) if not punct_space(token))
    return u' '.join(lemmas)

# Necessary step to make the sentences exactly the same
def tokenize_text(text):
    """
    Reduce ``text`` to its word tokens joined by single spaces.

    A token is a maximal run of word characters (letters, digits and
    underscores), so punctuation is dropped and e.g. "3.0" becomes "3 0",
    while underscored phrases such as "programming_language" stay intact.
    """
    # re.findall with this pattern yields the same tokens as nltk's
    # RegexpTokenizer(r'\w+'), but does not depend on the nltk import that
    # only happens in a later cell (NameError on Restart & Run All).
    return ' '.join(re.findall(r'\w+', text))

# Demo: the same sentence with and without phrase underscoring before lemmatisation.
sent = 'i and you are machine learning and neural $ network users with 501(c)3 .net skills.'

print(format_text(sent, bigram_dict))
print(lemmatize_text(sent))


i and -PRON- be machine_learn and neural $ network user with 501(c)3 .net skill
i and -PRON- be machine learning and neural $ network user with 501(c)3 .net skill


In [54]:
# Demo: tokenize_text drops punctuation, so "Correct:" loses its colon and "3.0" becomes "3 0".
sample = '''Correct: as of android studio 3.0 october kotlin be a fully support programming_language by google on the android operating_system and be directly include in the android studio 3.0 ide package as an alternative to the standard java compiler'''

print(sample)
print(tokenize_text(sample))

Correct: as of android studio 3.0 october kotlin be a fully support programming_language by google on the android operating_system and be directly include in the android studio 3.0 ide package as an alternative to the standard java compiler
Correct as of android studio 3 0 october kotlin be a fully support programming_language by google on the android operating_system and be directly include in the android studio 3 0 ide package as an alternative to the standard java compiler


In [81]:
from nltk import RegexpTokenizer
def corrupt_hard_skills(text):
    """
    Return ``text`` with every detected hard skill swapped for a randomly
    chosen entry of ``skills_list``.

    Skills are detected by a spaCy ``Matcher`` loaded with the patterns in
    ``skill_dict``. The text is then lower-cased and phrase-underscored via
    ``multiple_replace(bigram_dict, ...)`` and split into word tokens; every
    token equal to a detected skill is replaced by exactly one random skill.
    The tokens are returned joined by single spaces, so punctuation is lost.

    Replacements are drawn with the ``random`` module and are not seeded
    here; a drawn replacement may coincide with the original skill.
    """
    # Construct matcher object
    matcher = Matcher(nlp.vocab)
    doc = nlp(text)
    for label, pattern in skill_dict.items():
        matcher.add(label, None, pattern)

    # Compare input to pre-defined skill patterns.
    # Each match is a tuple of (id, startpos, endpos).
    user_skills = [str(doc[match[1]:match[2]]).lower() for match in matcher(doc)]

    # Draw one random replacement per detected match.
    random_hard_skills = [random.choice(skills_list) for _ in user_skills]

    # Make sure everything that needs to be a bigram is a bigram
    bigram_input = multiple_replace(bigram_dict, text)
    user_skills = [multiple_replace(bigram_dict, item) for item in user_skills]
    random_hard_skills = [multiple_replace(bigram_dict, item) for item in random_hard_skills]

    tokenizer = RegexpTokenizer(r'\w+')
    bigram_input_list = tokenizer.tokenize(bigram_input)

    # Give each distinct skill a single replacement (the first one drawn for
    # it). Previously a skill matched k times was replaced by k tokens at
    # every occurrence, which inflated the corrupted sentence.
    replacement_for = {}
    for skill, replacement in zip(user_skills, random_hard_skills):
        replacement_for.setdefault(skill, replacement)

    return ' '.join(replacement_for.get(word, word) for word in bigram_input_list)


# Demo inputs for corrupt_hard_skills; only `sample` is used by the active print below.
input_str = 'i like to do machine learning, c, r, .net and neural network.'
sample = '''as of android studio 3.0 $20 october kotlin be a fully support programming_language by google on the android operating_system and be directly include in the android studio 3.0 ide package as an alternative to the standard java compiler
'''

# print(format_text(corrupt_hard_skills(input_str), bigram_dict))
print("\nRESULTS:",corrupt_hard_skills(sample))


RESULTS: as of android studio 3 0 20 october kubernetes be a fully support programming_language by google on the android operating_system and be directly include in the android studio 3 0 ide package as an alternative to the standard elastic compiler


In [101]:
## Proper processing for correct and incorrect strings
# Both variants go through format_text so they differ only in the swapped skills.
print(format_text(tokenize_text(sample), bigram_dict))
print('\n')
print(format_text(corrupt_hard_skills(sample), bigram_dict))

as of android studio october kotlin be a fully support programming_language by google on the android operating_system and be directly include in the android studio ide package as an alternative to the standard java compiler


as of android studio october statistical_package be a fully support programming_language by google on the android operating_system and be directly include in the android studio ide package as an alternative to the standard hypervisor compiler


In [98]:
%%time
# Tokenise, underscore and lemmatise every clean sentence.
correct_sentences_formatted = []
for sentence in correct_sentences:
    correct_sentences_formatted.append(format_text(tokenize_text(sentence), bigram_dict))

CPU times: user 30 s, sys: 1.6 s, total: 31.6 s
Wall time: 23.8 s


In [99]:
%%time 

# Corrupt the skills of every clean sentence, then format the result the
# same way as the clean sentences. Index i pairs with correct_sentences[i].
incorrect_sentences_formatted = [
    format_text(corrupt_hard_skills(sentence), bigram_dict)
    for sentence in correct_sentences
]

CPU times: user 1min 18s, sys: 3.22 s, total: 1min 21s
Wall time: 1min 4s


In [102]:
# The two lists are compared index by index later, so their lengths should agree.
print(len(incorrect_sentences_formatted))
print(len(correct_sentences_formatted))

2114
2114


In [103]:
# Sanity check: show any formatted sentence pair that still contains '.net'.
for index, item in enumerate(correct_sentences_formatted):
    if '.net' not in item:
        continue
    print(item)
    print(incorrect_sentences_formatted[index])
    print('\n\n')

In [19]:
# Eyeball the first 50 clean/corrupted pairs (raises IndexError if fewer exist).
for i in range(50):
    print("Clean:", correct_sentences_formatted[i])
    print("Corrupted:", incorrect_sentences_formatted[i])
    print('\n\n')

In [106]:
# Write correct and incorrect sentences to disk, to save time later
for pkl_path, sentences in (('correct_2114_sentences_list.pkl', correct_sentences_formatted),
                            ('incorrect_2114_sentences_list.pkl', incorrect_sentences_formatted)):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(sentences, pkl_file)

In [3]:
# Load the formatted correct and incorrect sentence lists back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('correct_2114_sentences_list.pkl', 'rb') as correct_file:
    correct_sentences_formatted = pickle.load(correct_file)
with open('incorrect_2114_sentences_list.pkl', 'rb') as incorrect_file:
    incorrect_sentences_formatted = pickle.load(incorrect_file)

## Test probabilities of sentence pairs

In [9]:
# Baseline model used for the single-model checks below.
word2vec_filepath = 'models/word2vec_hs1_neg0'
# load the finished model from disk
skill2vec = Word2Vec.load(word2vec_filepath)
# NOTE(review): init_sims() is deprecated in newer gensim releases - confirm against the pinned version.
skill2vec.init_sims()
# NOTE(review): train_count is printed as "training epochs" - confirm it counts epochs rather than train() calls.
print(u'Model loaded with {} training epochs.'.format(skill2vec.train_count))

Model loaded with 2 training epochs.


In [11]:
def test_models(skill2vec_model, correct_sentence_list, incorrect_sentence_list, num_total_sentences=8633141):
    '''
    Ingest a list of (properly formatted) correct sentences and incorrect sentences.
    For each list, score the negative log likelihood of each sentence.
    Compare the averages, and output the correct average minus the incorrect average.
    Higher results indicate a better model.
    '''
    correct_score = 0.0
    for sentence in correct_sentence_list:
        correct_score += -1*(skill2vec_model.score([sentence.split()], total_sentences=num_total_sentences))
    correct_avg = correct_score/len(correct_sentence_list)
    
    incorrect_score = 0.0
    for sentence in incorrect_sentence_list:
        incorrect_score += -1*(skill2vec_model.score([sentence.split()], total_sentences=num_total_sentences))
    incorrect_avg = incorrect_score/len(incorrect_sentence_list)
    return incorrect_avg[0] - correct_avg[0]

In [24]:
# Error analysis
# Single hand-picked pair. A negative result means the corrupted sentence got
# the lower negative log likelihood, i.e. the model preferred it.
sample_good = ['kubernete commonly stylize as k8s be an open source system for automate deployment scaling and management of containerized application that be originally design by google and now maintain by the cloud native computing foundation']
sample_bad = ['google_compute_engine stylize as k8s be an open source system for automate deployment scaling and management of ramdajs application that be originally design by google and now maintain by the cloud native computing foundation']
test_models(skill2vec, sample_good, sample_bad)

-329.62134

In [146]:
# %%time
# print(test_models(skill2vec, correct_sentences_formatted, incorrect_sentences_formatted))

## Run evaluation on all trained models

In [116]:
# Split into full text models and JD-only models.
# (Only the full-text model names are listed in this cell.)

directory = 'models/'

# File names of the saved models, relative to `directory`.
full_text_models = [
    'word2vec_hs1_neg0',
    'word2vec_hs1_neg0_cbow',
    'word2vec_hs1_neg0_size50_cbow',
    'word2vec_hs1_neg0_size200',
    'word2vec_hs1_neg0_window_8',
    'word2vec_size_400_hs1_neg0',
    'word2vec_size_600_hs1_neg0',
    'word2vec_hs1_neg0_window_10_size300',
    'word2vec_hs1_neg0_window_12_size600',
    'word2vec_hs1_neg0_window_15_size300',
]

In [117]:
%%time

model_performance = {}

# Full text models have 8633141 (the default) sentences
for model in full_text_models:
    skill2vec_model = Word2Vec.load(directory + model)
    model_performance[model] = test_models(skill2vec_model,
                                           correct_sentences_formatted,
                                           incorrect_sentences_formatted)

# Write one "<model>\t<score>" line per model, highest score first.
ranked_models = sorted(model_performance.items(), key=lambda pair: pair[1], reverse=True)
with open('errors/performance.txt', 'w') as outfile:
    for name, score in ranked_models:
        outfile.write(str(name) + '\t' + str(score) + '\n')

CPU times: user 4min 11s, sys: 3min 4s, total: 7min 16s
Wall time: 6min 50s


## Keep track of the skills associated with failing comparisons

In [118]:
%%time

# Lemmatised form of every underscored skill. These are combined with the
# raw underscored skills into exhaustive_skills in the next cell.
post_processing_skills = [lemmatize_text(val) for val in bigram_dict.values()]

CPU times: user 6.57 s, sys: 397 ms, total: 6.96 s
Wall time: 5.76 s


In [119]:
# Underscored skill names plus their lemmatised variants: the vocabulary
# that find_hard_skills checks tokens against.
skill_underscores = []
for item in skills_list:
    skill_underscores.append(item.replace(' ', '_'))
exhaustive_skills = skill_underscores + post_processing_skills

In [120]:
def find_hard_skills(text):
    '''
    Return, in order of appearance, every whitespace-separated token of
    ``text`` that is present in the module-level ``exhaustive_skills`` list.
    Repeated tokens are returned once per occurrence.
    '''
    return [word for word in text.split() if word in exhaustive_skills]

In [121]:
def analyze_sentence_fails(skill2vec_model, correct_sentence_list,
                           incorrect_sentence_list, num_total_sentences=8633141):
    '''
    Collect the sentence pairs the model gets wrong.

    Sentences are paired by index. Each one is scored on its own and the
    score negated (negative log likelihood). A pair is a mistake when the
    incorrect sentence's value is strictly lower than the correct one's.

    Returns a 4-tuple:
        mistakes_dict      - {index: (skills found in the correct sentence,
                                      skills found in the incorrect sentence)}
        num_mistakes       - number of mistaken pairs
        correct_mistakes   - the correct sentences of those pairs
        incorrect_mistakes - the incorrect sentences of those pairs
    '''
    def negated_score(sentence):
        return -1*(skill2vec_model.score([sentence.split()],
                                         total_sentences=num_total_sentences))

    mistakes_dict = {}
    correct_mistakes = []
    incorrect_mistakes = []

    for index, correct_sentence in enumerate(correct_sentence_list):
        incorrect_sentence = incorrect_sentence_list[index]
        correct_score = negated_score(correct_sentence)
        incorrect_score = negated_score(incorrect_sentence)

        if incorrect_score < correct_score:
            mistakes_dict[index] = (find_hard_skills(correct_sentence),
                                    find_hard_skills(incorrect_sentence))
            correct_mistakes.append(correct_sentence)
            incorrect_mistakes.append(incorrect_sentence)

    # Indices are unique, so the dict size equals the number of mistakes.
    num_mistakes = len(mistakes_dict)
    return (mistakes_dict, num_mistakes, correct_mistakes, incorrect_mistakes)

In [122]:
%%time

# For every model: find the mistaken sentence pairs, pickle the raw mistake
# dictionary, and write a frequency table of the confused skill pairs.
for model in full_text_models:
    skill2vec_model = Word2Vec.load('models/' + model)

    (mistakes_dict, num_mistakes,
     correct_mistakes, incorrect_mistakes) = analyze_sentence_fails(
        skill2vec_model, correct_sentences_formatted, incorrect_sentences_formatted)

    with open('errors/dicts/' + model + '_dict.pkl', 'wb') as dict_file:
        pickle.dump(mistakes_dict, dict_file)

    # Write confused items to file
    pairs_counter = defaultdict(int)
    for skill_pair in mistakes_dict.values():
        pairs_counter[str(skill_pair)] += 1
    # Stop auto-creating entries on lookup from here on.
    pairs_counter.default_factory = None

    ranked_pairs = sorted(pairs_counter.items(), key=lambda pair: pair[1], reverse=True)
    with open('errors/confusion/' + model + '.txt', 'w') as outfile:
        for pair_repr, count in ranked_pairs:
            outfile.write(pair_repr + '\t' + str(count) + '\n')

    print(model, len(mistakes_dict.items()))

word2vec_hs1_neg0 468
word2vec_hs1_neg0_cbow 361
word2vec_hs1_neg0_size50_cbow 398
word2vec_hs1_neg0_size200 443
word2vec_hs1_neg0_window_8 440
word2vec_size_400_hs1_neg0 416
word2vec_size_600_hs1_neg0 411
word2vec_hs1_neg0_window_10_size300 438
word2vec_hs1_neg0_window_12_size600 425
word2vec_hs1_neg0_window_15_size300 439
CPU times: user 4min 15s, sys: 2min 59s, total: 7min 14s
Wall time: 6min 48s


In [123]:
# idx = -1
# for k, v in mistakes_dict.items():
    
#     idx += 1
#     if len(v[0]) != len(v[1]):
#         print(idx, '*'*40)
#         print(k, v)
#         print("Correct:",correct_mistakes[idx])
#         print("Incorrect:", incorrect_mistakes[idx])
#         print('\n\n')

## Create a confusion matrix 

### (or, at least, a defaultdict counter object that allows us to analyze confused pairs)

In [124]:
# Count how often each (correct skills, incorrect skills) pair occurs in
# mistakes_dict, keyed by the pair's string representation.
pairs_counter = defaultdict(int)
for skill_pair in mistakes_dict.values():
    pair_repr = str(skill_pair)
    pairs_counter[pair_repr] = pairs_counter[pair_repr] + 1

# Stop auto-creating entries on lookup from here on.
pairs_counter.default_factory = None

In [125]:
# Number of distinct confused skill pairs.
print(len(pairs_counter.items()))

In [126]:
# Show the results: the 100 most frequent confused pairs, most frequent first.

s = sorted(pairs_counter.items(), key=lambda pair: pair[1], reverse=True)

for pair_repr, count in s[:100]:
    print(pair_repr + '\t' + str(count))

In [127]:
# Write most commonly mistaken words to file
# (one "<pair>\t<count>" record per line, most frequent first, top 100).
model_name = 'word2vec_hs1_neg0_window_15_size300'
path = 'errors/' + model_name + '.txt'

s = [(k, pairs_counter[k]) for k in sorted(pairs_counter, key=pairs_counter.get, reverse=True)]

with open(path,'w') as outfile:
    for k, v in s[:100]:
        # End each record with a newline. Without it all 100 records ran
        # together on a single line.
        output = k + '\t' + str(v) + '\n'
        outfile.write(output)

### Construct counter for most often confused terms (not pairs)

In [18]:
# Tally how often each line of the error file occurs
# (presumably one skill per line - verify against the file).
skills_counter = defaultdict(int)

# Read in an error dict
with open('errors/dicts/baseline_skill_texts.txt', 'r') as error_file:
    for raw_line in error_file:
        skills_counter[raw_line.strip()] += 1

# Stop auto-creating entries on lookup from here on.
skills_counter.default_factory = None

# print(skills_counter['net'])

# Sort by frequency, most frequent first
s = sorted(skills_counter.items(), key=lambda pair: pair[1], reverse=True)
for skill, count in s[:20]:
    print(skill, count)

model_name = 'word2vec_hs1_neg0'
path = 'errors/dicts/' + model_name + '_skills_errors_frequency.txt'

with open(path,'w') as outfile:
    outfile.writelines(skill + '\t' + str(count) + '\n' for skill, count in s)


c 16
programming_language 13
sql 11
stress_load 9
gnu 9
linux 8
java 7
silverlight 7
cobol 7
operating_system 7
powershell 7
statistic 7
regression 7
datamining 7
dax 6
yardis 6
btrieve 6
apache 6
wxwidget 6
citrix 6


### Examine magnitude of mistakes

In [128]:
%%time

import numpy as np

def top_magnitude_mistakes(skill2vec_model,
                           correct_sentence_list,
                           incorrect_sentence_list,
                           num_total_sentences=8633141):
    '''
    Score every sentence of both lists individually.

    Returns (correct_scores, incorrect_scores): two lists holding, in input
    order, the negated model score of each sentence. Callers can subtract
    them pairwise to find the highest magnitude mistakes, as magnitude
    seems to play more of a role than quantity in the scoring scheme.
    '''
    def negated_scores(sentence_list):
        return [
            -1*(skill2vec_model.score([sentence.split()], total_sentences=num_total_sentences))[0]
            for sentence in sentence_list
        ]

    correct_scores = negated_scores(correct_sentence_list)
    incorrect_scores = negated_scores(incorrect_sentence_list)
    return correct_scores, incorrect_scores
    
# Scores both lists with whichever model skill2vec_model currently refers to.
cor_top, inc_top = top_magnitude_mistakes(skill2vec_model, correct_sentences_formatted, incorrect_sentences_formatted)


CPU times: user 27.8 s, sys: 19.1 s, total: 46.9 s
Wall time: 43.2 s


In [129]:
# Per-sentence gap: incorrect-sentence value minus correct-sentence value.
diffs = [inc_top[i] - cor_top[i] for i in range(len(cor_top))]

# argsort is ascending, so the most negative gaps come first.
top_diffs = np.argsort(diffs)

for i in top_diffs[:10]:
    for label, shown in (("Difference:", diffs[i]),
                         ("Correct:", correct_sentences_formatted[i]),
                         ("Incorrect:", incorrect_sentences_formatted[i])):
        print(label, shown)
    print('\n\n')

In [130]:
# # Write most commonly mistaken words to file
# (one "<pair>\t<count>" record per line, most frequent first, top 100).
model_name = 'word2vec_hs1_neg0_window_15_size300'
path = 'errors/' + model_name + '.txt'

s = [(k, pairs_counter[k]) for k in sorted(pairs_counter, key=pairs_counter.get, reverse=True)]

with open(path,'w') as outfile:
    for k, v in s[:100]:
        # End each record with a newline. Without it all 100 records ran
        # together on a single line.
        output = k + '\t' + str(v) + '\n'
        outfile.write(output)

# argsort is ascending, so the most negative gaps come first.
top_diffs = np.argsort(diffs)

for i in top_diffs[:10]:
    print("Difference:", diffs[i])
    print("Correct:", correct_sentences_formatted[i])
    print("Incorrect:", incorrect_sentences_formatted[i])
    print('\n\n')

In [131]:
%%time

# For every model, write the 50 sentence pairs with the lowest
# (incorrect minus correct) gap to errors/big_error_sents/<model>.txt.
for model in full_text_models:
    skill2vec_model = Word2Vec.load('models/' + model)

    cor_top, inc_top = top_magnitude_mistakes(skill2vec_model,
                                              correct_sentences_formatted,
                                              incorrect_sentences_formatted)

    diffs = [inc_top[i] - cor_top[i] for i in range(len(cor_top))]
    # argsort is ascending, so the most negative gaps come first.
    top_diffs = np.argsort(diffs)

    with open('errors/big_error_sents/' + model + '.txt','w') as outfile:
        for rank, i in enumerate(top_diffs[:50]):
            outfile.write("Error " + str(rank) + '. ' + str(diffs[i]) + '\n')
            outfile.write("Correct: " + correct_sentences_formatted[i] + '\n')
            outfile.write("Incorrect: " + incorrect_sentences_formatted[i] + '\n')
            outfile.write('\n')

CPU times: user 4min 15s, sys: 3min 7s, total: 7min 23s
Wall time: 6min 56s
