In [1]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
from core import semcor_bert_pipeline
from core.metrics import *

In [2]:
# Per-word results: the sense pair, the fitted model data, the nonzero weights
# and the k-fold cross-validation scores.
shared_metrics = {}
shared_words = ['foot.n', 'table.n', 'plane.n', 'right.n', 'model.n', 'degree.n']
fmt_sense_12 = lambda w: [w + '.01', w + '.02']

# Every word defaults to its '.01' / '.02' senses; the two words below use a
# sense pair in which one label has a different lemma, so they are overridden.
sense_overrides = {
    'plane.n': ['airplane.n.01', 'plane.n.02'],
    'degree.n': ['degree.n.01', 'academic_degree.n.01'],
}
shared_senses = [sense_overrides.get(word, fmt_sense_12(word)) for word in shared_words]

for word, senses in zip(shared_words, shared_senses):
    # NOTE(review): binary_logistic / nonzero_weights / k_fold_cv come from
    # core.metrics; only the keys read here ('model', 'data',
    # 'transformed_labels') are known from this notebook.
    model_data = binary_logistic(word, senses)
    weight_values, weight_indices = nonzero_weights(model_data['model'])
    f_scores, accuracies, wrong_indices = k_fold_cv(
        model_data['data'], model_data['transformed_labels']
    )
    shared_metrics[word] = {
        'senses': senses,
        'data': model_data,
        'weights': weight_values,
        'weight_indices': weight_indices,
        'f1_kfold': f_scores,
        'acc_kfold': accuracies,
        'incorrect_indices': wrong_indices,
    }
    

  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)


In [12]:
# Denominator for the fraction of weights that are nonzero.
# NOTE(review): 768 presumably equals the number of input features the
# classifiers were trained on — confirm against core.metrics.
N_FEATURES = 768

# One summary row per word, plus the misclassified sentences for each word.
shared_stats = []
incorrect = []
for word_type, word_results in shared_metrics.items():
    weights = np.asarray(word_results['weights'])
    incorrect.append(
        misclassified_sentences(word_results['data'], word_results['incorrect_indices'])
    )
    # A word whose weights were all zeroed out would make max()/min() raise;
    # report NaN for the weight statistics in that case instead of crashing.
    has_weights = weights.size > 0
    shared_stats.append({
        'word_type': word_type,
        'avg_f1': np.mean(word_results['f1_kfold']),
        'avg_acc': np.mean(word_results['acc_kfold']),
        'pct_nonzero_weights': len(weights) / N_FEATURES,
        'max_wt': weights.max() if has_weights else np.nan,
        'min_wt': weights.min() if has_weights else np.nan,
        'mean_nonzero_wt': weights.mean() if has_weights else np.nan,
        'sd_nonzero_wt': weights.std() if has_weights else np.nan,
    })
pd.DataFrame(shared_stats)

Unnamed: 0,word_type,avg_f1,avg_acc,pct_nonzero_weights,max_wt,min_wt,mean_nonzero_wt,sd_nonzero_wt
0,foot.n,0.6,1.0,0.027344,0.299559,-0.370411,5.8e-05,0.162304
1,table.n,0.4,1.0,0.023438,0.235771,-0.217074,0.026023,0.135543
2,plane.n,0.353846,0.925,0.019531,0.455338,-0.292372,0.029935,0.190997
3,right.n,0.31,0.9,0.013021,0.183955,-0.701413,-0.048553,0.239379
4,model.n,0.35,0.833333,0.023438,0.392956,-0.228719,0.026879,0.157684
5,degree.n,0.8,1.0,0.016927,0.310806,-0.240092,-0.022596,0.146756


Misclassified sentences (binary classification only; 3 of the 16 instances of the mathematical sense of "plane" were misclassified)

In [15]:
# Raise the per-column character limit so the long sentences below are shown
# in full rather than truncated.
pd.options.display.max_colwidth = 500
pd.concat(incorrect)

Unnamed: 0,true_label,sentences
0,plane.n.02,"From the brightness of the F component of the solar corona and the brightness of the zodiacal light , an estimate of the particle sizes , concentrations , and spatial distribution can be derived for regions of space near the ecliptic plane ."
1,plane.n.02,We will refer to the plane of C and **f as the C-plane and to the plane of the graph as the f-plane .
2,plane.n.02,The roots of this equation are just the ordinates of the intersections of the graph of b with a straight line of unit slope through **f in the b-plane ( the plane of the graph of b ) .
0,right.n.02,At right is a casual style in a crushed unlined white leather .
1,right.n.02,"Only too often , however , you have the feeling that you are sitting in a room with some of the instruments lined up on one wall to your left and others facing them on the wall to your right ."
2,right.n.02,"With the first reports , Russell 's horse wheeled to the right and ran towards the buildings while Cook , followed by a hail of bullets , raced towards the arroyo of Salyer 's Canyon immediately in front of him , just reaching it as his horse fell ."
3,right.n.02,On their right rose the embankment covered with brush and trees .
4,right.n.02,"With the first reports , Russell 's horse wheeled to the right and ran towards the buildings while Cook , followed by a hail of bullets , raced towards the arroyo of Salyer 's Canyon immediately in front of him , just reaching it as his horse fell ."
0,model.n.01,The Glazer-Fine Arts edition ( Concert-Disc ) is a model of lucidity and organization .
1,model.n.01,"We shall not be able entirely to pass over these connections to the East as we consider Ripe Geometric pottery , the epic and the myth , and the religious evolution of early Greece ; the important point , however , is that these magnificent achievements , unlike those of later decades , were only incidentally influenced by Oriental models ."


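Listing the indices of the nonzero weights for each word may be useful for seeing whether the classifiers for different words rely on weights at the same positions (for example, index 308 appears for five of the six words below).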

In [212]:
# Pair each word (in shared_words order) with the stored indices of its nonzero weights.
[(word, shared_metrics[word]['weight_indices']) for word in shared_words]

[('foot.n',
  array([ 13,  29,  70,  89, 141, 158, 191, 226, 231, 287, 304, 308, 493,
         518, 532, 547, 637, 664, 693, 709, 730])),
 ('table.n',
  array([  9,  37,  49, 136, 143, 286, 308, 317, 332, 334, 393, 398, 445,
         471, 609, 619, 666, 680])),
 ('plane.n',
  array([105, 157, 254, 286, 308, 409, 411, 414, 448, 480, 513, 619, 637,
         685, 695])),
 ('right.n', array([ 58,  62,  65, 135, 184, 308, 338, 514, 606, 747])),
 ('model.n',
  array([ 74, 121, 161, 282, 308, 340, 364, 423, 434, 450, 452, 473, 525,
         552, 586, 619, 739])),
 ('degree.n',
  array([ 15,  22, 220, 262, 328, 350, 432, 523, 541, 544, 565, 620, 724]))]