# This notebook compares word complexity scores assigned by Amazon Mechanical Turk annotators (Kriz dataset) to actual substitutions made by Newsela specialists when they adapted articles to lower reading levels
# Imports, Constants

In [1]:
from glob import glob
from tqdm import tqdm
import numpy as np
import re
import sys
from nltk.corpus import stopwords
from aligner import align
from utils import tokenize
from man_auto_comparison import *

# Loading and Analyzing the Datasets

In [2]:
dataset = load_dataset()
kriz_dataset_stats(dataset)

Loading Kriz Dataset...


100%|██████████| 16293/16293 [00:00<00:00, 221673.72it/s]

Wrong scores encountered 6 times. These are [11, 11, 15, 11, 11, 11]
Total words annotated: 16285, among them 3348 (20.0 percent) are complex, i.e. have score of 3 or higher
Distribution of scores in percentages goes as follows:
0	1	2	3	4	5	6	7	8	9	10
61.0	11.0	6.0	4.0	3.0	3.0	2.0	2.0	2.0	1.0	0.0
Mean is 1.35, median is 0, standard deviation is 2.32





In [3]:
aligned_data = load_aligned_data()

Indexing aligned data...


100%|██████████| 19198/19198 [00:11<00:00, 1718.08it/s]


# Comparing the complexity scores

In [4]:
dataset = load_dataset()
kriz_dataset_stats(dataset)
aligned_data = load_aligned_data()
score_matrix = np.zeros((3, 11), dtype=int)
aligned_total = 0
print("Processing Kriz dataset...")
sys.stdout.flush()
for sentence in tqdm(dataset):
    curr_matrix, aligned = process_sentence(sentence["tokens"],
                                                 sentence["token_scores"],
                                                 aligned_data)
    aligned_total += aligned
    score_matrix += curr_matrix

Loading Kriz Dataset...


100%|██████████| 16293/16293 [00:00<00:00, 217872.90it/s]

Wrong scores encountered 6 times. These are [11, 11, 15, 11, 11, 11]
Total words annotated: 16285, among them 3348 (20.0 percent) are complex, i.e. have score of 3 or higher
Distribution of scores in percentages goes as follows:
0	1	2	3	4	5	6	7	8	9	10
61.0	11.0	6.0	4.0	3.0	3.0	2.0	2.0	2.0	1.0	0.0
Mean is 1.35, median is 0, standard deviation is 2.32
Indexing aligned data...



100%|██████████| 19198/19198 [00:11<00:00, 1725.77it/s]


Processing Kriz dataset...


100%|██████████| 1738/1738 [2:23:23<00:00,  4.95s/it]  


In [7]:
sys.stdout.flush()
print("%d sentences total, %d aligned (%2.1f percent)." %
      (len(dataset), aligned_total, 100 * aligned_total / len(dataset)))
print("Word aligner failed to extract word-to-word alignment\n"
      "for %.2f percent of unidentical sentence alignments" %
      (100 * ALIGNMENT_STATS["unsuccessful"] / ALIGNMENT_STATS["total"]))

print("Kriz complexity score distribution for different alignment situations:")
print("               " + "\t".join([str(i) for i in range(11)]))
print("Word kept      " + "\t".join([str(round(100 * score_matrix[SIMPLE][i]/np.sum(score_matrix[SIMPLE]), 1)) 
                                     for i in range(11)]))
print("Word removed:  " + "\t".join([str(round(100 * score_matrix[UNK][i]/np.sum(score_matrix[UNK]), 1)) 
                                     for i in range(11)]))
print("Word replaced: " + "\t".join([str(round(100 * score_matrix[COMPLEX][i]/np.sum(score_matrix[COMPLEX]), 1)) 
                                     for i in range(11)]))

print("Word kept total: %d, word removed total: %d, word replaced total: %d" % (
    np.sum(score_matrix[SIMPLE]), np.sum(score_matrix[UNK]), np.sum(score_matrix[COMPLEX])))

ts = np.repeat(np.arange(11), score_matrix[SIMPLE, :])
print("If word is simple when it is kept, scores for simple words have:")
print("the mean of %.2f, the median of %d, standard deviation of %.2f." %
      (ts.mean(), np.median(ts), ts.std()))
tc = np.repeat(np.arange(11), score_matrix[COMPLEX, :])
print("If word is complex when it is replaced with another word, scores for complex words have:")
print("the mean of %.2f, the median of %d, standard deviation of %.2f." %
      (tc.mean(), np.median(tc), tc.std()))
tu = np.repeat(np.arange(11), score_matrix[UNK, :])
print("For other words, the scores have:")
print("the mean of %.2f, the median of %d, standard deviation of %.2f." %
      (tu.mean(), np.median(tu), tu.std()))

1738 sentences total, 1572 aligned (90.0 percent).
Word aligner failed to extract word-to-word alignment
for 2.00 percent of unidentical sentence alignments
Kriz complexity score distribution for different alignment situations:
               0	1	2	3	4	5	6	7	8	9	10
Word kept      65.0	11.0	5.0	4.0	3.0	2.0	2.0	1.0	1.0	1.0	0.0
Word removed:  48.0	10.0	6.0	5.0	5.0	4.0	5.0	5.0	5.0	3.0	0.0
Word replaced: 56.0	11.0	6.0	5.0	4.0	3.0	3.0	2.0	2.0	2.0	0.0
Word kept total: 10766, word removed total: 1797, word replaced total: 2037
If word is simple when it is kept, scores for simple words have:
the mean of 1.14, the median of 0, standard deviation of 2.10.
If word is complex when it is replaced with another word, scores for complex words have:
the mean of 1.61, the median of 0, standard deviation of 2.46.
For other words, the scores have:
the mean of 2.27, the median of 1, standard deviation of 2.94.
