# spaCy Tags

This notebook lists the [spaCy](https://spacy.io/) part-of-speech tags, syntactic dependency relations, and stop words occurring in the CommonLit training dataset.

In [None]:
from matplotlib.pyplot import figure, title, xlabel, ylabel, scatter, legend, colorbar, plot
from pandas            import read_csv
from os                import walk
from os.path           import join
from spacy             import load, explain
from spacy             import displacy

# Load Data

## Data Dictionary

|Train|Public Test|Hidden Test|Description|
|--------------|--------------|----------|----------------------------------------------------|
|id|id|id|unique ID for excerpt|
|url_legal|url_legal|- |URL of source (Omitted from some records in the test set--see [note](https://www.kaggle.com/c/commonlitreadabilityprize/discussion/238670#1306025))|
|license|license |-|license of source material (Omitted from some records in the test set--see [note](https://www.kaggle.com/c/commonlitreadabilityprize/discussion/238670#1306025))|
|excerpt|excerpt|excerpt|text for predicting readability|
|target|-|-|readability|
|standard_error|-|-|Measure of spread of scores among multiple raters for each excerpt|

In [None]:
# Search every file below the Kaggle input directory and read the one whose
# name begins with 'train' into a DataFrame.
# train_data is left as None when no such file is found; if several files
# match, the one visited last by walk() is the one that is kept.
train_data = None

for folder, _, names in walk('/kaggle/input'):
    for name in names:
        if not name.startswith('train'):
            continue
        train_data = read_csv(join(folder, name))
   

# Extract lists of tags, dependencies, and stop words.

In [None]:
# Run the spaCy pipeline over every training excerpt and collect:
#   POSs  - coarse part-of-speech label -> set of fine-grained tags seen with it
#   STOPs - lower-cased stop word       -> (pos, tag) of one of its occurrences
#   DEPs  - set of dependency relation labels seen
nlp   = load("en_core_web_sm")
POSs  = {}
STOPs = {}
DEPs  = set()

# nlp.pipe processes the texts as a batched stream instead of one call per
# excerpt; it yields one Doc per input text, in input order, so the collected
# results are the same as with a per-text nlp(text) loop.
for doc in nlp.pipe(train_data.excerpt):
    for token in doc:
        # Create the tag set the first time a POS label is seen, then add to it.
        POSs.setdefault(token.pos_, set()).add(token.tag_)
        if token.is_stop:
            # A later occurrence overwrites an earlier one, so only the most
            # recently seen (pos, tag) pair is kept for each stop word.
            STOPs[token.lower_] = (token.pos_, token.tag_)
        DEPs.add(token.dep_)


# Parts of Speech tags occurring in corpus

In [None]:
# List each coarse part-of-speech label with spaCy's description of it,
# followed by the fine-grained tags recorded under that label (indented by
# two spaces). Both levels are printed in sorted order.
for pos_label in sorted(POSs):
    pos_description = explain(pos_label)
    print(f'{pos_label}\t{pos_description}')
    for tag_label in sorted(POSs[pos_label]):
        tag_description = explain(tag_label)
        print(f'  {tag_label}\t{tag_description}')


# Stop words occurring in corpus

In [None]:
# Print one line per stop word, sorted alphabetically: the word right-aligned
# in a 10-character column, then the POS label and tag stored for it.
# Dictionary keys are unique, so sorting the items orders them by word alone.
for word, (pos, tag) in sorted(STOPs.items()):
    print(f'{word:>10}\t{pos}\t{tag}')

# Dependencies found in corpus

In [None]:
# Print each dependency label seen in the corpus, sorted alphabetically and
# right-aligned in a 10-character column, next to spaCy's description of it.
for label in sorted(DEPs):
    description = explain(label)
    print(f'{label:>10}\t{description}')