In [1]:
# Import spacy
import spacy

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Collect the name and contents of every plain-text file in the 'data' folder.
# The two lists stay aligned index-for-index: texts[i] is the content of file_names[i].
texts = []
file_names = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible row order
for _file_name in sorted(os.listdir('data')):
    # Only plain-text files belong to the corpus
    if _file_name.endswith('.txt'):
        # Read inside a context manager so the file handle is always closed
        with open(os.path.join('data', _file_name), 'r', encoding='utf-8') as _text_file:
            texts.append(_text_file.read())
        # Record the file name alongside its text
        file_names.append(_file_name)

In [3]:
# Show which text files were picked up from the data folder
print(file_names)

['poe_the assignation.txt', 'poe_the black cat.txt', 'poe_the masque of the red death.txt', 'poe_the tell-tale heart.txt']


In [4]:
# Pair the list of file names with the list of texts under the column names used below
d = dict(Filename=file_names, Text=texts)

In [5]:
# Build the corpus DataFrame: one column per dictionary key
poe_df = pd.DataFrame(data=d)

In [6]:
# Preview the first rows of the raw corpus DataFrame
poe_df.head()

Unnamed: 0,Filename,Text
0,poe_the assignation.txt,THE ASSIGNATION\n\n\n Stay for me there! I...
1,poe_the black cat.txt,"THE BLACK CAT.\n\n\n For the most wild, y..."
2,poe_the masque of the red death.txt,THE MASQUE OF THE RED DEATH.\n\n\n The “R...
3,poe_the tell-tale heart.txt,THE TELL-TALE HEART.\n\n\n True!—nervous—...


In [7]:
# Collapse every run of whitespace (spaces, tabs, newlines) in each story into a single space,
# then trim leading and trailing whitespace.
# The pattern is a raw string so that "\s" reaches the regex engine without an invalid-escape warning.
poe_df['Text'] = poe_df['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
poe_df.head()

Unnamed: 0,Filename,Text
0,poe_the assignation.txt,THE ASSIGNATION Stay for me there! I will not ...
1,poe_the black cat.txt,"THE BLACK CAT. For the most wild, yet most hom..."
2,poe_the masque of the red death.txt,THE MASQUE OF THE RED DEATH. The “Red Death” h...
3,poe_the tell-tale heart.txt,"THE TELL-TALE HEART. True!—nervous—very, very ..."


In [8]:
# Load the metadata table from metadata.csv.
# As the preview below shows, its Filename values carry no ".txt" extension.
metadata_df = pd.read_csv('metadata.csv')
metadata_df.head()

Unnamed: 0,Filename,Author,Title,Language,Genre,First published
0,poe_the assignation,Edgar Allan Poe,The Assignation,English,gothic literature,1834
1,poe_the black cat,Edgar Allan Poe,The Black Cat,English,gothic literature,1843
2,poe_the masque of the red death,Edgar Allan Poe,The Masque of the Red Death,English,gothic literature,1842
3,poe_the tell-tale heart,Edgar Allan Poe,The Tell-Tale Heart,English,gothic literature,1843


In [9]:
# Strip the trailing ".txt" extension from each file name so it matches the metadata's Filename key.
# The dot is escaped and the pattern is anchored to the end of the string: with regex=True an
# unescaped "." matches any character, so a bare '.txt' could also delete text inside a name.
poe_df['Filename'] = poe_df['Filename'].str.replace(r'\.txt$', '', regex=True)

In [10]:
# Join the metadata (left table) with the story texts on their shared Filename column
poemeta_df = pd.merge(metadata_df, poe_df, on='Filename')

In [11]:
# Display the merged metadata + text DataFrame
poemeta_df

Unnamed: 0,Filename,Author,Title,Language,Genre,First published,Text
0,poe_the assignation,Edgar Allan Poe,The Assignation,English,gothic literature,1834,THE ASSIGNATION Stay for me there! I will not ...
1,poe_the black cat,Edgar Allan Poe,The Black Cat,English,gothic literature,1843,"THE BLACK CAT. For the most wild, yet most hom..."
2,poe_the masque of the red death,Edgar Allan Poe,The Masque of the Red Death,English,gothic literature,1842,THE MASQUE OF THE RED DEATH. The “Red Death” h...
3,poe_the tell-tale heart,Edgar Allan Poe,The Tell-Tale Heart,English,gothic literature,1843,"THE TELL-TALE HEART. True!—nervous—very, very ..."


In [12]:
# Load the spaCy pipeline named 'en_core_web_sm'; every later cell uses it through `nlp`
nlp = spacy.load('en_core_web_sm')

In [13]:
# Wrap the loaded pipeline in a function so it can be applied to a DataFrame column
def process_text(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and return its result."""
    doc = nlp(text)
    return doc

In [14]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each short story
poemeta_df['Doc'] = poemeta_df['Text'].apply(process_text)

In [15]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Return the ``text`` of every token in ``doc`` as a list, in iteration order."""
    token_texts = []
    for token in doc:
        token_texts.append(token.text)
    return token_texts

In [16]:
# Run the token retrieval function on the doc objects in the dataframe,
# storing each story's list of token texts in a new "Tokens" column
poemeta_df['Tokens'] = poemeta_df['Doc'].apply(get_token)
poemeta_df

Unnamed: 0,Filename,Author,Title,Language,Genre,First published,Text,Doc,Tokens
0,poe_the assignation,Edgar Allan Poe,The Assignation,English,gothic literature,1834,THE ASSIGNATION Stay for me there! I will not ...,"(THE, ASSIGNATION, Stay, for, me, there, !, I,...","[THE, ASSIGNATION, Stay, for, me, there, !, I,..."
1,poe_the black cat,Edgar Allan Poe,The Black Cat,English,gothic literature,1843,"THE BLACK CAT. For the most wild, yet most hom...","(THE, BLACK, CAT, ., For, the, most, wild, ,, ...","[THE, BLACK, CAT, ., For, the, most, wild, ,, ..."
2,poe_the masque of the red death,Edgar Allan Poe,The Masque of the Red Death,English,gothic literature,1842,THE MASQUE OF THE RED DEATH. The “Red Death” h...,"(THE, MASQUE, OF, THE, RED, DEATH, ., The, “, ...","[THE, MASQUE, OF, THE, RED, DEATH, ., The, “, ..."
3,poe_the tell-tale heart,Edgar Allan Poe,The Tell-Tale Heart,English,gothic literature,1843,"THE TELL-TALE HEART. True!—nervous—very, very ...","(THE, TELL, -, TALE, HEART, ., True!—nervous, ...","[THE, TELL, -, TALE, HEART, ., True!—nervous, ..."


In [17]:
# Take an independent copy of just the Text and Tokens columns for a side-by-side look
tokens = poemeta_df[['Text', 'Tokens']].copy()
tokens

Unnamed: 0,Text,Tokens
0,THE ASSIGNATION Stay for me there! I will not ...,"[THE, ASSIGNATION, Stay, for, me, there, !, I,..."
1,"THE BLACK CAT. For the most wild, yet most hom...","[THE, BLACK, CAT, ., For, the, most, wild, ,, ..."
2,THE MASQUE OF THE RED DEATH. The “Red Death” h...,"[THE, MASQUE, OF, THE, RED, DEATH, ., The, “, ..."
3,"THE TELL-TALE HEART. True!—nervous—very, very ...","[THE, TELL, -, TALE, HEART, ., True!—nervous, ..."


In [18]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    """Return the ``lemma_`` of every token in ``doc`` as a list, in iteration order."""
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

# Run the lemma retrieval function on the doc objects in the dataframe,
# storing each story's list of lemmas in a new "Lemmas" column
poemeta_df['Lemmas'] = poemeta_df['Doc'].apply(get_lemma)

In [19]:
# Display the DataFrame, now including the Lemmas column
poemeta_df

Unnamed: 0,Filename,Author,Title,Language,Genre,First published,Text,Doc,Tokens,Lemmas
0,poe_the assignation,Edgar Allan Poe,The Assignation,English,gothic literature,1834,THE ASSIGNATION Stay for me there! I will not ...,"(THE, ASSIGNATION, Stay, for, me, there, !, I,...","[THE, ASSIGNATION, Stay, for, me, there, !, I,...","[the, ASSIGNATION, stay, for, I, there, !, I, ..."
1,poe_the black cat,Edgar Allan Poe,The Black Cat,English,gothic literature,1843,"THE BLACK CAT. For the most wild, yet most hom...","(THE, BLACK, CAT, ., For, the, most, wild, ,, ...","[THE, BLACK, CAT, ., For, the, most, wild, ,, ...","[the, BLACK, CAT, ., for, the, most, wild, ,, ..."
2,poe_the masque of the red death,Edgar Allan Poe,The Masque of the Red Death,English,gothic literature,1842,THE MASQUE OF THE RED DEATH. The “Red Death” h...,"(THE, MASQUE, OF, THE, RED, DEATH, ., The, “, ...","[THE, MASQUE, OF, THE, RED, DEATH, ., The, “, ...","[the, MASQUE, of, the, RED, DEATH, ., the, "", ..."
3,poe_the tell-tale heart,Edgar Allan Poe,The Tell-Tale Heart,English,gothic literature,1843,"THE TELL-TALE HEART. True!—nervous—very, very ...","(THE, TELL, -, TALE, HEART, ., True!—nervous, ...","[THE, TELL, -, TALE, HEART, ., True!—nervous, ...","[the, TELL, -, TALE, HEART, ., true!—nervous, ..."


In [33]:
# Compare how often the exact string "die" occurs among the raw tokens versus among the lemmas.
# Each cell of the column holds a list, so count within each list and sum across all stories.
for _column, _label in (('Tokens', 'text tokens'), ('Lemmas', 'lemmas')):
    _hits = poemeta_df[_column].apply(lambda words: words.count('die')).sum()
    print(f'"die" appears in the {_label} column {_hits} times.')

"die" appears in the text tokens column 5 times.
"die" appears in the lemmas column 9 times.


In [21]:
# Define a function to retrieve parts of speech from a doc object
def get_pos(doc):
    """Return one ``(pos_, tag_)`` tuple per token in ``doc``.

    ``pos_`` is the coarse-grained part of speech and ``tag_`` the fine-grained one.
    """
    pos_pairs = []
    for token in doc:
        pos_pairs.append((token.pos_, token.tag_))
    return pos_pairs

# Run the part-of-speech retrieval function on the doc objects in the dataframe,
# storing each story's list of (coarse, fine) tag pairs in a new "POS" column
poemeta_df['POS'] = poemeta_df['Doc'].apply(get_pos)

In [22]:
# Show the part-of-speech pairs for every story as a plain list (the output is long)
list(poemeta_df['POS'])

[[('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('VERB', 'VB'),
  ('ADP', 'IN'),
  ('PRON', 'PRP'),
  ('ADV', 'RB'),
  ('PUNCT', '.'),
  ('PRON', 'PRP'),
  ('AUX', 'MD'),
  ('PART', 'RB'),
  ('VERB', 'VB'),
  ('PUNCT', '.'),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NN'),
  ('PUNCT', '.'),
  ('PUNCT', '-LRB-'),
  ('PUNCT', 'NFP'),
  ('PROPN', 'NNP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NN'),
  ('PUNCT', ','),
  ('ADP', 'IN'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PUNCT', ','),
  ('PROPN', 'NNP'),
  ('ADP', 'IN'),
  ('PROPN', 'NNP'),
  ('PUNCT', 'NFP'),
  ('PUNCT', '.'),
  ('PUNCT', '-RRB-'),
  ('PROPN', 'NNP'),
  ('PUNCT', 'HYPH'),
  ('ADJ', 'JJ'),
  ('CCONJ', 'CC'),
  ('ADJ', 'JJ'),
  ('VERB', 'VBN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('NOUN', 'NN'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NN'),
  ('PUNCT', ','),
  ('CCONJ'

In [23]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    """Return the text of every token in ``doc`` whose ``pos_`` is ``'PROPN'``."""
    proper_nouns = []
    for token in doc:
        if token.pos_ != 'PROPN':
            continue
        proper_nouns.append(token.text)
    return proper_nouns

# Apply function to Doc column and store each story's proper nouns in a new "Proper_Nouns" column
poemeta_df['Proper_Nouns'] = poemeta_df['Doc'].apply(extract_proper_nouns)

In [24]:
# Show the extracted proper nouns for every story as a plain list (the output is long)
list(poemeta_df['Proper_Nouns'])

[['ASSIGNATION',
  'Exequy',
  'Henry',
  'King',
  'Bishop',
  'Chichester',
  'Ill',
  'thou',
  'cold',
  'thou',
  '_',
  'Venice',
  'Elysium',
  'thou',
  '_',
  'shouldst',
  'Venice',
  '_',
  'Ponte',
  'Sospiri',
  'Bridge',
  'Sighs',
  'Genius',
  'Romance',
  'Piazza',
  'Campanile',
  'Ducal',
  'Palace',
  'Piazetta',
  'Grand',
  'Canal',
  'San',
  'Marco',
  'Bridge',
  'Sighs',
  'Ducal',
  'Palace',
  'Marchesa',
  'Aphrodite',
  'Venice',
  'Mentoni',
  'snowy',
  'Niobe',
  'Old',
  'Republic',
  'Venice',
  '_',
  '_',
  'Marchesa',
  'Mentoni',
  'Nonsense!—Who',
  'Marchesa',
  'Satyr',
  'Mentoni',
  'Old',
  'Marchesa',
  'Marchesa',
  'Europe',
  'Marchesa',
  '_',
  'stranger—_another',
  '’s',
  '_',
  'Marchesa',
  'Pliny',
  'Napoli',
  '_',
  '_',
  '_',
  '_',
  'Mentoni',
  'Thou',
  'thou',
  'hast',
  'Bridge',
  'Sighs',
  'Herculean',
  'Emperor',
  'Commodus',
  'Palazzo',
  'Grand',
  'Canal',
  'Rialto',
  'Report',
  'Europe',
  '_',
  '_',
  

In [25]:
# Ask spaCy for the human-readable meaning of the fine-grained tag "JJ"
spacy.explain("JJ")

'adjective (English), other noun-modifier (Chinese)'

In [26]:
# I believe the adjectives in Poe's short stories are very distinctive of this genre,
# which is why I wanted to extract them.
def extract_adjectives(doc):
    """Return the text of every token in ``doc`` whose ``pos_`` is ``'ADJ'``."""
    adjectives = []
    for token in doc:
        if token.pos_ != 'ADJ':
            continue
        adjectives.append(token.text)
    return adjectives

# Apply function to Doc column and store resulting adjectives in new column
poemeta_df['Adjectives'] = poemeta_df['Doc'].apply(extract_adjectives)

In [27]:
# Show the adjectives for the row with index label 1 only (the output is long)
list(poemeta_df.loc[[1], 'Adjectives'])

[['wild',
  'about',
  'Mad',
  'very',
  'own',
  'mad',
  'immediate',
  'mere',
  'little',
  'many',
  'terrible',
  'common',
  'calm',
  'logical',
  'excitable',
  'own',
  'more',
  'ordinary',
  'natural',
  'conspicuous',
  'fond',
  'great',
  'most',
  'happy',
  'principal',
  'faithful',
  'sagacious',
  'derivable',
  'unselfish',
  'frequent',
  'mere',
  'happy',
  'uncongenial',
  'own',
  'domestic',
  'agreeable',
  'fine',
  'small',
  'latter',
  'large',
  'beautiful',
  'black',
  'sagacious',
  'astonishing',
  'little',
  'tinctured',
  'frequent',
  'ancient',
  'popular',
  'black',
  'serious',
  'better',
  'favorite',
  'several',
  'general',
  'radical',
  'worse',
  'moody',
  'irritable',
  'intemperate',
  'personal',
  'sufficient',
  'old',
  'ill',
  'intoxicated',
  'slight',
  'original',
  'more',
  'fiendish',
  'poor',
  'damnable',
  'guilty',
  'best',
  'feeble',
  'equivocal',
  'untouched',
  'lost',
  'true',
  'frightful',
  'usual',
 

In [28]:
# Look up every entity label exposed by the pipeline's "ner" component
labels = nlp.get_pipe("ner").labels

# Print one "LABEL : description" line per entity label
for label in labels:
    print(' : '.join((label, spacy.explain(label))))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [29]:
# Define function to extract named-entity labels from doc objects
# NOTE(review): a later cell defines another function under this same name (returning the
# entity spans instead of their labels); after that cell runs, this version is shadowed.
def extract_named_entities(doc):
    """Return the ``label_`` of every entity in ``doc.ents`` as a list, in order."""
    entity_labels = []
    for ent in doc.ents:
        entity_labels.append(ent.label_)
    return entity_labels

# Apply function to Doc column and store each story's entity labels in a new "Named_Entities" column
poemeta_df['Named_Entities'] = poemeta_df['Doc'].apply(extract_named_entities)
poemeta_df['Named_Entities']

0    [PERSON, PERSON, GPE, ORG, NORP, TIME, GPE, PE...
1    [ORG, DATE, EVENT, TIME, TIME, CARDINAL, CARDI...
2    [ORG, PRODUCT, TIME, GPE, CARDINAL, CARDINAL, ...
3    [LOC, ORDINAL, DATE, TIME, TIME, TIME, DATE, T...
Name: Named_Entities, dtype: object

In [30]:
# Define function to extract the entities themselves (not their labels) from doc objects
# NOTE(review): this reuses the name of the label-extracting function defined in an earlier
# cell and replaces it from this point on.
def extract_named_entities(doc):
    """Return the members of ``doc.ents`` as a list, in order."""
    return list(doc.ents)

# Apply function to Doc column and store each story's list of entities in a new "NE_Words" column
poemeta_df['NE_Words'] = poemeta_df['Doc'].apply(extract_named_entities)
poemeta_df['NE_Words']

0    [(Exequy), (Henry, King), (Venice), (Elysium, ...
1    [(THE, BLACK, CAT), (several, years), (the, Fi...
2    [(The, “, Red, Death), (Avatar), (half, an, ho...
3    [(earth), (first), (the, whole, week), (every,...
Name: NE_Words, dtype: object

In [31]:
# Extract the Doc object at index label 1 (the second row of the merged DataFrame, not the first)
doc = poemeta_df['Doc'][1]

# Visualize named entity tagging for that single story
displacy.render(doc, style='ent', jupyter=True)

In [33]:
# Optional: run this step only to save the tagged DataFrame as a CSV in the working directory
poemeta_df.to_csv('poe_short_stories_with_spaCy_tags.csv')