In [None]:
!python -m spacy download en_core_web_trf
!pip install spacy
!pip install spacy-transformers

In [None]:
import spacy
import spacy_transformers

In [2]:
import pandas as pd
import requests
from tqdm import tqdm
from collections import Counter
import json

In [None]:
lit_df = pd.read_csv('/content/drive/MyDrive/NLP_2023/literature.csv')
lit_df = lit_df[lit_df['author']!='William Shakespeare']
lit_df

Unnamed: 0,author,title,year,url,language,source,period
0,Jonathan Swift,A Tale of the Tub,1704,https://www.gutenberg.org/files/4737/4737-0.txt,English,Gutenberg,Augustan literature
1,Jonathan Swift,The Journal to Stella,1710,https://www.gutenberg.org/files/4208/4208-0.txt,English,Gutenberg,Augustan literature
2,Jonathan Swift,The Prose Works of Jonathan Swift,1724,https://www.gutenberg.org/cache/epub/12784/pg1...,English,Gutenberg,Augustan literature
3,Jonathan Swift,Gulliver's Travels,1726,https://www.gutenberg.org/files/829/829-0.txt,English,Gutenberg,Augustan literature
4,Jane Austen,Sense and Sensibility,1811,https://www.gutenberg.org/files/161/161-0.txt,English,Gutenberg,Romanticism
5,Jane Austen,Pride and Prejudice,1813,https://www.gutenberg.org/cache/epub/1342/pg13...,English,Gutenberg,Romanticism
6,Jane Austen,Mansfield Park,1814,https://www.gutenberg.org/cache/epub/141/pg141...,English,Gutenberg,Romanticism
7,Jane Austen,Emma,1815,https://www.gutenberg.org/cache/epub/158/pg158...,English,Gutenberg,Romanticism
8,Jane Austen,Persuasion,1818,https://www.gutenberg.org/cache/epub/105/pg105...,English,Gutenberg,Romanticism
9,Jane Austen,Northanger Abbey,1818,https://www.gutenberg.org/cache/epub/121/pg121...,English,Gutenberg,Romanticism


In [None]:
nlp = spacy.load("en_core_web_trf")

In [None]:
def break_down_text(url):
  pos_list = []
  ent_list = []
  
  page = requests.get(url)
  clean_text = ('\r\n\r\n').join([x for x in page.text.split('\r\n\r\n') if 'gutenberg' not in x.lower()]).split('START: FULL LICENSE')[0]
  split_text = clean_text.split('\n')
  # split_text = clean_text.split('\n')[1000:2000] ## for testing 
  
  for entry in split_text:
  # for entry in tqdm(split_text):
    doc = nlp(entry)
    for token in doc:
      pos_list.append((token.text, token.pos_))
    for ent in doc.ents:
      ent_list.append((ent.text, ent.label_))
  
  counter_pos = Counter(pos_list)
  counter_ents = Counter(ent_list)

  return counter_pos.most_common(), counter_ents.most_common()

In [None]:
url = 'https://www.gutenberg.org/cache/epub/863/pg863.txt'

pos, ents = break_down_text(url)

100%|██████████| 8533/8533 [26:37<00:00,  5.34it/s]


In [None]:
def create_df(pos,ents,threshold=5):
  ents = [[x[0][0],x[0][1],x[1],'ENT'] for x in ents if x[1] > threshold]#[:top_k]
  pos = [[x[0][0],x[0][1],x[1], 'POS'] for x in pos if x[1] > threshold]#[:top_k]
  df = pd.concat([pd.DataFrame(pos,columns=['text','type','count','category']),pd.DataFrame(ents,columns=['text','type','count','category'])])
  df = df[(df['type']!='PUNCT')&(df['type']!='PART')&(df['type']!='SPACE')]

  return df

In [None]:
df = create_df(pos,ents)
df

Unnamed: 0,text,type,count,category
3,the,DET,2377,POS
6,I,PRON,1780,POS
7,of,ADP,1239,POS
8,a,DET,1135,POS
9,and,CCONJ,1011,POS
...,...,...,...,...
58,Styles Court,FAC,6,ENT
59,Hercule Poirot,PERSON,6,ENT
60,Leastways Cottage,FAC,6,ENT
61,Inglethorp,GPE,6,ENT


In [None]:
def nest_df(df, threshold=2):
  pos_list = []
  ent_list = []
  df = df[(df['count']>threshold)]

  pos_cats = list(set(df[df['category']=='POS']['type']))
  ent_cats = list(set(df[df['category']=='ENT']['type']))

  for pos in pos_cats:
    pos_list.append({'name':pos,'children':[]})
  for ent in ent_cats:
    ent_list.append({'name':ent,'children':[]})

  for pos in pos_list:
    df_reduce = df[df['type']==pos['name']]
    word_collection = []
    for idx in range(len(df_reduce)):
      word_collection.append({'name':df_reduce['text'].iloc[idx], 'value':int(df_reduce['count'].iloc[idx])})
    pos['children'] = word_collection

  for ent in ent_list:
    df_reduce = df[df['type']==ent['name']]
    word_collection = []
    for idx in range(len(df_reduce)):
      word_collection.append({'name':df_reduce['text'].iloc[idx], 'value':int(df_reduce['count'].iloc[idx])})
    ent['children'] = word_collection

  return pos_list, ent_list

In [None]:
#nest_df(df, threshold=2)

In [None]:
def create_hierarchical_data(lit_df,period,threshold=0):
  ## PERIOD
  ## AUTHOR
  ## BOOK
  ## ENTS / POS
  ## POS Types / Ent Types
  ## Alphabetical ?
  data = {}

  periods = [period]

  data['name'] = 'literature'
  data['children'] = []

  for period in periods:
    data['children'].append({'name':period, 'children':[]})
    
  for entry in data['children']:
    period = entry['name']
    authors = list(set(lit_df[lit_df['period']==period]['author']))
    for auth in authors:
      entry['children'].append({'name': auth, 'children':[]})

  for entry in data['children']:
    for author in entry['children']:
      books = list(set(lit_df[lit_df['author']==author['name']]['title']))
      for book in books:
        author['children'].append({'name': book, 'children':[]})

  for entry in data['children']:
    for author in entry['children']:
      for book in author['children']:
        print(f"Preparing {book['name']} -- By {author['name']}")
        url = lit_df[lit_df['title']==book['name']]['url'].iloc[0]
        pos, ents = break_down_text(url)
        df = create_df(pos,ents)
        pos_types, ent_types = nest_df(df, threshold)

        book['children'].append({'name':'parts_of_speech', 'children':pos_types})
        book['children'].append({'name':'named_entities', 'children':ent_types})
        
          
  dumps = json.dumps(data, indent=4)  

  with open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{period}.json", "w") as outfile:
      outfile.write(dumps)

  return data

In [None]:
periods = list(set(lit_df['period']))
periods

['Modernism',
 'American Realism',
 'Proto-Modernism',
 'Victorian',
 'Augustan literature',
 'Genre Fiction',
 'Romanticism']

In [None]:
for period in periods:
  create_hierarchical_data(lit_df,period,threshold=0)

Preparing The Murder of Roger Ackroyd -- By Agatha Christie
Preparing The Man in the Brown Suit -- By Agatha Christie
Preparing The Mysterious Affair at Styles -- By Agatha Christie
Preparing The Secret Adversary -- By Agatha Christie
Preparing The Murder on the Links -- By Agatha Christie
Preparing A Tale of the Tub -- By Jonathan Swift
Preparing The Journal to Stella -- By Jonathan Swift
Preparing The Prose Works of Jonathan Swift -- By Jonathan Swift
Preparing Gulliver's Travels -- By Jonathan Swift
Preparing The Prince and The Pauper -- By Mark Twain
Preparing A Connecticut Yankee in King Arthur's Court -- By Mark Twain
Preparing Life on the Mississippi -- By Mark Twain
Preparing The Tragedy of Pudd'nhead Wilson -- By Mark Twain
Preparing The Adventures of Tom Sawyer -- By Mark Twain
Preparing Adventures of Huckleberry Finn -- By Mark Twain
Preparing Innocents Abroad -- By Mark Twain
Preparing The Time Machine -- By H.G. Wells
Preparing The War of the Worlds -- By H.G. Wells
Prepar

Filtering JSON Data for Visualizations

In [None]:
### Set Minimum Number of Examples

def set_minimum(file, min=50):
  f = open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file}.json")
  data = json.load(f)
  new_data = data

  authors = new_data['children'][0]['children']
  for author in authors:
    for book in author['children']:
      for category in book['children']:
        for subcategory in category['children']:
          new_children = []
          for word in subcategory['children']:
            if word['value'] > min:
              new_children.append(word)
          subcategory['children'] = new_children

  dumps = json.dumps(new_data, indent=4)  

  with open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file.lower().replace(' ','_')}_min_{min}.json", "w") as outfile:
      outfile.write(dumps)


  return new_data

set_minimum('American Realism', min=5)


In [None]:
### Extract only Entities (with min)

f = open("/content/drive/MyDrive/NLP_2023/lit_breakdown/Romanticism.json")

data = json.load(f)

def only_entities(file, min=5):
  f = open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file}.json")
  data = json.load(f)
  new_data = data

  authors = new_data['children'][0]['children']
  for author in authors:
    for book in author['children']:
      for category in book['children']:
        for subcategory in category['children']:
          new_children = []
          for word in subcategory['children']:
            if word['value'] > min:
              new_children.append(word)
          subcategory['children'] = new_children

  authors = new_data['children'][0]['children']
  for author in authors:
    for book in author['children']:
      new_entities = []
      for category in book['children']:
        if category['name'] == 'Named Entities':
          new_entities.append(category['children'])
      if len(new_entities) == 1:
        book['children'] = new_entities[0]
      

  dumps = json.dumps(new_data, indent=4)  

  with open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file.lower().replace(' ','_')}_ents_min_{min}.json", "w") as outfile:
      outfile.write(dumps)
  return new_data
  
# only_entities('Romanticism', min=5)

In [8]:
## This function extracts a particular category of entity



def only_one_type_entities(file, ent='GPE', min=1):
  f = open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file}.json")

  data = json.load(f)
  new_data = data

  authors = new_data['children'][0]['children']
  for author in authors:
    for book in author['children']:
      for category in book['children']:
        for subcategory in category['children']:
          new_children = []
          for word in subcategory['children']:
            if word['value'] > min:
              new_children.append(word)
          subcategory['children'] = new_children

  # books = new_data['children'][0]['children'][1]['children']
  authors = new_data['children'][0]['children']
  for author in authors:
    for book in author['children']:
      for category in book['children']:
        new_category = []
        for subcategory in category['children']:
          if subcategory['name'] == ent:
            new_category.append(subcategory['children'])
        if len(new_category) == 1:
          book['children'] = new_category[0]

  return {"name": file, "children": new_data['children'][0]['children']}

# only_one_type_entities(data, ent='GPE')

jane_austen_places = only_one_type_entities('American Realism', ent='GPE', min=5)

# dumps = json.dumps(jane_austen_places, indent=4)  

# with open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/jane_austen_places.json", "w") as outfile:
#     outfile.write(dumps)

jane_austen_places

{'name': 'American Realism',
 'children': [{'name': 'Mark Twain',
   'children': [{'name': 'The Prince and The Pauper',
     'children': [{'name': 'England', 'value': 52},
      {'name': 'London', 'value': 29},
      {'name': 'Westminster', 'value': 8},
      {'name': 'Southwark', 'value': 8}]},
    {'name': "A Connecticut Yankee in King Arthur's Court",
     'children': [{'name': 'England', 'value': 32},
      {'name': 'London', 'value': 15},
      {'name': 'Britain', 'value': 14},
      {'name': 'Ireland', 'value': 7}]},
    {'name': 'Life on the Mississippi',
     'children': [{'name': 'New Orleans', 'value': 104},
      {'name': 'StLouis', 'value': 77},
      {'name': 'Cairo', 'value': 31},
      {'name': 'Vicksburg', 'value': 28},
      {'name': 'Missouri', 'value': 26},
      {'name': 'Memphis', 'value': 26},
      {'name': 'Arkansas', 'value': 23},
      {'name': 'StPaul', 'value': 22},
      {'name': 'Orleans', 'value': 19},
      {'name': 'Natchez', 'value': 17},
      {'name'

In [9]:
### Subdivide Alphabetically with Minimum

def subdivide_alphabetically(file, min=50):

  f = open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file}.json")

  data = json.load(f)

  new_data = data

  authors = new_data['children'][0]['children']
  for author in authors:
    for book in author['children']:
      for category in book['children']:
        for subcategory in category['children']:
          new_children = []
          for word in subcategory['children']:
            if word['value'] > min:
              new_children.append(word)
          subcategory['children'] = new_children
          letter_set = list(set([word['name'][0].upper() for word in subcategory['children']]))
          letter_set.sort()
          alphabetized = [{"name":letter, "children":[]} for letter in letter_set]
          # print(alphabetized)

          
          for alphabet in alphabetized:
            for word in subcategory['children']:
              if word['name'][0].upper() == alphabet['name']:
                alphabet['children'].append(word)
          subcategory['children'] = alphabetized
         

  return new_data

file = 'American Realism'

min = 25

new_data = subdivide_alphabetically(file, min=min)

dumps = json.dumps(new_data, indent=4)  

with open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/{file.lower().replace(' ','_')}_alphabetized_min_{min}.json", "w") as outfile:
    outfile.write(dumps)



In [11]:
categories = ['Modernism',
 'American Realism',
 'Proto-Modernism',
 'Victorian',
 'Augustan literature',
 'Genre Fiction',
 'Romanticism']

full_literature_data = {
    "name": 'literature',
    "children" : [only_one_type_entities(cat, ent='PERSON', min=5) for cat in categories]

}

dumps = json.dumps(full_literature_data, indent=4)  

with open(f"/content/drive/MyDrive/NLP_2023/lit_breakdown/full_lit_alphabetized_PERSON.json", "w") as outfile:
    outfile.write(dumps)
