In [None]:
!python -m spacy download en_core_web_trf
!pip install spacy
!pip install spacy-transformers

In [None]:
import spacy
import spacy_transformers



In [None]:
import pandas as pd
import requests
from tqdm import tqdm
from collections import Counter
import json

In [None]:
# Read the corpus catalogue and leave William Shakespeare's rows out.
lit_df = (
  pd.read_csv('literature.csv')
  .loc[lambda frame: frame['author'] != 'William Shakespeare']
)
lit_df

In [None]:
# Load the en_core_web_trf pipeline once; the extraction functions in this
# notebook read this module-level `nlp` object.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Peek at the first Jane Austen row. Only the last expression of a cell is
# displayed, so the title is what shows up below.
lit_df.loc[lit_df['author'].eq('Jane Austen'), 'url'].iloc[0]
lit_df.loc[lit_df['author'].eq('Jane Austen'), 'title'].iloc[0]

'Sense and Sensibility'

In [None]:
def extract_adjective_noun_pairs(url):
  """Count adjective/noun modifier pairs in a plain-text book.

  Downloads the text at ``url``, drops every blank-line-separated paragraph
  that mentions "gutenberg" (case-insensitive) and everything from the
  "START: FULL LICENSE" marker onwards, then parses the remaining text line
  by line with the module-level ``nlp`` pipeline. A pair is recorded for
  every token whose dependency label is ``amod`` and whose head is a NOUN.

  Args:
    url: Address of the plain-text file to download.

  Returns:
    A list of ``(('amod', adjective, noun), count)`` tuples, most frequent
    first, as produced by ``Counter.most_common()``.

  Raises:
    requests.HTTPError: If the server answers with an error status.
  """
  page = requests.get(url, timeout=60)
  # Fail loudly on 4xx/5xx instead of silently parsing an error page.
  page.raise_for_status()

  # Normalise line endings before the paragraph filter. Splitting on CRLF
  # pairs only would treat an LF-only file as one single paragraph, which is
  # then discarded in full as soon as it mentions "gutenberg" anywhere.
  text = page.text.replace('\r\n', '\n').replace('\r', '\n')
  paragraphs = [para for para in text.split('\n\n') if 'gutenberg' not in para.lower()]
  clean_text = '\n\n'.join(paragraphs).split('START: FULL LICENSE')[0]
  # Blank lines contain no tokens, so do not spend pipeline time on them.
  lines = [line for line in clean_text.split('\n') if line.strip()]

  results = []
  # nlp.pipe processes the lines as a stream and yields one Doc per line, in order.
  for doc in tqdm(nlp.pipe(lines), total=len(lines)):
    for token in doc:
      if token.dep_ == 'amod' and token.head.pos_ == 'NOUN':
        results.append(('amod', token.text, token.head.text))

  return Counter(results).most_common()

# Try the extractor on a single book URL and keep the (pair, count) list in `test`.
test = extract_adjective_noun_pairs('https://www.gutenberg.org/files/161/161-0.txt')

100%|██████████| 12693/12693 [24:40<00:00,  8.57it/s]


In [None]:
def create_network_data(data, title, source='adjective',target='noun', threshold=3,
                        output_dir="/content/drive/MyDrive/NLP_2023/pos_network_data"):
  """Turn counted word pairs into a node/link graph and save it as JSON.

  Args:
    data: Iterable of ``(pair, count)`` items where ``pair[1]`` is the source
      word and ``pair[2]`` the target word (the shape returned by
      ``extract_adjective_noun_pairs``).
    title: Book identifier used in the output file name.
    source: Value stored as the ``type`` of source-word nodes.
    target: Value stored as the ``type`` of target-word nodes.
    threshold: Only pairs with ``count > threshold`` (strictly greater) are kept.
    output_dir: Directory that receives ``{title}-{source}-{target}.json``.
      It must already exist. Defaults to the Drive folder this notebook
      has always written to.

  Returns:
    The list of node dicts (``{"id": word, "type": ...}``), one per distinct word.
  """
  # Apply the threshold once instead of re-filtering for every output list.
  kept = [(pair, count) for pair, count in data if count > threshold]

  # One node per distinct word, in order of first appearance (all source words
  # first, then target words). A word that occurs in both roles keeps its
  # first position but ends up with the target type, because the later
  # assignment overwrites the earlier one.
  nodes_by_id = {}
  for pair, _ in kept:
    nodes_by_id[pair[1]] = {"id": pair[1], "type": source}
  for pair, _ in kept:
    nodes_by_id[pair[2]] = {"id": pair[2], "type": target}
  nodes = list(nodes_by_id.values())

  links = [{"source": pair[1], "target": pair[2], "count": count} for pair, count in kept]

  # Serialise before opening the file so a serialisation error cannot leave
  # a truncated file behind.
  dumps = json.dumps({"nodes": nodes, "links": links}, indent=4)

  with open(f"{output_dir}/{title}-{source}-{target}.json", "w") as outfile:
    outfile.write(dumps)

  return nodes


In [None]:
def extract_all(df):
  """Run the adjective/noun extraction for every row of ``df``.

  For each row the ``title`` is lower-cased and its spaces replaced by
  hyphens, the text at ``url`` is processed with
  ``extract_adjective_noun_pairs``, and the result is written out through
  ``create_network_data``. The title slug is printed once a book is done.

  Args:
    df: DataFrame with ``title`` and ``url`` columns.
  """
  for raw_title, book_url in zip(df['title'], df['url']):
    slug = raw_title.lower().replace(' ','-')
    pair_counts = extract_adjective_noun_pairs(book_url)
    create_network_data(pair_counts, slug, source='adjective', target='noun', threshold=3)
    print(slug)
  
# Process every book in the filtered catalogue; each one is downloaded, parsed
# and written to its own JSON file by create_network_data.
extract_all(lit_df)

100%|██████████| 4455/4455 [09:32<00:00,  7.79it/s]


a-tale-of-the-tub


100%|██████████| 22306/22306 [49:57<00:00,  7.44it/s]


the-journal-to-stella


100%|██████████| 9199/9199 [18:39<00:00,  8.22it/s]


the-prose-works-of-jonathan-swift


100%|██████████| 9561/9561 [20:11<00:00,  7.89it/s]


gulliver's-travels


100%|██████████| 12693/12693 [25:15<00:00,  8.38it/s]


sense-and-sensibility


100%|██████████| 14550/14550 [27:21<00:00,  8.87it/s]


pride-and-prejudice


100%|██████████| 15692/15692 [30:37<00:00,  8.54it/s]


mansfield-park


100%|██████████| 16510/16510 [32:15<00:00,  8.53it/s]


emma


100%|██████████| 8375/8375 [16:03<00:00,  8.69it/s]


persuasion


100%|██████████| 8014/8014 [15:26<00:00,  8.65it/s]


northanger-abbey


100%|██████████| 7381/7381 [14:10<00:00,  8.68it/s]


frankenstein


100%|██████████| 4631/4631 [09:09<00:00,  8.42it/s]


mathilda


100%|██████████| 16557/16557 [33:02<00:00,  8.35it/s]


the-last-man


100%|██████████| 18831/18831 [35:57<00:00,  8.73it/s]


oliver-twist


100%|██████████| 38218/38218 [1:14:59<00:00,  8.49it/s]


david-copperfield


100%|██████████| 39870/39870 [1:14:53<00:00,  8.87it/s]


bleak-house


100%|██████████| 11674/11674 [24:10<00:00,  8.05it/s]


hard-times


100%|██████████| 15924/15924 [31:16<00:00,  8.49it/s]


a-tale-of-two-cities


100%|██████████| 20417/20417 [41:53<00:00,  8.12it/s]


great-expectations


100%|██████████| 18752/18752 [38:16<00:00,  8.17it/s]


innocents-abroad


100%|██████████| 8894/8894 [17:56<00:00,  8.26it/s]


the-adventures-of-tom-sawyer


100%|██████████| 14834/14834 [29:06<00:00,  8.49it/s]


life-on-the-mississippi


100%|██████████| 8176/8176 [16:24<00:00,  8.31it/s]


the-prince-and-the-pauper


100%|██████████| 12031/12031 [24:01<00:00,  8.35it/s]


adventures-of-huckleberry-finn


100%|██████████| 12863/12863 [24:56<00:00,  8.59it/s]


a-connecticut-yankee-in-king-arthur's-court


100%|██████████| 5972/5972 [11:31<00:00,  8.64it/s]


the-tragedy-of-pudd'nhead-wilson


100%|██████████| 3195/3195 [06:31<00:00,  8.17it/s]


the-time-machine


100%|██████████| 4736/4736 [09:23<00:00,  8.40it/s]


the-island-of-doctor-moreau


100%|██████████| 5776/5776 [11:25<00:00,  8.43it/s]


the-invisible-man


100%|██████████| 6388/6388 [12:54<00:00,  8.25it/s]


the-war-of-the-worlds


100%|██████████| 1/1 [00:00<00:00, 1295.74it/s]


heart-of-darkness


100%|██████████| 11300/11300 [25:01<00:00,  7.53it/s]


lord-jim


 18%|█▊        | 3148/17185 [06:34<27:49,  8.41it/s]

In [None]:

# Print the dependency parse of a sample sentence, one line per token:
# text, dependency label, head text, head part of speech, and child tokens.
doc = nlp("He ran quickly up the stairs")
for token in doc:
  children = list(token.children)
  print(token.text, token.dep_, token.head.text, token.head.pos_, children)


He nsubj ran VERB []
ran ROOT ran VERB [He, quickly, up]
quickly advmod ran VERB []
up prep ran VERB [stairs]
the det stairs NOUN []
stairs pobj up ADP [the]


In [None]:
def extract_root_pairs(url):
  """Count proper-noun subjects together with the verbs they govern.

  Downloads the text at ``url``, drops every blank-line-separated paragraph
  that mentions "gutenberg" (case-insensitive) and everything from the
  "START: FULL LICENSE" marker onwards, then parses the remaining text line
  by line with the module-level ``nlp`` pipeline. A match is a token with
  dependency label ``nsubj`` and part of speech PROPN whose head is a VERB.

  Args:
    url: Address of the plain-text file to download.

  Returns:
    A list of ``(('nsubj', subject, verb, rights), count)`` tuples, most
    frequent first, where ``rights`` is a tuple with the texts of the
    subject token's right-hand children.

  Raises:
    requests.HTTPError: If the server answers with an error status.
  """
  page = requests.get(url, timeout=60)
  # Fail loudly on 4xx/5xx instead of silently parsing an error page.
  page.raise_for_status()

  # Normalise line endings before the paragraph filter. Splitting on CRLF
  # pairs only would treat an LF-only file as one single paragraph, which is
  # then discarded in full as soon as it mentions "gutenberg" anywhere.
  text = page.text.replace('\r\n', '\n').replace('\r', '\n')
  paragraphs = [para for para in text.split('\n\n') if 'gutenberg' not in para.lower()]
  clean_text = '\n\n'.join(paragraphs).split('START: FULL LICENSE')[0]
  # Blank lines contain no tokens, so do not spend pipeline time on them.
  lines = [line for line in clean_text.split('\n') if line.strip()]

  results = []
  # nlp.pipe processes the lines as a stream and yields one Doc per line, in order.
  for doc in tqdm(nlp.pipe(lines), total=len(lines)):
    for token in doc:
      if token.dep_ == 'nsubj' and token.pos_ == 'PROPN' and token.head.pos_ == 'VERB':
        # The right-hand children go in as a tuple: Counter keys must be
        # hashable, and a list inside the key raises TypeError.
        rights = tuple(child.text for child in token.rights)
        results.append(('nsubj', token.text, token.head.text, rights))

  return Counter(results).most_common()

# Try the subject/verb extractor on a single book URL; this rebinds `test`.
test = extract_root_pairs('https://www.gutenberg.org/files/161/161-0.txt')

In [None]:
# Display whatever `test` currently holds (the result of the most recent extraction call).
test

[(('amod', 'old', 'gentleman'), 108),
 (('amod', 'old', 'lady'), 65),
 (('amod', 'young', 'lady'), 56),
 (('amod', 'young', 'man'), 33),
 (('amod', 'old', 'man'), 33),
 (('amod', 'young', 'gentleman'), 28),
 (('amod', 'old', 'woman'), 20),
 (('amod', 'great', 'deal'), 20),
 (('amod', 'white', 'waistcoat'), 19),
 (('amod', 'public', 'house'), 19),
 (('amod', 'long', 'time'), 18),
 (('amod', 'cocked', 'hat'), 18),
 (('amod', 'young', 'woman'), 17),
 (('amod', 'low', 'voice'), 16),
 (('amod', 'few', 'words'), 14),
 (('amod', 'right', 'hand'), 13),
 (('amod', 'poor', 'boy'), 12),
 (('amod', 'next', 'morning'), 11),
 (('amod', 'little', 'room'), 11),
 (('amod', 'next', 'day'), 11),
 (('amod', 'dear', 'lady'), 11),
 (('amod', 'first', 'time'), 10),
 (('amod', 'good', 'deal'), 10),
 (('amod', 'great', 'number'), 10),
 (('amod', 'same', 'time'), 10),
 (('amod', 'young', 'friend'), 10),
 (('amod', 'last', 'night'), 9),
 (('amod', 'old', 'women'), 9),
 (('amod', 'great', 'coat'), 9),
 (('amod', 

In [None]:
def create_df(pos,ents,top_k=30):
  """Combine counted POS and entity items into one table.

  Args:
    pos: Sequence of ``((text, type), count)`` items for parts of speech.
    ents: Sequence of ``((text, type), count)`` items for named entities.
    top_k: Number of leading items taken from each input before filtering.

  Returns:
    DataFrame with columns ``text``, ``type``, ``count`` and ``category``
    ('POS' rows first, then 'ENT' rows), without rows whose type is PUNCT,
    PART or SPACE. Each half keeps its own 0-based index.
  """
  columns = ['text', 'type', 'count', 'category']

  def _top_rows(counted, category):
    # Flatten ((text, type), count) into a row and tag it with its category.
    rows = [[item[0][0], item[0][1], item[1], category] for item in counted]
    return pd.DataFrame(rows[:top_k], columns=columns)

  combined = pd.concat([_top_rows(pos, 'POS'), _top_rows(ents, 'ENT')])
  # The exclusion runs after the top_k cut, so fewer than top_k rows may remain.
  excluded = combined['type'].isin(['PUNCT', 'PART', 'SPACE'])
  return combined[~excluded]

In [None]:
# df = create_df(pos,ents)
# df

In [None]:
def nest_df(df, threshold=2):
  """Group the rows of a ``create_df`` table into nested name/children dicts.

  Args:
    df: DataFrame with ``text``, ``type``, ``count`` and ``category`` columns,
      where ``category`` is 'POS' or 'ENT'.
    threshold: Only rows with ``count > threshold`` (strictly greater) are kept.

  Returns:
    A ``(pos_list, ent_list)`` tuple. Each list holds one
    ``{'name': type, 'children': [{'name': text, 'value': count}, ...]}``
    dict per distinct type of that category, in order of first appearance.
  """
  kept = df[df['count'] > threshold]

  def _nest(category):
    # Restrict to this category first so that a type label occurring in both
    # categories cannot pull the other category's rows into its children.
    subset = kept[kept['category'] == category]
    groups = []
    # unique() keeps first-appearance order, so the output is the same on every run.
    for type_name in subset['type'].unique():
      rows = subset[subset['type'] == type_name]
      children = [
        {'name': text, 'value': int(count)}
        for text, count in zip(rows['text'], rows['count'])
      ]
      groups.append({'name': type_name, 'children': children})
    return groups

  return _nest('POS'), _nest('ENT')

In [None]:
#nest_df(df, threshold=2)

In [None]:
def create_hierarchical_data(lit_df, threshold=0,
                             output_path="/content/drive/MyDrive/NLP_2023/literature_hierarchy.json"):
  """Build a nested period > author > book > word-category tree and save it as JSON.

  Tree layout, outermost first:
    literature
      period
        author
          book
            parts_of_speech / named_entities
              POS types / entity types
                words with their counts

  Args:
    lit_df: DataFrame with ``period``, ``author``, ``title`` and ``url`` columns.
    threshold: Passed to ``nest_df``; only words with ``count > threshold`` are kept.
    output_path: File that receives the JSON dump. Defaults to the Drive
      location this notebook has always written to.

  Returns:
    The nested dict that was written to ``output_path``.

  NOTE(review): ``break_down_text`` is not defined in the visible part of this
  notebook; it is assumed to return ``(pos_counts, entity_counts)`` in the
  shape ``create_df`` expects. Confirm before running.
  """
  data = {'name': 'literature', 'children': []}

  # drop_duplicates keeps first-appearance order, so the tree layout is the
  # same on every run; tolist() yields plain Python values for json.dumps.
  periods = lit_df['period'].drop_duplicates().tolist()

  for period in tqdm(periods):
    period_rows = lit_df[lit_df['period'] == period]
    period_node = {'name': period, 'children': []}
    data['children'].append(period_node)

    for auth in period_rows['author'].drop_duplicates().tolist():
      # Only this author's rows within this period, so an author listed under
      # several periods does not get every book repeated under each of them.
      author_rows = period_rows[period_rows['author'] == auth]
      author_node = {'name': auth, 'children': []}
      period_node['children'].append(author_node)

      # One node per distinct title; the URL comes from that same row rather
      # than from the first row anywhere in the catalogue sharing the title.
      books = author_rows.drop_duplicates(subset='title')
      for title, url in zip(books['title'].tolist(), books['url'].tolist()):
        pos, ents = break_down_text(url)
        df = create_df(pos, ents)
        pos_types, ent_types = nest_df(df, threshold)

        author_node['children'].append({
          'name': title,
          'children': [
            {'name': 'parts_of_speech', 'children': pos_types},
            {'name': 'named_entities', 'children': ent_types},
          ],
        })

  # Serialise before opening the file so a serialisation error cannot leave
  # a truncated file behind.
  dumps = json.dumps(data, indent=4)

  with open(output_path, "w") as outfile:
    outfile.write(dumps)

  return data

# Build and save the hierarchy for the filtered catalogue; threshold=5 is passed
# on to nest_df, which keeps only rows whose count is greater than 5.
create_hierarchical_data(lit_df, threshold=5)
  

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 21%|██▏       | 3546/16557 [10:48<45:41,  4.75it/s][A
 21%|██▏       | 3547/16557 [10:48<46:01,  4.71it/s][A
 21%|██▏       | 3548/16557 [10:48<41:27,  5.23it/s][A
 21%|██▏       | 3549/16557 [10:48<42:55,  5.05it/s][A
 21%|██▏       | 3550/16557 [10:48<44:13,  4.90it/s][A
 21%|██▏       | 3551/16557 [10:49<44:49,  4.84it/s][A
 21%|██▏       | 3552/16557 [10:49<44:59,  4.82it/s][A
 21%|██▏       | 3553/16557 [10:49<45:56,  4.72it/s][A
 21%|██▏       | 3554/16557 [10:49<46:44,  4.64it/s][A
 21%|██▏       | 3555/16557 [10:50<43:37,  4.97it/s][A
 21%|██▏       | 3556/16557 [10:50<41:53,  5.17it/s][A
 21%|██▏       | 3557/16557 [10:50<40:52,  5.30it/s][A
 21%|██▏       | 3558/16557 [10:50<40:26,  5.36it/s][A
 21%|██▏       | 3559/16557 [10:50<40:18,  5.37it/s][A
 22%|██▏       | 3560/16557 [10:50<39:16,  5.52it/s][A
 22%|██▏       | 3561/16557 [10:51<38:24,  5.64it/s][A
 22%|██▏       | 3562/16557 [10:51<34:4