# Text Classification - Augmented Dataset

---

## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Scraping - getting our data
* Splitting - formatting data into datapoints
* Analysis - distribution and size of data
* Data - formatting into Pandas and adding more metadata
* Subset and Save - train/dev/test set and pickling

## $\color{blue}{Preamble:}$
This notebook amends the initial dataset by using different chunking lengths to create more text embeddings, yielding an augmented dataset.


## $\color{blue}{Admin:}$


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive (Colab only) and make the Drive root the working directory;
# every relative path used later in the notebook is resolved from there.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/'

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%%capture
# Install the scraping / chunking dependencies; output is silenced by the %%capture magic above.
# NOTE(review): versions are unpinned, so a fresh run may pull different releases.
!pip install langchain langchain-community bs4 llama-index

In [None]:
from bs4 import BeautifulSoup
import re

## $\color{blue}{Scraping:}$


### $\color{red}{Ulysses:}$


In [None]:

# Load the HTML file (path is relative to the current working directory)
with open('class/data/ulysses_text.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
# `soup` is re-assigned by each book's load cell, so the scraping cell that
# follows must run before the next book is loaded.
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Build one record per Ulysses episode: book title, episode title and flattened text.
ulysses_episodes = []
last_book_title = None

for chapter in soup.find_all('div', class_='chapter'):
    # The closest <h2> before this chapter carries the book title; when none is
    # found the title from the previous iteration is kept.
    preceding_h2 = chapter.find_previous('h2')
    if preceding_h2:
        last_book_title = preceding_h2.get_text(strip=True)

    # Chapters without an <h3> have no episode title and are skipped.
    heading = chapter.find('h3')
    if not heading:
        continue

    # Concatenate all paragraphs (each followed by a space) and turn newlines
    # into spaces. Note: nothing is stripped, only newlines are replaced.
    episode_text = ''.join(p.get_text() + ' ' for p in chapter.find_all('p'))

    ulysses_episodes.append({
        'master': 'Ulysses',
        'book': last_book_title,
        'episode': heading.get_text(strip=True),
        'content': episode_text.replace("\n"," "),
    })

In [None]:
# Sanity check: number of episodes scraped from the Ulysses HTML.
len(ulysses_episodes)

18

-

In [None]:
# Derive parallel per-episode lists (source label, book index, chapter index, text).

# Source label, copied straight from each record.
ulysses_master = [episode['master'] for episode in ulysses_episodes]

# Book index: number of 'I' characters in the book title, minus one.
# (assumes the titles are Roman numerals made only of 'I' — TODO confirm)
ulysses_book = [
    len(re.findall('I', episode['book'])) - 1
    for episode in ulysses_episodes
]

# Chapter index: the numeric characters of the episode title read as an integer, minus one.
ulysses_chapter = [
    int(''.join(ch for ch in episode['episode'] if ch.isnumeric())) - 1
    for episode in ulysses_episodes
]

# Flattened episode text.
ulysses_text = [episode['content'] for episode in ulysses_episodes]

In [None]:
# Sanity check: show the per-episode labels and the number of episode texts.
print(ulysses_master)
print(ulysses_book)
print(ulysses_chapter)
print(len(ulysses_text))

['Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses', 'Ulysses']
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
18


### $\color{red}{Dubliners:}$


In [None]:
# Load the HTML file (path is relative to the current working directory)
with open('class/data/dubliners_text.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
# This overwrites the `soup` of the previously loaded book.
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Build one record per Dubliners story: title from the chapter's <h2>, text from its <p> tags.
dubliners_episodes = []

for chapter in soup.find_all('div', class_='chapter'):
    heading = chapter.find('h2')
    if not heading:
        # No <h2> in this chapter div, so there is no story title to record.
        continue

    # Concatenate all paragraphs (each followed by a space) and turn newlines into spaces.
    story_text = ''.join(p.get_text() + ' ' for p in chapter.find_all('p'))

    dubliners_episodes.append({
        'master': 'Dubliners',
        'book': 'Dubliners',
        'episode': heading.get_text(strip=True),
        'content': story_text.replace("\n"," "),
    })


In [None]:
# Map each story title to a global chapter index that continues after the Ulysses episodes.
dubliners_title = [record['episode'] for record in dubliners_episodes]
dubliners_start = len(ulysses_episodes)
dubliners_inds = list(range(dubliners_start, dubliners_start + len(dubliners_episodes)))
dublin_title = dict(zip(dubliners_title, dubliners_inds))
dublin_title

{'THE SISTERS': 18,
 'AN ENCOUNTER': 19,
 'ARABY': 20,
 'EVELINE': 21,
 'AFTER THE RACE': 22,
 'TWO GALLANTS': 23,
 'THE BOARDING HOUSE': 24,
 'A LITTLE CLOUD': 25,
 'COUNTERPARTS': 26,
 'CLAY': 27,
 'A PAINFUL CASE': 28,
 'IVY DAY IN THE COMMITTEE ROOM': 29,
 'A MOTHER': 30,
 'GRACE': 31,
 'THE DEAD': 32}

In [None]:
# Derive parallel per-story lists (source label, book index, chapter index, text).

# Source label, copied straight from each record.
dubliners_master = [record['master'] for record in dubliners_episodes]

# Every Dubliners story gets book index 3.
dubliners_book = [3] * len(dubliners_episodes)

# Global chapter index, looked up by story title.
dubliners_chapter = [dublin_title[record['episode']] for record in dubliners_episodes]

# Flattened story text.
dubliners_text = [record['content'] for record in dubliners_episodes]

In [None]:
# Sanity check: show the per-story labels and the number of story texts.
print(dubliners_master)
print(dubliners_book)
print(dubliners_chapter)
print(len(dubliners_text))

['Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners', 'Dubliners']
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
15


### $\color{red}{Dracula:}$


In [None]:
# Load the HTML file (path is relative to the current working directory)
with open('class/data/dracula_text.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
# This overwrites the `soup` of the previously loaded book.
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Build one record per Dracula chapter: title from the chapter's <h2>, text from its <p> tags.
dracula_episodes = []

for chapter in soup.find_all('div', class_='chapter'):
    heading = chapter.find('h2')
    if not heading:
        # No <h2> in this chapter div, so there is no chapter title to record.
        continue

    # Chapter divs that contain no <p> at all are left out entirely.
    if not chapter.find('p'):
        continue

    # Concatenate all paragraphs (each followed by a space) and turn newlines into spaces.
    chapter_text = ''.join(p.get_text() + ' ' for p in chapter.find_all('p'))

    dracula_episodes.append({
        'master': 'Dracula',
        'book': 'Dracula',
        'episode': heading.get_text(strip=True),
        'content': chapter_text.replace("\n"," "),
    })

In [None]:
# Sanity check: number of chapter records scraped from the Dracula HTML.
len(dracula_episodes)

28

In [None]:
# Drop the last scraped record (presumably a non-chapter section — confirm against the HTML).
# NOTE(review): this reassigns `dracula_episodes` from itself, so re-running the
# cell removes one more record each time; re-run the scraping cell first.
dracula_episodes = dracula_episodes[:-1]

In [None]:
# Map each chapter title to a global chapter index that continues after Ulysses and Dubliners.
dracula_title = [record['episode'] for record in dracula_episodes]
dracula_start = len(ulysses_episodes) + len(dubliners_episodes)
dracula_inds = list(range(dracula_start, dracula_start + len(dracula_episodes)))
drac_title = dict(zip(dracula_title, dracula_inds))
drac_title

{'CHAPTER IJONATHAN HARKER’S JOURNAL': 33,
 'CHAPTER IIJONATHAN HARKER’S JOURNAL—continued': 34,
 'CHAPTER IIIJONATHAN HARKER’S JOURNAL—continued': 35,
 'CHAPTER IVJONATHAN HARKER’S JOURNAL—continued': 36,
 'CHAPTER V': 37,
 'CHAPTER VIMINA MURRAY’S JOURNAL': 38,
 'CHAPTER VIICUTTING FROM “THE DAILYGRAPH,” 8 AUGUST': 39,
 'CHAPTER VIIIMINA MURRAY’S JOURNAL': 40,
 'CHAPTER IX': 41,
 'CHAPTER X': 42,
 'CHAPTER XI': 43,
 'CHAPTER XIIDR. SEWARD’S DIARY': 44,
 'CHAPTER XIIIDR. SEWARD’S DIARY—continued.': 45,
 'CHAPTER XIVMINA HARKER’S JOURNAL': 46,
 'CHAPTER XVDR. SEWARD’S DIARY—continued.': 47,
 'CHAPTER XVIDR. SEWARD’S DIARY—continued': 48,
 'CHAPTER XVIIDR. SEWARD’S DIARY—continued': 49,
 'CHAPTER XVIIIDR. SEWARD’S DIARY': 50,
 'CHAPTER XIXJONATHAN HARKER’S JOURNAL': 51,
 'CHAPTER XXJONATHAN HARKER’S JOURNAL': 52,
 'CHAPTER XXIDR. SEWARD’S DIARY': 53,
 'CHAPTER XXIIJONATHAN HARKER’S JOURNAL': 54,
 'CHAPTER XXIIIDR. SEWARD’S DIARY': 55,
 'CHAPTER XXIVDR. SEWARD’S PHONOGRAPH DIARY, SPOKEN 

In [None]:
# Derive parallel per-chapter lists (source label, book index, chapter index, text).

# Source label, copied straight from each record.
dracula_master = [record['master'] for record in dracula_episodes]

# Every Dracula chapter gets book index 4.
dracula_book = [4] * len(dracula_episodes)

# Global chapter index, looked up by chapter title.
dracula_chapter = [drac_title[record['episode']] for record in dracula_episodes]

# Flattened chapter text.
dracula_text = [record['content'] for record in dracula_episodes]

In [None]:
# Sanity check: show the per-chapter labels and the number of chapter texts.
print(dracula_master)
print(dracula_book)
print(dracula_chapter)
print(len(dracula_text))

['Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula', 'Dracula']
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
[33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
27


### $\color{red}{Republic:}$


In [None]:
# Load the HTML file (path is relative to the current working directory)
with open('class/data/republic_text.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
# This overwrites the `soup` of the previously loaded book.
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Build one record per Republic section: title from the chapter's <h2>, text from its <p> tags.
republic_episodes = []

for chapter in soup.find_all('div', class_='chapter'):
    heading = chapter.find('h2')
    if not heading:
        # No <h2> in this chapter div, so there is no section title to record.
        continue

    # Chapter divs that contain no <p> at all are left out entirely.
    if not chapter.find('p'):
        continue

    # Concatenate all paragraphs (each followed by a space) and turn newlines into spaces.
    section_text = ''.join(p.get_text() + ' ' for p in chapter.find_all('p'))

    republic_episodes.append({
        'master': 'Republic',
        'book': 'Republic',
        'episode': heading.get_text(strip=True),
        'content': section_text.replace("\n"," "),
    })

In [None]:
# Sanity check: number of section records scraped from the Republic HTML.
len(republic_episodes)

12

In [None]:
# Drop the first two scraped records (presumably front matter before Book I — confirm against the HTML).
# NOTE(review): this reassigns `republic_episodes` from itself, so re-running the
# cell removes two more records each time; re-run the scraping cell first.
republic_episodes = republic_episodes[2:]

In [None]:
# Map each book title to a global chapter index that continues after the three earlier works.
chapter_title = [record['episode'] for record in republic_episodes]
republic_start = len(ulysses_episodes) + len(dubliners_episodes) + len(dracula_episodes)
republic_inds = list(range(republic_start, republic_start + len(republic_episodes)))
republic_title = dict(zip(chapter_title, republic_inds))
republic_title

{'BOOK I.': 60,
 'BOOK II.': 61,
 'BOOK III.': 62,
 'BOOK IV.': 63,
 'BOOK V.': 64,
 'BOOK VI.': 65,
 'BOOK VII.': 66,
 'BOOK VIII.': 67,
 'BOOK IX.': 68,
 'BOOK X.': 69}

In [None]:
# Derive parallel per-book lists (source label, book index, chapter index, text).

# Source label, copied straight from each record.
republic_master = [record['master'] for record in republic_episodes]

# Every Republic book gets book index 5.
republic_book = [5] * len(republic_episodes)

# Global chapter index, looked up by book title.
republic_chapter = [republic_title[record['episode']] for record in republic_episodes]

# Flattened book text.
republic_text = [record['content'] for record in republic_episodes]

In [None]:
# Sanity check: show the per-book labels and the number of book texts.
print(republic_master)
print(republic_book)
print(republic_chapter)
print(len(republic_text))

['Republic', 'Republic', 'Republic', 'Republic', 'Republic', 'Republic', 'Republic', 'Republic', 'Republic', 'Republic']
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
10


In [None]:
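# Original plan for the augmentation
# (NOTE: the chunk sizes listed here differ from the n = 90 used in the Splitting section below.)

# the text is already split into chapters

# split into chunks of 185

# do the train/dev/test split

# loop over the nodes in the training set

# re-split into 120, preserving metadata

# re-split into 60, preserving metadata

# loop over the nodes in the dev and test sets

# re-split into 120

# keep everything together and save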






## $\color{blue}{Splitting:}$


In [None]:
from llama_index.core.node_parser import SentenceSplitter
from langchain.docstore.document import Document

# alter chunk size
# `n` is the chunk size passed to the splitter; per the llama-index docs it is
# measured in tokens — confirm for the installed version.

n = 90

# Module-level splitter used to cut every chapter text into base chunks.
# No overlap between consecutive chunks; '.' is passed as the separator.
splitter = SentenceSplitter(
    chunk_size=n,
    chunk_overlap=0,
    separator='.'
)

In [None]:


def _chunks_to_docs(texts, masters, book_idxs, chapter_idxs):
  """Split each chapter text with the module-level `splitter` and wrap every chunk in a Document.

  Args:
    texts: chapter texts, one string per chapter.
    masters: source-work label for each chapter (parallel to `texts`).
    book_idxs: book index for each chapter (parallel to `texts`).
    chapter_idxs: global chapter index for each chapter (parallel to `texts`).

  Returns:
    A flat list of Documents in chapter order, then chunk order. Each Document
    gets its own metadata dict with keys "master", "book_idx" and "chapter_idx".
  """
  docs = []
  for text, master, book_idx, chapter_idx in zip(texts, masters, book_idxs, chapter_idxs):
    for chunk in splitter.split_text(text):
      docs.append(Document(
          page_content=chunk,
          metadata={"master": master, "book_idx": book_idx, "chapter_idx": chapter_idx},
      ))
  return docs


# One call per work replaces four copy-pasted loops that differed only in variable names.
ulysses_docs = _chunks_to_docs(ulysses_text, ulysses_master, ulysses_book, ulysses_chapter)
dubliners_docs = _chunks_to_docs(dubliners_text, dubliners_master, dubliners_book, dubliners_chapter)
dracula_docs = _chunks_to_docs(dracula_text, dracula_master, dracula_book, dracula_chapter)
republic_docs = _chunks_to_docs(republic_text, republic_master, republic_book, republic_chapter)

# Full corpus, in the fixed order Ulysses -> Dubliners -> Dracula -> Republic.
all_docs = ulysses_docs + dubliners_docs + dracula_docs + republic_docs


In [None]:
# Total number of chunks across the four works; the split sizes used below must fit within it.
len(all_docs)

12246

In [None]:
import numpy as np

# Split sizes. They were chosen for the corpus size recorded above; whatever is
# left after train and dev becomes the test set.
TRAIN_SIZE = 11000
DEV_SIZE = 746

np.random.seed(0)
points = len(all_docs)

# Fail early with a readable message if a different chunk size shrank the corpus.
if points < TRAIN_SIZE + DEV_SIZE:
    raise ValueError(
        f"corpus has {points} chunks, need at least {TRAIN_SIZE + DEV_SIZE} for train + dev"
    )

# Train indices are drawn first, without replacement.
train_inds = np.random.choice(points, TRAIN_SIZE, replace = False)
other_inds = set(range(points)) - set(train_inds)

# Sort the remaining pool before sampling so the dev/test partition depends only
# on the seed, not on set iteration order.
# NOTE(review): this can assign dev/test differently from pickles produced by the
# unsorted version; the train indices are unaffected.
dev_inds = np.random.choice(sorted(other_inds), DEV_SIZE, replace = False)
test_inds = np.array(sorted(other_inds - set(dev_inds)))

# write a function that will take the training docs
  # it will create a dictionary where the key is a chapter

In [None]:
# Select the documents of each split in ascending index order.
# Iterating the sorted, de-duplicated indices gives the same lists as testing
# `i in <index array>` for every i, without rescanning the whole array for each document.
train_docs_long = [all_docs[i] for i in sorted(set(map(int, train_inds)))]
dev_docs_long = [all_docs[i] for i in sorted(set(map(int, dev_inds)))]
test_docs_long = [all_docs[i] for i in sorted(set(map(int, test_inds)))]

In [None]:
# Peek at one base training chunk.
train_docs_long[4].page_content

'He peered sideways up and gave a long slow whistle of call, then paused awhile in rapt attention, his even white teeth glistening here and there with gold points. Chrysostomos. Two strong shrill whistles answered through the calm.   —Thanks, old chap, he cried briskly. That will do nicely. Switch off the current, will you?'

In [None]:
def find_index(s):
    """Return `s` cut off right after its second-to-last punctuation mark.

    Punctuation marks are '.', '!', '?' and ','. Despite the name, the return
    value is the truncated string, not an index.

    Args:
        s: text to shorten.

    Returns:
        The prefix of `s` up to and including the penultimate punctuation mark,
        or None when `s` contains fewer than two punctuation marks.
    """
    positions = [match.start() for match in re.finditer(r'[\.!\?,]', s)]
    if len(positions) >= 2:
        return s[:positions[-2] + 1]
    return None

def short_mid_splitter(docs, splits):
  """Optionally augment `docs` with shortened copies of each document.

  Args:
    docs: list of Documents.
    splits: only its length is used. A single element returns `docs` unchanged;
      any other length triggers augmentation.

  Returns:
    `docs` itself when `splits` has exactly one element. Otherwise a new list
    holding the original docs followed by one shortened copy per document for
    which `find_index` returns a non-empty string. Copies reuse the source
    document's metadata object.
  """
  # Single-element `splits`: pass the input through untouched.
  if len(splits) == 1:
    return docs

  shortened_docs = []
  for source in docs:
    shortened = find_index(source.page_content)
    if shortened:
      shortened_docs.append(Document(page_content=shortened, metadata=source.metadata))
  return docs + shortened_docs

In [None]:
def resplit_docs(docs, splits):
  """Re-chunk every document once per chunk size listed in `splits`.

  NOTE(review): this function used to be a second definition of
  `short_mid_splitter`. On a top-to-bottom run it silently replaced the
  definition in the previous cell, so the calls below with `[1]` / `[1,1]`
  would have re-chunked with `chunk_size=1`. The recorded outputs match the
  earlier definition, so this variant is kept under its own name.

  Args:
    docs: list of Documents to re-chunk.
    splits: iterable of chunk sizes; one SentenceSplitter is built per size.

  Returns:
    A new flat list of Documents ordered by chunk size, then source document,
    then chunk. Each Document gets its own copy of the source metadata.
  """
  holder = []
  for split in splits:
    chunker = SentenceSplitter(
        chunk_size=split,
        chunk_overlap=0,
        separator='.')

    for source_doc in docs:
      for node in chunker.split_text(source_doc.page_content):
        # Copy the metadata so the new docs do not share one dict with the source.
        holder.append(Document(page_content=node, metadata=dict(source_doc.metadata)))

  return holder


In [None]:
# Build the final splits. With the `short_mid_splitter` defined next to
# `find_index`, a one-element `splits` returns the docs unchanged (dev/test),
# while the two-element list appends shortened copies (train only).
test_docs = short_mid_splitter(test_docs_long, [1])
dev_docs = short_mid_splitter(dev_docs_long, [1])
train_docs  = short_mid_splitter(train_docs_long, [1,1])

In [None]:
# Report the size of each final split.
print(f'train len: {len(train_docs)}')
print(f'dev len: {len(dev_docs)}')
print(f'test len: {len(test_docs)}')

train len: 20474
dev len: 746
test len: 500


In [None]:
# Peek at one document from the final training set.
train_docs[555].page_content

'Mr Dedalus looked after the stumping figure and said mildly:   —The devil break the hasp of your back!   Mr Power, collapsing in laughter, shaded his face from the window as the carriage passed Gray’s statue.   —We have all been there, Martin Cunningham said broadly.   His eyes met Mr Bloom’s eyes. He caressed his beard, adding:   —Well, nearly all of us.'

## $\color{blue}{Data:}$


In [None]:
import pandas as pd
def create_df(docs):
  """Flatten a list of Documents into a DataFrame.

  Args:
    docs: iterable of objects exposing `.metadata` (with keys 'master',
      'book_idx', 'chapter_idx') and `.page_content`.

  Returns:
    A DataFrame with one row per document and the columns
    'master', 'book_idx', 'chapter_idx', 'content', in that order.
  """
  # One column per metadata key, in the fixed output order.
  columns = {
      key: [doc.metadata[key] for doc in docs]
      for key in ('master', 'book_idx', 'chapter_idx')
  }
  columns['content'] = [doc.page_content for doc in docs]
  return pd.DataFrame(columns)

In [None]:
# One DataFrame per split, built from the final document lists.
train_df = create_df(train_docs)
dev_df = create_df(dev_docs)
test_df = create_df(test_docs)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,master,book_idx,chapter_idx,content
0,Ulysses,0,0,"Halted, he peered down the dark winding stairs..."
1,Ulysses,0,0,"Then, catching sight of Stephen Dedalus, he be..."
2,Ulysses,0,0,"Stephen Dedalus, displeased and sleepy, leaned..."
3,Ulysses,0,0,he said sternly. He added in a preacher’s to...
4,Ulysses,0,0,He peered sideways up and gave a long slow whi...


## $\color{blue}{Save:}$


In [None]:
# Persist the three splits as pickled DataFrames (paths are relative to the working directory).
train_df.to_pickle('class/datasets/df_train_augmentation.1')
dev_df.to_pickle('class/datasets/df_dev_augmentation.1')
test_df.to_pickle('class/datasets/df_test_augmentation.1')