<a href="https://colab.research.google.com/github/shriyabi/DataStructures/blob/main/TransformerDataSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import os

def extract_paragraphs(text, min_lines, n_paragraphs, line_split="\n"):
    """
    Extract up to `n_paragraphs` paragraphs from `text`.

    A paragraph is a run of consecutive non-blank lines. Lines are obtained by
    splitting `text` on `line_split`; a line that is empty or whitespace-only
    ends the current paragraph. Only paragraphs with at least `min_lines`
    lines are kept.

    Args:
        text (str): The input text from which paragraphs are to be extracted.
        min_lines (int): The minimum number of lines a paragraph must have to be kept.
        n_paragraphs (int): The maximum number of paragraphs to return.
        line_split (str, optional): The separator used to split the text into lines
            and to re-join the lines of each returned paragraph. Default is '\n'.

    Returns:
        list of str: The qualifying paragraphs in order of appearance. The list is
        empty when `n_paragraphs` is zero or negative, or when nothing qualifies.
    """
    # Nothing was requested. Without this guard the equality-based stop
    # condition would never trigger and every paragraph would be collected.
    if n_paragraphs <= 0:
        return []

    paragraphs = []
    current_paragraph = []

    for line in text.split(line_split):
        if line.strip():
            current_paragraph.append(line)
            continue

        # Blank line: close the paragraph collected so far. The truthiness
        # check keeps consecutive blank lines from producing empty
        # paragraphs when min_lines is 0 or negative.
        if current_paragraph and len(current_paragraph) >= min_lines:
            paragraphs.append(line_split.join(current_paragraph))
            if len(paragraphs) >= n_paragraphs:
                return paragraphs
        current_paragraph = []

    # The text may end without a trailing blank line; flush what is left.
    if current_paragraph and len(current_paragraph) >= min_lines:
        paragraphs.append(line_split.join(current_paragraph))

    return paragraphs

def wc(x):
    """Return the number of whitespace-separated tokens in `x`."""
    tokens = x.split()
    return len(tokens)

In [None]:
# Short Stories
# Builds a JSON list of paragraph snippets from the first `limit` rows of
# stories.csv, one entry per row keyed as short_stories_<row index>.
file_names = ['stories.csv']
list_of_snippets = []
limit = 100
# NOTE(review): the original called an undefined `extract_paragraphs_on_content`;
# `extract_paragraphs` is used instead. These thresholds are assumptions —
# confirm the intended minimum lines and paragraphs per snippet.
min_lines = 1
n_paragraphs = 1

df = pd.read_csv(file_names[0])
# The last column is read as the text to snippet — presumably the story body;
# verify against the CSV schema.
contents = df.iloc[:, -1].to_dict()
for i in range(limit):
    story = contents.get(i)
    # Rows past the end of the file (None) and missing cells (NaN) have no
    # text to split, so they are skipped rather than crashing the run.
    if not isinstance(story, str):
        continue
    content = extract_paragraphs(story, min_lines, n_paragraphs)
    list_of_snippets.append({f'short_stories_{i}': content})
json_result = json.dumps(list_of_snippets)

print(json_result)

In [None]:
#Songs
#df = pd.read_csv(wasabi_songs.csv)
#songs = dcterms:title
#chords = dcterms:chord  # TODO: confirm this is the right field, and find where the verse text is stored
#list_of_snippets = {}
#for i in range(100):
  #song_name = songs.get(i)
  #content = chords.get(i)
  #list_of_snippets.append({str('song_' + i):f'{song_name}','snippet': f'{content}'})
#json_result = json.dumps(list_of_snippets)

In [None]:
# News Articles
# Builds a JSON list of paragraph snippets from up to `limit` .txt files in
# the BBC politics directory, one entry per file.
bbc_dir = '/Users/sbiddala/downloads/bbc/politics'
# os.listdir returns names in arbitrary order; sorting makes the selection of
# the first `limit` files reproducible between runs.
file_names = sorted(file for file in os.listdir(bbc_dir) if file.endswith('.txt'))
list_of_snippets = []
limit = 100
# NOTE(review): the original called an undefined `extract_paragraphs_on_content`;
# `extract_paragraphs` is used instead. These thresholds are assumptions —
# confirm the intended minimum lines and paragraphs per snippet.
min_lines = 1
n_paragraphs = 1

for i, file_name in enumerate(file_names[:limit]):
    file_path = os.path.join(bbc_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        contents = file.read()
    content = extract_paragraphs(contents, min_lines, n_paragraphs)
    list_of_snippets.append({f'new_articles_{i}': content})
json_result = json.dumps(list_of_snippets)

print(json_result)

In [None]:
#Movie Scripts
#df = pd.read_csv(imdb_top_1000.csv)
#contents = df.iloc[:,8].to_dict()
#movie_names = df.iloc[:,2].to_dict()
#list_of_snippets = {}
#for i in range(100):
  #movie_name = movie_names.get(i)
  #content = contents.get(i)
  #val = extract_paragraphs_on_content(content)
  #list_of_snippets.append({str('article_' + i):f'{movie_name}','snippet': f'{val}'})
#json_result = json.dumps(list_of_snippets)

In [None]:
# Op-eds
# Samples up to `limit` articles from MultiOpEd.csv whose word count lies
# between the 25th and 50th percentile, and dumps them as a JSON list.
file_names = ['MultiOpEd.csv']
list_of_snippets = []
limit = 100

df = pd.read_csv(file_names[0])
# Rows without article text cannot be word-counted, so drop them up front.
df = df.dropna(subset=['articles'])

# Word count per article, computed once for the whole frame.
df['wc'] = df['articles'].apply(wc)

# Percentile bounds via pandas (linear interpolation, the same default as
# numpy.percentile), so no numpy import is needed.
per_25 = df['wc'].quantile(0.25)
per_50 = df['wc'].quantile(0.50)
valid_articles = df[df['wc'].between(per_25, per_50)]  # inclusive on both ends

# Cap the sample size so a small dataset does not raise; the fixed seed keeps
# the selection reproducible.
sample = valid_articles.sample(min(limit, len(valid_articles)), random_state=42)
for i, article in enumerate(sample['articles']):
    # Store the article text itself: a DataFrame is not JSON-serializable.
    list_of_snippets.append({f'op_eds_{i}': article})
json_result = json.dumps(list_of_snippets)

print(json_result)