In [4]:
import numpy as np
import pandas as pd
import random
import spacy
import os

In [5]:
# Load the spaCy pipeline `en_core_web_lg`; its entity recognizer supplies the
# spans that later cells replace.
nlp = spacy.load('en_core_web_lg')

filename = "test"
# Despite the name, `text_dir` is the path of the transcript *file*.
text_dir = f"/content/shared/input/{filename}.txt"
# Pin the encoding so decoding does not depend on the platform's default locale.
with open(text_dir, 'r', encoding="utf-8") as fin:
  text = fin.read()

# Parse the transcript once. The following cells build their frames from `doc`,
# so it has to exist for a fresh-kernel "Run All" to succeed.
doc = nlp(text)

print(text)

Operator Good afternoon and welcome to the BlackBerry first-quarter fiscal year 2024 results conference call. My name is Andrea, and I will be your conference moderator for today's call. During the presentation, all participants will be in a listen-only mode. We will be facilitating a brief question-and-answer session toward the end of the conference. [Operator instructions] As a reminder, this conference is being recorded for replay purposes. I would now like to turn today's call over to Tim Foote, vice president of BlackBerry investor relations. Please go ahead. Tim Foote -- Vice President, Investor Relations Thank you, Andrea. Good afternoon and welcome to BlackBerry's first-quarter 2024 earnings conference call. With me on the call today are Executive Chair and Chief Executive Officer John Chen and Chief Financial Officer Steve Rai. After I read our cautionary note regarding forward-looking statements, John will provide a business update and Steve will review the financial results.

In [8]:
# Token-level view of the parsed transcript: one row per token holding its
# text, its entity label (`ent_type_`), and its `idx` offset.
df = pd.DataFrame(
    [(tok.text, tok.ent_type_, tok.idx) for tok in doc],
    columns=["token", "entity", "idx"],
)

df

Unnamed: 0,token,entity,idx
0,Operator,,0
1,Good,,9
2,afternoon,TIME,14
3,and,,24
4,welcome,,28
...,...,...,...
8660,a,,42309
8661,disclosure,,42311
8662,policy,,42322
8663,.,,42328


In [9]:
# Span-level view: one row per entity in `doc.ents`, with its label, its text,
# and the [start_char, end_char) range it occupies.
df2 = pd.DataFrame(
    [(span.label_, span.text, span.start_char, span.end_char) for span in doc.ents],
    columns=["label", "entity", "start_idx", "end_idx"],
)

df2

Unnamed: 0,label,entity,start_idx,end_idx
0,TIME,afternoon,14,23
1,ORG,the BlackBerry first-quarter fiscal year 2024,39,84
2,PERSON,Andrea,121,127
3,DATE,today,173,178
4,DATE,today,473,478
...,...,...,...,...
442,ORG,The Motley Fool,41910,41925
443,ORG,SEC,42113,42116
444,ORG,Motley Fool,42254,42265
445,ORG,BlackBerry,42277,42287


In [10]:
# How many entity mentions were found under each label.
counts = (
    df2["label"]
    .value_counts()
    .rename_axis("label")
    .reset_index(name="count")
)

# For every label, gather all of its mentions into one list (duplicates are kept).
grouped = df2.groupby("label")["entity"].apply(list).reset_index()

# Attach the counts to the per-label lists: columns are label, entity, count.
choices_df = grouped.merge(counts, on="label")

# Show the resulting replacement pool.
print(choices_df)

          label                                             entity  count
0      CARDINAL  [10, ten, one, 10, IoT, 45 million, two, 640 m...     74
1          DATE  [today, today, first-quarter 2024, today, toda...     88
2         EVENT                                    [Q&A, PoC, PoC]      3
3           GPE  [U.S., China, Japan, Korea, Michigan, AWS, U.S...     10
4           LOC  [North America, Europe, North America, North A...      5
5         MONEY  [as much as 900 million, 0.06, $750 billion, $...      4
6          NORP                             [Canadian, SOP, Asian]      3
7       ORDINAL  [first, second, first, second, first, First, f...     14
8           ORG  [the BlackBerry first-quarter fiscal year 2024...     92
9       PERCENT  [80%, the 18% to 22%, 9%, 6%, 14%, 37%, 37%, 9...     14
10       PERSON  [Andrea, Tim Foote, Tim Foote, Andrea, John Ch...    121
11      PRODUCT  [EV, ADAS, Cylance, Cylance, Cylance, Cylance,...     11
12         TIME  [afternoon, afternoon

In [11]:
def randomizer(label):
    """Pick one replacement entity at random for ``label``.

    Looks up the first row of the module-level ``choices_df`` whose ``"label"``
    equals ``label`` and draws a single element from that row's ``"entity"``
    list with ``np.random.choice``. The list is used as-is, so an entry that
    appears several times is proportionally more likely to be drawn.

    Parameters
    ----------
    label : str
        Entity label to look up in ``choices_df["label"]``.

    Returns
    -------
    The drawn element, as returned by ``np.random.choice``.

    Raises
    ------
    IndexError
        If no row of ``choices_df`` carries ``label``.
    """
    is_match = choices_df["label"] == label
    # `.iloc[0]` takes the first matching row's list of candidates.
    candidates = choices_df.loc[is_match, "entity"].iloc[0]
    return np.random.choice(candidates)

In [12]:
def generate_text(n):
    """Build ``n`` augmented copies of ``text`` with every entity span replaced.

    For each copy, the rows of the module-level ``df2`` are walked in order.
    The stretch of ``text`` between the previous span's ``end_idx`` and the
    current span's ``start_idx`` is kept verbatim, and the span itself is
    replaced by ``randomizer(label)``. Whatever follows the last span is
    appended unchanged.

    The rows of ``df2`` are taken in the order given; nothing here sorts them
    or checks for overlaps.

    This function deliberately does not write into the token frame ``df``.
    The result is assembled from ``text`` alone, and ``df`` is rebound to the
    transcript frame later in the notebook, so touching it here would make
    this cell fail when re-run.

    Parameters
    ----------
    n : int
        Number of augmented texts to produce.

    Returns
    -------
    list of str
        ``n`` independently randomized variants of ``text``; empty if ``n`` is 0.
    """
    # The spans are identical for every copy, so read them out of the frame once.
    spans = list(zip(df2["label"], df2["start_idx"], df2["end_idx"]))

    result = []
    for _ in range(n):
        pieces = []
        prev_end_idx = 0
        for label, start_idx, end_idx in spans:
            # Untouched text before this span, then the random replacement.
            pieces.append(text[prev_end_idx:start_idx])
            pieces.append(randomizer(label))
            prev_end_idx = end_idx
        # Tail of the transcript after the final span.
        pieces.append(text[prev_end_idx:])
        result.append("".join(pieces))
    return result

# Quick check: produce two augmented transcripts and print the raw list.
test = generate_text(n=2)
print(test)

["Operator Good afternoon and welcome to Investor Relations results conference call. My name is John Chen, and I will be your conference moderator for full-year's call. During the presentation, all participants will be in a listen-only mode. We will be facilitating a brief question-and-answer session toward the end of the conference. [Operator instructions] As a reminder, this conference is being recorded for replay purposes. I would now like to turn last year's call over to John Chen, vice president of Canaccord Genuity investor relations. Please go ahead. John Chen -- Vice President, Company Thank you, -- Robert W. Baird. Good afternoon and welcome to Upstream Security's last fiscal year-end earnings conference call. With me on the call the year are Executive Chair and Chief Executive Officer QNX and Chief Financial Officer Autonomo. After I read our cautionary note regarding forward-looking statements, John Chen will provide a business update and Paul Treiber will review the financi

In [13]:
# Produce 100 augmented transcripts and hold them in a one-column frame.
# NOTE(review): this rebinds `df`, which earlier held the per-token table.
augmented_text = generate_text(n=100)
df = pd.DataFrame({"transcript": augmented_text})

df

Unnamed: 0,transcript
0,Operator Good afternoon and welcome to TAM res...
1,Operator Good afternoon and welcome to WEJO re...
2,Operator Good afternoon and welcome to the U.S...
3,Operator Good afternoon and welcome to TCV res...
4,Operator Good 0 minutes and welcome to The Mot...
...,...
95,Operator Good afternoon and welcome to CES res...
96,Operator Good 0 minutes and welcome to the Ger...
97,Operator Good 0 minutes and welcome to Company...
98,Operator Good afternoon and welcome to SEC res...
