In [1]:
import pandas as pd
import re
import pypdf

def clean_text(text):
    # Remove numbers that appear randomly using regular expressions
    cleaned_text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numbers
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple whitespaces with a single whitespace
    cleaned_text = cleaned_text.strip()  # Remove leading/trailing whitespaces
    return cleaned_text

def parse_pdf_paragraphs(pdf_path):
    paragraphs = []
    with open(pdf_path, "rb") as file:
        reader = pypdf.PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text()
            page_paragraphs = text.split("\n\n")
            paragraphs.extend(page_paragraphs)
    return paragraphs

def split_into_sections(paragraphs, section_size):
    sections = []
    current_section = ""
    for i, paragraph in enumerate(paragraphs):
        cleaned_paragraph = clean_text(paragraph)
        current_section += cleaned_paragraph + " "
        if (i + 1) % section_size == 0:
            sections.append(current_section.strip())
            current_section = ""
    if current_section:
        sections.append(current_section.strip())
    return sections

# Example usage
pdf_file_path = "sample-pdf.pdf"
paragraphs = parse_pdf_paragraphs(pdf_file_path)
section_size = 3
parsed_sections = split_into_sections(paragraphs, section_size)

# Convert sections to DataFrame
data = {
    "Section Number": range(len(parsed_sections)),
    "Content": parsed_sections
}
df = pd.DataFrame(data)
df["Section Number"] = "section" + df["Section Number"].astype(str)

# Display the DataFrame
df.head()

Unnamed: 0,Section Number,Content
0,section0,NBER WORKING PAPER SERIES THE IMPACT OF COVID-...
1,section1,Most major industries faced large drops in the...
2,section2,unprecedented . Conditioning on or more hours ...
3,section3,Latinx business owners drops from percent to p...
4,section4,"References Alexander W. Bartik , Marianne Bert..."


In [2]:
df["Content"][1]

'Most major industries faced large drops in the number of business owners with the only exception being agriculture. Construction, restaurants, hotels and transportation all faced large declines in the number of business owners due to COVID -. Simulations reveal that the concentrations of female, black, Latinx and Asian businesses in industries hit hard by the pandemic contributed to why losses were higher for these groups than the national average loss. Overall, these first estimates of impacts of COVID - on small business es from the April CPS indicate that losses were spread across demographic groups and types of business – no group was immune to negative impacts of social distancing policy mandates and demand shifts. These results build on the findi ngs from a few previous studies of the early effects of the coronavirus on small businesses. Employer business applications as measured by the U.S. Census weekly Business Formation Statistics fell in the five weeks f rom mid -March to m

In [3]:
type(df["Section Number"][0])

str

In [4]:
df.to_csv("paper.csv", index=False)

In [35]:
!pip install featureform==1.9.5 -U

Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement featureform==1.9.5 (from versions: 0.0.1, 0.0.7, 0.0.8, 0.0.9, 0.0.11, 0.0.12, 0.0.13, 0.0.14rc1, 0.0.14, 0.0.15, 0.0.17, 0.0.18, 0.0.19, 0.0.21, 0.0.22, 1.0.0, 1.0.1, 1.0.2, 1.0.3, 1.0.4, 1.0.5, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.1.4, 1.1.5, 1.1.6, 1.1.7, 1.1.8, 1.1.9, 1.1.10, 1.1.11rc0, 1.1.11rc1, 1.1.11rc2, 1.1.11, 1.1.12, 1.1.13rc0, 1.2.0rc0, 1.2.0rc1, 1.2.0, 1.2.1rc0, 1.2.1, 1.2.2, 1.2.3, 1.2.4, 1.2.5rc0, 1.2.5, 1.2.6rc0, 1.2.6rc1, 1.2.6, 1.3.0rc0, 1.3.0, 1.3.1rc0, 1.3.1, 1.3.2rc0, 1.3.2, 1.3.3, 1.4.0rc0, 1.4.0rc1, 1.4.0, 1.4.1rc0, 1.4.1, 1.4.2, 1.4.3, 1.4.4, 1.4.5rc0, 1.4.5, 1.4.6, 1.4.7rc0, 1.4.7rc1, 1.4.7rc2, 1.4.7rc3, 1.5.0, 1.5.1, 1.5.2, 1.5.3, 1.6.0rc0, 1.6.0, 1.6.1rc0, 1.6.1rc1, 1.6.1, 1.6.2, 1.6.3, 1.6.4, 1.7.0, 1.7.1, 1.7.2rc0, 1.7.2, 1.7.3rc0, 1.7.3rc1, 1.7.3rc2, 1.7.3rc3, 1.7.3rc4, 1.7.3rc5, 1.7.3, 1.7.4, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.9.2rc0, 1.9.2rc1, 1.9.2rc2, 1.9.2rc3, 1.9.2rc4, 1.9.2rc5, 1.9.2rc6, 1.9.2rc7, 1

In [36]:
import featureform as ff
from featureform import local

client = ff.Client(local=True)

In [6]:
ff.register_user("featureformer").make_default_owner()

local = ff.register_local()

paper = local.register_file(
    name="paper",
    variant="fix_v4",
    description="A dataset of paper sections",
    path="paper.csv"
)

In [7]:
paper_df = client.dataframe(paper)
paper_df.head()

Applying Run: affectionate_visvesvaraya
Resource provider already registered.
Creating user default_user 
Creating user featureformer 
Creating provider local-mode 
Creating source paper  fix_v4


Unnamed: 0,Section Number,Content
0,section0,NBER WORKING PAPER SERIES THE IMPACT OF COVID-...
1,section1,Most major industries faced large drops in the...
2,section2,unprecedented . Conditioning on or more hours ...
3,section3,Latinx business owners drops from percent to p...
4,section4,"References Alexander W. Bartik , Marianne Bert..."


In [8]:
#feature transformation
@local.df_transformation(inputs=[paper])
def get_section_length(paper_df):
    """the average transaction amount for a user """
    paper_df["Section Length"] = paper_df["Content"].apply(lambda x: len(x))
    return paper_df

In [9]:
full_df = client.dataframe(get_section_length)
full_df.head()

Applying Run: affectionate_visvesvaraya
Creating provider local-mode 
Creating source get_section_length  affectionate_visvesvaraya


Unnamed: 0,Section Number,Content,Section Length
0,section0,NBER WORKING PAPER SERIES THE IMPACT OF COVID-...,5229
1,section1,Most major industries faced large drops in the...,7336
2,section2,unprecedented . Conditioning on or more hours ...,7121
3,section3,Latinx business owners drops from percent to p...,5635
4,section4,"References Alexander W. Bartik , Marianne Bert...",3803


In [10]:
@local.df_transformation(inputs=[get_section_length])
def vectorize_sections(full_df):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(full_df["Content"].tolist())
    full_df["Vector"] = embeddings.tolist()
    
    return full_df

In [11]:
PINECONE_API_KEY = '4628339c-b376-4efc-8190-fd52198e3e9b'
PINECONE_ENVIRONMENT = 'us-west1-gcp-free'
PINECONE_PROJECT_ID = 'e269605'

pinecone = ff.register_pinecone(
    name="pinecone_v2",
    project_id = PINECONE_PROJECT_ID,
    environment = PINECONE_ENVIRONMENT,
    api_key = PINECONE_API_KEY
)

In [12]:
client.apply()

Applying Run: affectionate_visvesvaraya
Creating provider local-mode 
Creating provider pinecone_v2 
Creating source vectorize_sections  affectionate_visvesvaraya


In [81]:
@ff.entity
class Section:
    section_embeddings = ff.Embedding(
        vectorize_sections[["Section Number", "Vector"]],
        dims=384,
        vector_db=pinecone,
        description="Embeddings created from sections of a paper",
        variant="v6"
    )
    sections = ff.Feature(
        get_section_length[["Section Number", "Content"]],
        type=ff.String,
        description="Sections and their original content",
        variant="v6"
    )

In [91]:
@ff.ondemand_feature(variant="prototype_v6")
def relevant_sections(client, params, entity):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    search_vector = model.encode(params["query"])
    print(len(search_vector))
    res = client.nearest("section_embeddings", "v5", search_vector, k=1)
    return res

In [92]:
query_topic = 'Small Business'

In [93]:
client.apply()
client.features([("relevant_sections", "prototype_v6")], {}, params={"query": query_topic})

Applying Run: affectionate_visvesvaraya
Creating provider local-mode 
Creating ondemand_feature relevant_sections  prototype_v6
384




array([['section4']], dtype='<U8')

In [96]:
@ff.ondemand_feature(variant="test2")
def contextualized_prompt(client, params, entity):
    pks = client.features([("relevant_sections", "prototype_v6")], {}, params=params)
    prompt = "Use the following snippets from the paper to answer the following question\n"
    for pk in pks[0]:
        prompt += "```"
        prompt += client.features([("sections", "v5")], {"section": pk})[0]
        prompt += "```\n"
    prompt += "Question: "
    prompt += params["query"]
    prompt += "?"
    return prompt

In [98]:
client.apply()
client.features([("contextualized_prompt", "test2")], {}, params={"query": query_topic})

Applying Run: affectionate_visvesvaraya
Creating provider local-mode 




384




array(['Use the following snippets from the paper to answer the following question\n```References Alexander W. Bartik , Marianne Bertrand , Zoë B. Cullen , Edward L. Glaeser , Michael Luca , and Christopher T. Stanton. . “ How Are Small Businesses Adjusting to COVID- ? Early Evidence from a Sur vey. NBER W orking Paper No. w26989. Bohn, Sarah, Marisol Cuellar Mejia, and Julien Lafortune. . “The Economic Toll of COVID- on Small Business,” Public Policy Institute of California. Boston, Thomas D. . “Generating Jobs Through African American Business Development”, in J. Whitehead and C. Harris, eds . Readings in Black Political Economy (Dubuque: Kendall - Hunt) . Boston, Thomas D. . “The Role of Black -owned Businesses in Black Community Development” ed. Paul Ong, Jobs and Economic Development in Minority Communities: Realities, Challenges, and Innovation. Temple University Press Bradford, William D. . "The Wealth Dynamics of Entrepreneurship for Black and White Families in the U.S.," Revie

In [None]:
"""client.apply()
q = "What should I know about MLOps testing"
prompt = client.features([("contextualized_prompt", "calhack")], {}, params={"query": q})[0]
import openai
openai.organization = os.getenv("OPENAI_ORG", "")
openai.api_key = os.getenv("OPENAI_KEY", "")
print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])"""

In [128]:
import openai
openai.api_key = "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"

In [129]:
def get_question(class_name, topic, num_q):
  question_prompt = "Acting as a friend in my college class called " + class_name + ", " +\
  "engage with me in constructive discussion and exmplain in simple and understandable terms what " + topic +\
  " is in two sentences, and generate " + str(num_q) + " short answer question" + ("s" if num_q > 1 else "") + " to test my understanding."
  tone_prompt = "Use a friendly tone and stimulate my intellectual curiosity."
  formatting_prompt = '''
  Return the response in the following JSON format (only return JSON and do not put any plain text in the response):
    {
      "summary": <the summary>,
      "questions" : [
        <question1>,
        <question2>,
      ]
    }
    '''
  return (question_prompt + "\n" + tone_prompt, formatting_prompt)

In [101]:
def get_answer(R, answers):
  intro_prompt = "Given that you just asked the user these questions: "
  answer_prompt = "And the user responds to each question according to the following:"
  for i, ans in enumerate(answers):
    answer_prompt += "\n"
    answer_prompt += "Answer to question " + str(i) + ": "
    answer_prompt += '"' + ans + '"'
  generate_prompt = "Critically evaluate and break down the user's response in a constructive manner. For the correctness rating put either 'Correct', 'Partially Correct', or 'Incorrect'. Do not just give the right answer. Try to explore further implications of the question but keep responses to five sentences or fewer."
  tone_prompt = '''Speak from a second person perspective, address to the user as 'you' and the user's response as 'your response', and ensure your responses to each question are contained in each 'response' field.
  Don't speak in a robotic tone, speak like a friendly tutor who has a secret crush on you and be VERY conversational.
  '''
  format_prompt = '''Return the response in the following JSON format (only return JSON and do not put any plain text in the response):
  {"answers": [
    {
      "rating": <put the correctness rating to the first question here>
      "explanation": <put the response to the first question here>
    },
    {
      "rating": <put the correctness rating to the second question here>
      "explanation": <put the response to the second question here>
    }
  ]
  }'''
  prompt = intro_prompt + '\n"' + R + '"\n' + answer_prompt + "\n" + generate_prompt + '\n' + tone_prompt + '\n' + format_prompt
  return prompt


In [130]:
Q, F = get_question("Business", "Impact of COVID on Small Businesses", 3)

In [131]:
question_prompt = Q + F

In [132]:
question_prompt

'Acting as a friend in my college class called Business, engage with me in constructive discussion and exmplain in simple and understandable terms what Impact of COVID on Small Businesses is in two sentences, and generate 3 short answer questions to test my understanding.\nUse a friendly tone and stimulate my intellectual curiosity.\n  Return the response in the following JSON format (only return JSON and do not put any plain text in the response):\n    {\n      "summary": <the summary>,\n      "questions" : [\n        <question1>,\n        <question2>,\n      ]\n    }\n    '

In [133]:
client.apply()
q = question_prompt
prompt = client.features([("contextualized_prompt", "test2")], {}, params={"query": q})[0]
print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])

Applying Run: affectionate_visvesvaraya
Creating provider local-mode 
384





{
"summary": "The Impact of Covid-19 on small businesses has been severe. The number of active business owners in the US has plummeted by millions or percent over the period of February to April, and losses have been felt across nearly all industries, including those owned by African-Americans, Latinx, Asians, immigrants and women.",
"questions": [
"What data is used in the paper to analyzed the impact of Covid-19 on small businesses?",
"What were some of the major losses experienced by small business owners?",
"What are some of the implications of the early-stage losses for job losses and economic inequality?"
]
}
