In [1]:
import pandas as pd

In [2]:
import pandas as pd
import re
import pypdf

def clean_text(text):
    """Strip standalone digit runs from ``text`` and normalise its whitespace.

    A run of digits is removed only when it has a word boundary on both
    sides, so the ``19`` in ``COVID-19`` disappears while ``abc123`` is kept.
    Every whitespace run is then collapsed to one space and the result is
    trimmed at both ends.
    """
    # Substitutions are applied in order: numbers first, whitespace second,
    # so the gaps left by removed numbers are collapsed as well.
    for pattern, replacement in ((r'\b\d+\b', ''), (r'\s+', ' ')):
        text = re.sub(pattern, replacement, text)
    return text.strip()

def parse_pdf_paragraphs(pdf_path):
    """Extract the text of every page of a PDF and split it into paragraphs.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A flat list of strings, in page order, obtained by splitting each
        page's extracted text on double newlines.
    """
    collected = []
    with open(pdf_path, "rb") as pdf_handle:
        # Pages are consumed while the file is still open.
        for page in pypdf.PdfReader(pdf_handle).pages:
            collected += page.extract_text().split("\n\n")
    return collected

def split_into_sections(paragraphs, section_size):
    """Clean the paragraphs and merge them into sections of ``section_size``.

    Args:
        paragraphs: Iterable of raw paragraph strings.
        section_size: Number of consecutive paragraphs merged into one section.

    Returns:
        A list of section strings. Each one is the space-joined, stripped text
        of its cleaned paragraphs; the last section may hold fewer paragraphs.
    """
    sections = []
    pending = []
    for count, paragraph in enumerate(paragraphs, start=1):
        pending.append(clean_text(paragraph))
        # Close the section every time a full group has been collected.
        if count % section_size == 0:
            sections.append(" ".join(pending).strip())
            pending = []
    # Whatever is left over forms a final, shorter section.
    if pending:
        sections.append(" ".join(pending).strip())
    return sections

# Example usage: parse the sample PDF and merge every three paragraphs into a section
pdf_file_path = "sample-pdf.pdf"
section_size = 3
paragraphs = parse_pdf_paragraphs(pdf_file_path)
parsed_sections = split_into_sections(paragraphs, section_size)

# One row per section; "Section Number" is the section's zero-based position
data = {"Section Number": range(len(parsed_sections)), "Content": parsed_sections}
df = pd.DataFrame(data)

# Show the resulting table
print(df)

   Section Number                                            Content
0               0  NBER WORKING PAPER SERIES THE IMPACT OF COVID-...
1               1  Most major industries faced large drops in the...
2               2  unprecedented . Conditioning on or more hours ...
3               3  Latinx business owners drops from percent to p...
4               4  References Alexander W. Bartik , Marianne Bert...
5               5  Feb. Apr Gr oup Number Number Number % Change ...
6               6  ,,,,,,,,,,,,,,,,,, Worked in Survey Week Worke...
7               7  ,,,,,,,,, African-American Latinx AsianNumber ...


In [3]:
# Inspect the full text of the section labelled 1 with a single label-based lookup.
df.loc[1, "Content"]

'Most major industries faced large drops in the number of business owners with the only exception being agriculture. Construction, restaurants, hotels and transportation all faced large declines in the number of business owners due to COVID -. Simulations reveal that the concentrations of female, black, Latinx and Asian businesses in industries hit hard by the pandemic contributed to why losses were higher for these groups than the national average loss. Overall, these first estimates of impacts of COVID - on small business es from the April CPS indicate that losses were spread across demographic groups and types of business – no group was immune to negative impacts of social distancing policy mandates and demand shifts. These results build on the findi ngs from a few previous studies of the early effects of the coronavirus on small businesses. Employer business applications as measured by the U.S. Census weekly Business Formation Statistics fell in the five weeks f rom mid -March to m

In [4]:
# Write the sections to paper.csv without the index column; a later cell
# registers this same path as a Featureform file source.
df.to_csv("paper.csv", index=False)

In [5]:
import featureform as ff
from featureform import local

# Featureform client used by the remaining cells; constructed with local=True.
client = ff.Client(local=True)



In [6]:
# Register the "featureformer" user and make it the default owner of what follows.
ff.register_user("featureformer").make_default_owner()

# NOTE: this rebinds `local`, shadowing the `from featureform import local` import.
local = ff.register_local()

# Expose the CSV of paper sections as a file source.
paper = local.register_file(
    path="paper.csv",
    name="paper",
    variant="quickstart",
    description="A dataset of paper sections",
)

In [7]:
# Fetch the registered "paper" source through the client and preview its first rows.
paper_df = client.dataframe(paper)
paper_df.head()

Applying Run: dynamic_lovelace
Resource provider already registered.
Creating user default_user 
Creating user featureformer 
Creating provider local-mode 
Creating source paper  quickstart


Unnamed: 0,Section Number,Content
0,0,NBER WORKING PAPER SERIES THE IMPACT OF COVID-...
1,1,Most major industries faced large drops in the...
2,2,unprecedented . Conditioning on or more hours ...
3,3,Latinx business owners drops from percent to p...
4,4,"References Alexander W. Bartik , Marianne Bert..."


In [9]:
#feature transformation
@local.df_transformation(inputs=[paper])
def get_section_length(paper_df):
    """Add a "Section Length" column with the character count of each section's "Content".

    Args:
        paper_df: Frame of paper sections; must contain a "Content" column.

    Returns:
        A new frame with every input column plus "Section Length". The input
        frame is left unmodified.
    """
    # A missing Content value (NaN) counts as zero-length text instead of
    # making len() fail on a float.
    section_lengths = paper_df["Content"].fillna("").str.len()
    # assign() builds a new frame rather than mutating the argument in place.
    return paper_df.assign(**{"Section Length": section_lengths})

In [10]:
# Fetch the output of the get_section_length transformation and preview its first rows.
full_df = client.dataframe(get_section_length)
full_df.head()

Applying Run: dynamic_lovelace
Creating provider local-mode 
Creating source get_section_length  dynamic_lovelace


Unnamed: 0,Section Number,Content,Section Length
0,0,NBER WORKING PAPER SERIES THE IMPACT OF COVID-...,5229
1,1,Most major industries faced large drops in the...,7336
2,2,unprecedented . Conditioning on or more hours ...,7121
3,3,Latinx business owners drops from percent to p...,5635
4,4,"References Alexander W. Bartik , Marianne Bert...",3803


In [11]:
@local.df_transformation(inputs=[get_section_length])
def vectorize_comments(full_df):
    """Attach a sentence-embedding vector to every section of the paper.

    Args:
        full_df: Output of ``get_section_length``; its "Content" column holds
            the section text to embed.

    Returns:
        The same frame with an added "Vector" column holding one embedding
        (as a plain list of floats) per row.
    """
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    # The section text lives in "Content"; the frame has no "Text" column,
    # so reading "Text" raised KeyError.
    embeddings = model.encode(full_df["Content"].tolist())
    # Store plain Python lists rather than one 2-D array.
    full_df["Vector"] = embeddings.tolist()

    return full_df

In [None]:
# NOTE(review): everything in this cell sits inside a bare triple-quoted string,
# so none of it executes. It refers to names (average_user_transaction,
# transactions) that are not defined anywhere in this notebook -- it looks like
# leftover template code; confirm before deleting.
"""#define feature set and label
@ff.entity
class User:
    avg_transactions = ff.Feature(
        average_user_transaction[["CustomerID", "TransactionAmount"]], # We can optionally include the `timestamp_column` "Timestamp" here
        variant="quickstart",
        type=ff.Float32,
        inference_store=local,
    )
    fraudulent = ff.Label(
        transactions[["CustomerID", "IsFraud"]], variant="quickstart", type=ff.Bool
    )"""

In [None]:
# NOTE(review): this cell is also a bare triple-quoted string and never runs.
# The label and feature it names ("fraudulent", "avg_transactions") are not
# registered by any executable cell in this notebook -- presumably leftover
# template code; confirm before deleting.
"""#define training set
ff.register_training_set(
    "fraud_training", "quickstart",
    label=("fraudulent", "quickstart"),
    features=[("avg_transactions", "quickstart")],
)"""

In [12]:
import os

# The Pinecone API key is read from the environment so that no secret is stored
# in the notebook. NOTE: the key that used to be hardcoded in this cell has been
# exposed and should be revoked and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
# Environment and project id keep their previous values as defaults and can be
# overridden through the environment.
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", 'us-west1-gcp-free')
PINECONE_PROJECT_ID = os.environ.get("PINECONE_PROJECT_ID", 'e269605')

# Register Pinecone as a provider; the Section entity uses it as its vector_db.
pinecone = ff.register_pinecone(
    name="pinecone",
    project_id = PINECONE_PROJECT_ID,
    environment = PINECONE_ENVIRONMENT,
    api_key = PINECONE_API_KEY
)

In [13]:
# Apply the providers, sources and transformations declared so far.
client.apply()

Applying Run: dynamic_lovelace
Creating provider local-mode 
Creating provider pinecone 
Creating source vectorize_comments  dynamic_lovelace


In [14]:
@ff.entity
class Section:
    # Embedding of each section's text, keyed by "Section Number" and stored in
    # the Pinecone provider.
    # dims must equal the length of the vectors produced by vectorize_comments:
    # the "all-MiniLM-L6-v2" model emits 384-dimensional embeddings, not 100.
    section_embeddings = ff.Embedding(
        vectorize_comments[["Section Number", "Vector"]],
        dims=384,
        vector_db=pinecone,
        description="Embeddings created from sections of a paper",
        variant="v1"
    )
    # Original text of each section, keyed by the same "Section Number".
    sections = ff.Feature(
        get_section_length[["Section Number", "Content"]],
        type=ff.String,
        description="Sections and their original content",
        variant="v1"
    )

In [15]:
# Apply again so the Section entity and its two features get registered.
client.apply()

Applying Run: dynamic_lovelace
Creating provider local-mode 
Creating entity section 
Creating feature section_embeddings  v1
Creating feature sections  v1


In [None]:
@ff.ondemand_feature(variant="prototype")
def relevant_comments(client, params, entity):
    """Return the 3 nearest "section_embeddings" (variant "v1") matches for ``params["query"]``.

    The query text is embedded with the "all-MiniLM-L6-v2" sentence-transformer
    and the resulting vector is passed to ``client.nearest``. ``entity`` is unused.
    """
    from sentence_transformers import SentenceTransformer

    query_embedding = SentenceTransformer("all-MiniLM-L6-v2").encode(params["query"])
    return client.nearest("section_embeddings", "v1", query_embedding, k=3)

In [None]:
# Free-text query passed as the "query" parameter of the similarity search in the next cell.
query_topic = 'Small Business'

In [None]:
# Register any pending definitions, then run the similarity search for query_topic.
client.apply()
# The variant must match the one relevant_comments is declared with ("prototype");
# this notebook defines no "calhacks" variant.
client.features([("relevant_comments", "prototype")], {}, params={"query": query_topic})

Applying Run: nervous_noyce
Creating provider local-mode 



[A

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.

In [None]:
@ff.ondemand_feature(variant="prototype2")
def contextualized_prompt(client, params, entity):
    """Build a prompt that quotes the most relevant paper sections for ``params["query"]``.

    The section keys come from the "relevant_comments" on-demand feature
    (variant "prototype"). Each key is resolved to its text through the
    "sections" feature (variant "v1") and wrapped in triple backticks,
    followed by the question itself. ``entity`` is unused.
    """
    nearest_keys = client.features([("relevant_comments", "prototype")], {}, params=params)[0]
    # One fenced snippet per retrieved section, in the order the keys were returned.
    snippets = [
        "```" + client.features([("sections", "v1")], {"section": key})[0] + "```\n"
        for key in nearest_keys
    ]
    header = "Use the following snippets from the paper to answer the following question\n"
    return header + "".join(snippets) + "Question: " + params["query"] + "?"

In [None]:
# Register the contextualized_prompt on-demand feature, then request the prompt
# it generates for a sample question.
# NOTE(review): the recorded output below reports 'relevant_comments:prototype2'
# as missing, while contextualized_prompt as written requests variant
# "prototype" -- the saved output looks stale; re-run to confirm.
client.apply()
client.features([("contextualized_prompt", "prototype2")], {}, params={"query": "enterprise MLOps"})

Applying Run: nervous_noyce
Creating provider local-mode 
Creating ondemand_feature contextualized_prompt  prototype2


FeatureNotFound: Feature 'relevant_comments:prototype2' not found. Verify that the feature is registered.