In [8]:
import requests
import pandas as pd

url = "https://api.perplexity.ai/chat/completions"

def get_prompt(sentence):
    """Build the few-shot completion prompt for one incomplete sentence.

    The fixed instruction text (task description plus three worked
    input/output examples) is followed directly by ``sentence``.

    Args:
        sentence: The incomplete sentence the model should continue.

    Returns:
        The instruction text with ``sentence`` appended at the end.
    """
    # Each entry is one line of the prompt. The leading four spaces and the
    # trailing spaces are part of the text sent to the model, so they are
    # spelled out explicitly here.
    instruction_lines = (
        "You are an expert sentence completion bot. I will provide you with incomplete sentences. Your job is to complete these sentences in 1 or 2 lines. Also, the output should just be the remaining part of the sentence and not the entire sentence. I am providing you with a few examples of input and expected output. Example 1: ",
        "    input: The rain was",
        "    output: going to flood the entire city",
        "    Example 2: ",
        "    input: The party was about to end after",
        "    output: the birthday cake was distributed",
        "    Example 3:",
        "    input: Jack fought with him because",
        "    output: he was insecure and jealous",
        "    Now it is your turn, complete this sentence and provide me only the remaining part of the sentence: ",
    )

    return "\n".join(instruction_lines) + sentence

def perplexity_model(prompt, timeout=60):
    """Send ``prompt`` to the chat-completions endpoint and return the raw reply.

    Uses the module-level ``url`` as the endpoint and the module-level
    ``token`` as the bearer credential.
    NOTE(review): ``token`` is not assigned in the visible part of this
    notebook; it must exist in the kernel before this function is called.

    Args:
        prompt: Text sent as the ``user`` message.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled request would block a batch run forever.

    Returns:
        The response body as text (a JSON document; callers parse it with
        ``json.loads``).

    Raises:
        requests.exceptions.Timeout: The server did not answer within ``timeout``.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
    """
    payload = {
        "model": "llama-3.1-70b-instruct",
        "messages": [
            {
                "role": "system",
                "content": "Complete this sentence."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.2,
        "top_p": 0.9,
        "return_citations": True,
        "search_domain_filter": ["perplexity.ai"],
        "return_images": False,
        "return_related_questions": False,
        "search_recency_filter": "month",
        "top_k": 0,
        "stream": False,
        "presence_penalty": 0,
        "frequency_penalty": 1
    }
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    # Fail here, with the HTTP status in the message, instead of handing an
    # error body back to callers that expect a "choices" list in the JSON.
    response.raise_for_status()

    return response.text

In [38]:
import json

# Ask the model to finish every fragment in df['Xi'], collecting the replies
# in row order and echoing each fragment/completion pair as it arrives.
incompleted_sentences = []
for row_label, record in df.iterrows():
    fragment = record['Xi']
    raw_reply = perplexity_model(get_prompt(fragment))

    # The completion text is the message content of the first returned choice.
    completion = json.loads(raw_reply)["choices"][0]["message"]["content"]
    incompleted_sentences.append(completion)

    print(f"Incomplete : {fragment}")
    print(f'Completed: {completion}')

Incomplete : Today morning I was
Completed: woken up by the sound of birds chirping outside my window
Incomplete : He left the party and
Completed: went home early, feeling unwell and disappointed
Incomplete : After the meeting ended, I
Completed: realized that I had forgotten to discuss the most important topic.
Incomplete : She looked at the clock and
Completed: realized she was already running late for her job interview
Incomplete : Without a second thought, he
Completed: jumped off the cliff and into the unknown.
Incomplete : Walking down the street, they
Completed: stumbled upon a quaint little café that caught their attention
Incomplete : I was about to say something when
Completed: my phone rang and I had to answer it
Incomplete : Before the movie started, we
Completed: managed to grab some popcorn and find our seats.
Incomplete : As soon as the bell rang, they
Completed: rushed out of the classroom, eager to start their summer break.
Incomplete : The moment she opened the door,

In [42]:
# Attach the collected model completions as column Xj.
# The list must contain exactly one entry per row of df, otherwise pandas
# raises a ValueError on assignment.
df['Xj'] = incompleted_sentences

# Record which model produced the completions
df['model'] = 'llama-3.1-70b-instruct'

# index=False keeps the row index out of the file, so reading the CSV back
# does not produce a stray "Unnamed: 0" column.
df.to_csv("llama_3.1_dataset.csv", index=False)

# Kept as the last expression of the cell so the preview is actually rendered.
df.head()

In [10]:
import pandas as pd
import json

# Load the corpus; column 'Xi' holds the incomplete sentence sent to the model.
df = pd.read_csv('../data/research_corpus/cleaned_Corpus.csv')

# Responses collected during this run (one entry per row processed below)
incompleted_sentences = []
checkpoint_interval = 100  # Save after every 100 rows
start_row = 0  # Change if restarting from a certain row

# When restarting, continue from the checkpoint written after `start_row`
# rows. Reloading only the raw corpus would leave 'Xj' empty for every
# skipped row in all later checkpoints and in the final CSV.
if start_row > 0:
    checkpoint_path = f"llama_3.1_dataset_checkpoint_{start_row}.csv"
    try:
        df = pd.read_csv(checkpoint_path)
        print(f"Resuming from {checkpoint_path}.")
    except FileNotFoundError:
        print(f"No checkpoint found at {checkpoint_path}; "
              f"rows before {start_row} will have no 'Xj' value.")

# NOTE: `index` is used as a 0-based row counter, which holds for the default
# RangeIndex produced by pd.read_csv above.
for index, row in df.iterrows():
    if index < start_row:
        continue  # Skip rows if restarting from a checkpoint

    try:
        # Get the prompt from the 'Xi' column
        prompt = get_prompt(row['Xi'])

        # Call the model and pull the completion text out of the JSON reply
        response = perplexity_model(prompt)
        g = json.loads(response)
        incompleted_sent = g["choices"][0]["message"]["content"]
    except (Exception, KeyboardInterrupt):
        # Rows 0..index-1 are done: save them under the same naming scheme as
        # the periodic checkpoints so the run can resume with start_row = index.
        if index > start_row:
            df.to_csv(f"llama_3.1_dataset_checkpoint_{index}.csv", index=False)
            print(f"Stopped at row {index}; progress saved. "
                  f"Set start_row = {index} to resume.")
        raise

    # Append the LLM response for 'Xj' column
    incompleted_sentences.append(incompleted_sent)
    print(f"Record {index+1}")
    print(f"Incomplete: {row['Xi']}")
    print(f"Completed: {incompleted_sent}")

    # Add response to 'Xj' column and model name to 'model' column
    df.at[index, 'Xj'] = incompleted_sent
    df.at[index, 'model'] = 'llama-3.1-70b-instruct'

    # Save checkpoint after every `checkpoint_interval` rows
    if (index + 1) % checkpoint_interval == 0:
        df.to_csv(f"llama_3.1_dataset_checkpoint_{index+1}.csv", index=False)
        print(f"Checkpoint saved after processing {index+1} rows.")

# Save the final dataframe to CSV
df.to_csv("llama_3.1_dataset_final.csv", index=False)
print("Final dataset saved.")


Record 1
Incomplete: Shady man waiting
Completed: outside her apartment building was acting suspiciously
Record 2
Incomplete: People moving by a
Completed: fast-paced city street rarely notice the beauty around them
Record 3
Incomplete: A contact juggler performing
Completed: with glowing balls in a dark room created a mesmerizing display of art and skill.
Record 4
Incomplete: a man is carrying a load
Completed: of heavy boxes up the stairs.
Record 5
Incomplete: Two people ride their yellow bikes
Completed: down the winding coastal road, enjoying the sunset.
Record 6
Incomplete: A city street shows cars moving along the road, a man in black clothing walking,
Completed: and a group of people waiting at the bus stop.
Record 7
Incomplete: Blond lady with apron inquires about
Completed: the availability of fresh vegetables in the market
Record 8
Incomplete: A black-blue insect flying
Completed: around the bright flowers in the garden.
Record 9
Incomplete: An old woman with a mustard sweate