In [25]:
import os

import pandas as pd
import requests

url = "https://api.perplexity.ai/chat/completions"

def get_prompt(sentence, word_length):
    """Build the few-shot sentence-completion prompt for one incomplete sentence.

    Args:
        sentence: The incomplete sentence; it is appended verbatim after the
            instruction text.
        word_length: Number of words requested for the completion; it is
            interpolated into the instruction text.

    Returns:
        The instruction text with three worked examples, followed by ``sentence``.
    """
    # The template is kept as one literal so the exact whitespace sent to the
    # model (including the indented example lines) stays unchanged.
    few_shot_template = """You are an expert sentence completion bot. I will provide you with incomplete sentences. Your job is to complete these sentences in {word_length} words. Also, the output should just be the remaining part of the sentence and not the entire sentence. I am providing you with a few examples of input and expected output. Example 1: 
    input: The rain was
    output: going to flood the entire city
    Example 2: 
    input: The party was about to end after
    output: the birthday cake was distributed
    Example 3:
    input: Jack fought with him because
    output: he was insecure and jealous
    Now it is your turn, complete this sentence and provide me only the remaining part of the sentence: """

    return few_shot_template.format(word_length=word_length) + sentence

def main(prompt, temperature, top_p, top_k, max_output_tokens, percentage_ai_content, timeout=60):
    """Send one chat-completion request to the Perplexity API and return the raw body.

    Args:
        prompt: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling threshold forwarded to the API.
        top_k: Top-k setting forwarded to the API.
        max_output_tokens: Upper bound on completion tokens. Forwarded as
            ``max_tokens`` when it is not ``None``.
        percentage_ai_content: Accepted so existing callers keep working; it is
            not part of the request.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        str: The response body as text (callers parse it with ``json.loads``).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    payload = {
        "model": "llama-3.1-70b-instruct",
        "messages": [
            {
                "role": "system",
                "content": "Complete this sentence."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": temperature,
        "top_p": top_p,
        # NOTE(review): the search/citation fields below are sent unchanged from
        # the original request; they look relevant only to search-backed models
        # -- confirm whether they are needed for this model.
        "return_citations": True,
        "search_domain_filter": ["perplexity.ai"],
        "return_images": False,
        "return_related_questions": False,
        "search_recency_filter": "month",
        "top_k": top_k,
        "stream": False,
        "presence_penalty": 0,
        "frequency_penalty": 1
    }
    # Callers record max_output_tokens next to every completion, so it has to be
    # part of the request for that record to be truthful.
    if max_output_tokens is not None:
        payload["max_tokens"] = max_output_tokens

    # Read the key from the environment so it never has to be pasted into the
    # notebook; the original placeholder is kept as the fallback value.
    api_key = os.environ.get("PERPLEXITY_API_KEY", "$add_api_key")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # A timeout keeps one stalled request from hanging a long generation loop.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    # Surface auth / rate-limit failures here rather than as a confusing
    # JSON or KeyError in the caller.
    response.raise_for_status()

    return response.text

In [26]:
import pandas as pd
import time
import json

# Load the original data
df = pd.read_csv("../data/test_corpus/test_original.csv")

# Generation settings: passed to main() and recorded next to every completion.
temperature = 0.2
top_p = 0.9
top_k = 0
max_output_tokens = 8192
word_list = ["5", "10", "15", "20"]  # requested completion lengths, in words
percentage_ai_count = "1-2 lines"
# Label written to the 'model' column; it must name the model that main()
# actually requests so the saved dataset is attributed correctly.
model_name = "llama-3.1-70b-instruct"
# Alternative sweeps (currently disabled):
# temperature_list = [0.2, 0.4, 0.6, 0.8, 1]
# top_p_list = [0.8, 0.6, 0.4, 0.2]

# Running total of API requests across all word lengths (no throttling is applied here)
request_count = 0

# Iterate over each requested word length
for word in word_list:
    print(f"Processing word value: {word}")

    # Create a copy of the original dataframe to avoid overwriting
    df_copy = df.copy()

    for count, (index, row) in enumerate(df_copy.iterrows()):
        prompt = get_prompt(row['Xi'], word)  # Get the prompt from 'Xi'

        main_res = main(prompt, temperature, top_p, top_k, max_output_tokens, percentage_ai_count)
        response_g = json.loads(main_res)
        # Replace newlines with spaces and remove double quotes
        response = response_g["choices"][0]["message"]["content"].replace('\n', ' ').replace('"', '')
        print(f"Incomplete Sentence {count}: {row['Xi']}")
        print(f"Complete Sentence {count}: {response}")

        # Update the dataframe with the results
        df_copy.at[index, 'Xj'] = response
        df_copy.at[index, 'model'] = model_name
        df_copy.at[index, 'temperature'] = temperature
        df_copy.at[index, 'top_p'] = top_p
        df_copy.at[index, 'top_k'] = top_k
        df_copy.at[index, 'max_output_tokens'] = max_output_tokens
        # The 'percentage_ai_count' column stores the requested word length.
        df_copy.at[index, 'percentage_ai_count'] = word

        request_count += 1  # Increase the request count

    # Save the dataframe to a CSV specific to this word length
    df_copy.to_csv(f"../data/test_corpus/llama_3.1/ai_generated_content/num_of_words{word}.csv", index=False)
    print(f"Saved CSV for word: {word}")


Processing word value: 5
Incomplete Sentence 0: Thequestionofhowhuman decision-makersdeterminetheb
Complete Sentence 0: est course of action arises
Incomplete Sentence 1: informed by na turalistic p riors . The agent m ig
Complete Sentence 1: hates the new policy changes
Incomplete Sentence 2: Classification problems are central to many applications of machine learning, including
Complete Sentence 2: natural language processing and computer vision
Incomplete Sentence 3: Lifelong learning policies aim to create a skilled workforce capable of adapting to the demands
Complete Sentence 3: of a rapidly changing economy
Incomplete Sentence 4: Bayesian methods are widely used in machine
Complete Sentence 4: learning and artificial intelligence applications
Saved CSV for word: 5
Processing word value: 10
Incomplete Sentence 0: Thequestionofhowhuman decision-makersdeterminetheb
Complete Sentence 0: est course of action is still debated
Incomplete Sentence 1: informed by na turalistic p riors . 

In [19]:
import json

# Collect one model completion per row of `df`, in row order, and echo each
# input/output pair as it arrives.
# NOTE(review): get_prompt is called with a single argument here, while the
# definition visible in this notebook also requires word_length; and
# perplexity_model is not defined in the visible cells -- confirm both before
# re-running from a fresh kernel.
incompleted_sentences = []
for index, row in df.iterrows():
    fragment = row['Xi']
    raw_reply = perplexity_model(get_prompt(fragment))
    # The completion text is the content of the first choice's message.
    incompleted_sent = json.loads(raw_reply)["choices"][0]["message"]["content"]
    incompleted_sentences.append(incompleted_sent)

    print(f"Incomplete : {fragment}")
    print(f'Completed: {incompleted_sent}')

Incomplete : Thequestionofhowhuman decision-makersdeterminetheb
Completed: est course of action in complex situations has puzzled philosophers and scientists for centuries
Incomplete : informed by na turalistic p riors . The agent m ig
Completed: hts have been influenced by the environment and past experiences
Incomplete : Classification problems are central to many applications of machine learning, including
Completed: natural language processing, image recognition, and recommender systems.
Incomplete : Lifelong learning policies aim to create a skilled workforce capable of adapting to the demands
Completed: of a rapidly changing global economy and technological advancements.
Incomplete : Bayesian methods are widely used in machine
Completed: learning and artificial intelligence to make predictions and classify data.


In [20]:
# Attach the collected completions as column 'Xj' and tag every row with the
# model that produced them.
df['Xj'] = incompleted_sentences
df['model'] = 'llama-3.1-70b-instruct'

# index=False matches the other CSVs written by this notebook and avoids an
# extra unnamed index column when the file is read back.
df.to_csv("llama_3.1_dataset.csv", index=False)

# Kept as the last expression so the notebook actually renders the preview.
df.head()

In [21]:
import pandas as pd
import json

# Load the research corpus; the loop below reads the incomplete sentence from column 'Xi'.
df = pd.read_csv('../data/research_corpus/cleaned_Corpus.csv')

checkpoint_interval = 100  # Save after every 100 rows
start_row = 0  # Change if restarting from a certain row

# When resuming, rows before start_row are skipped below, so their completions
# have to come from the checkpoint written after that many rows. Without this,
# the final CSV would be missing 'Xj' for every skipped row.
if start_row > 0:
    checkpoint_path = f"llama_3.1_dataset_checkpoint_{start_row}.csv"
    try:
        df = pd.read_csv(checkpoint_path)
        print(f"Resuming from {checkpoint_path}.")
    except FileNotFoundError:
        print(f"No checkpoint at {checkpoint_path}; rows before {start_row} will have no 'Xj' value.")

# Fail fast, before any API call, with a message that names the columns present.
if 'Xi' not in df.columns:
    raise KeyError(f"Column 'Xi' not found in the loaded data; available columns: {list(df.columns)}")

# Initialize a list to store the LLM responses for the 'Xj' column
incompleted_sentences = []

for index, row in df.iterrows():
    if index < start_row:
        continue  # Skip rows if restarting from a checkpoint

    # Get the prompt from the 'Xi' column
    # NOTE(review): the get_prompt visible in this notebook also requires
    # word_length, and perplexity_model is not defined in the visible cells --
    # confirm both before re-running from a fresh kernel.
    prompt = get_prompt(row['Xi'])

    # Call the model and pull the completion text out of the first choice
    response = perplexity_model(prompt)
    g = json.loads(response)
    incompleted_sent = g["choices"][0]["message"]["content"]

    # Append the LLM response for 'Xj' column
    incompleted_sentences.append(incompleted_sent)
    print(f"Record {index+1}")
    print(f"Incomplete: {row['Xi']}")
    print(f"Completed: {incompleted_sent}")

    # Add response to 'Xj' column and model name to 'model' column
    df.at[index, 'Xj'] = incompleted_sent
    df.at[index, 'model'] = 'llama-3.1-70b-instruct'

    # Save checkpoint after every `checkpoint_interval` rows (index is assumed
    # to be the default 0-based row number produced by read_csv)
    if (index + 1) % checkpoint_interval == 0:
        df.to_csv(f"llama_3.1_dataset_checkpoint_{index+1}.csv", index=False)
        print(f"Checkpoint saved after processing {index+1} rows.")

# Save the final dataframe to CSV
df.to_csv("llama_3.1_dataset_final.csv", index=False)
print("Final dataset saved.")


KeyError: 'Xi'