In [1]:
import json
import requests
from pprint import pprint

In [2]:
# Llama2 Inference URI

llm_uri = "http://localhost:11434/api/generate"

INPUT_FILE__REDDIT_MED_QUES='reddit_questions.json'

PROMPT_LLM_SUMMARIZATION_1SENT = f"Please summarize the following into a single line question" \
                                 f"of approximately 20 words or less, with the title and contents"

OUTPUT_CHUNK_SIZE = 100

In [3]:
# Llama2 Data Query Format

data={
    "model": "llama2:7b",
    "prompt":"Why is the sky blue?",
    "format": "json",
    "stream": False,
    "options": {"temperature": 1, "top_p": 0.99, "top_k": 100},    
    }

In [4]:
# Sample Call to Llama2

In [5]:

print(f"Generating a sample Llama2 answer")
response = requests.post(llm_uri, json=data, stream=False)

Generating a sample Llama2 answer


In [6]:
json_data = json.loads(response.text)
pprint(json_data)

{'context': [518,
             25580,
             29962,
             3532,
             14816,
             29903,
             29958,
             5299,
             829,
             14816,
             29903,
             6778,
             13,
             13,
             11008,
             338,
             278,
             14744,
             7254,
             29973,
             518,
             29914,
             25580,
             29962,
             13,
             13,
             1576,
             14744,
             5692,
             7254,
             1363,
             310,
             263,
             27791,
             265,
             2000,
             9596,
             280,
             1141,
             14801,
             292,
             29892,
             607,
             10008,
             746,
             6575,
             4366,
             24395,
             11563,
             29915,
             29879,
             25005,
         

In [7]:
print(json_data['response'])


The sky appears blue because of a phenomenon called Rayleigh scattering, which occurs when sunlight enters Earth's atmosphere. The sunlight encounters tiny molecules of gases in the air, such as nitrogen and oxygen, which scatter the light in all directions.

Rayleigh scattering is a process that occurs when light travels through a medium, such as air or water, and interacts with the tiny particles within it. The shorter wavelengths of light (such as blue and violet) are scattered more than the longer wavelengths (such as red and orange), which is why the sky appears blue.

The reason for this is that the shorter wavelengths of light have a smaller wavelength, which means they are more easily scattered by the tiny molecules in the air. This scattering effect is known as Mie scattering, and it is the same mechanism that gives rise to the blue color of the sky.

In addition to Rayleigh scattering, there are other factors that can contribute to the color of the sky, such as the presence 

In [8]:
# Now lets use it on the medical questions

In [None]:

f = open(INPUT_FILE__REDDIT_MED_QUES)
reddit_questions_file = json.load(f)


chunks = int(len(reddit_questions_file)/OUTPUT_CHUNK_SIZE)
for c in range(chunks):
    chunk_start = c * OUTPUT_CHUNK_SIZE
    chunk_end = chunk_start + OUTPUT_CHUNK_SIZE
    output_file = f"enriched__chunk_{c}.json"
    print(f"\nProcessing chunk {c} going from {chunk_start} to {chunk_end} into file {output_file}", end="")

    chunk_cache = []
    for i,d in enumerate(reddit_questions_file[chunk_start:chunk_end]):
        try:
            enriched_content = d
            if 'title' in d and 'content' in d:
                title=d['title']
                content=d['content']
                #print(f"\tProcessing chunk {c} item {i} with id {d['id']} from forum {d['forum']}")
                print(".", end="")
                data['prompt'] = f"{PROMPT_LLM_SUMMARIZATION_1SENT} {title} {content}"
                response = requests.post(llm_uri, json=data, stream=False)
                response_j = response.json()
                if 'response' in response_j:
                    d['q_summarized_llm'] = response_j['response']
                    chunk_cache.append(d)
        
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(chunk_cache, f, ensure_ascii=False, indent=4)
        except:
            print(f"Error processing chunk {c}")


Processing chunk 0 going from 0 to 100 into file enriched__chunk_0.json....................................................................................................
Processing chunk 1 going from 100 to 200 into file enriched__chunk_1.json....................................................................................................
Processing chunk 2 going from 200 to 300 into file enriched__chunk_2.json..