In [None]:
import os
from pathlib import Path

def load_text_files(directory):
    """Read every ``*.txt`` file directly inside ``directory`` as UTF-8 text.

    Files are visited in sorted path order so the returned list (and anything
    indexed from it downstream) is the same on every run; the order produced by
    ``Path.glob`` on its own depends on the underlying filesystem.

    Args:
        directory: Folder to scan (``str`` or path-like). Only the top level is
            searched; the ``*.txt`` pattern does not descend into sub-folders.

    Returns:
        list[str]: Full contents of each matching file, one entry per file.
        Empty when no file matches.
    """
    return [
        path.read_text(encoding="utf-8")
        for path in sorted(Path(directory).glob("*.txt"))
    ]

# Load the corpus: one string per .txt file found in the given folder.
# NOTE(review): "/content/" is an absolute, Colab-specific path — adjust when running elsewhere.
docs = load_text_files("/content/")


In [None]:
 import nltk
 # Fetch the 'punkt_tab' tokenizer data; presumably what sent_tokenize needs further down — confirm.
 nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sanity check: number of text files that were loaded into `docs`.
len(docs)

35

In [None]:
from nltk.tokenize import sent_tokenize

def chunk_text(text, max_tokens=200):
    """Split ``text`` into chunks of whole sentences of about ``max_tokens`` words.

    Sentences come from ``sent_tokenize`` and are never split: a sentence that
    would push the current chunk past ``max_tokens`` starts a new chunk
    instead. "Tokens" here are whitespace-separated words, not model tokens.

    Args:
        text: The document to split.
        max_tokens: Word budget per chunk. A single sentence longer than this
            is kept whole, so such a chunk can exceed the budget.

    Returns:
        list[str]: Non-empty chunks, each with its words joined by single
        spaces. Empty when ``text`` contains no words.
    """
    chunks = []
    current = []  # words of the chunk currently being filled

    for sentence in sent_tokenize(text):
        words = sentence.split()
        # Only flush a chunk that actually holds words; otherwise an oversized
        # sentence arriving on an empty buffer would emit an empty "" chunk.
        if current and len(current) + len(words) > max_tokens:
            chunks.append(" ".join(current))
            current = []
        current.extend(words)

    if current:
        chunks.append(" ".join(current))

    return chunks

# Flatten the per-document chunk lists into one corpus-wide list of chunks.
chunked_docs = [piece for doc in docs for piece in chunk_text(doc)]


In [None]:
# Peek at the first chunk to eyeball the chunking result.
chunked_docs[0]

"World War II or the Second World War (1 September 1939 – 2 September 1945) was a global conflict between two coalitions: the Allies and the Axis powers. Nearly all of the world's countries participated, with many nations mobilising all resources in pursuit of total war. Tanks and aircraft played major roles, enabling the strategic bombing of cities and delivery of the first and only nuclear weapons ever used in war. World War II was the deadliest conflict in history, resulting in 70 to 85 million deaths, more than half of which were civilians. Millions died in genocides, including the Holocaust, and by massacres, starvation, and disease. After the Allied victory, Germany, Austria, Japan, and Korea were occupied, and German and Japanese leaders were tried for war crimes. The causes of World War II included unresolved tensions in the aftermath of World War I and the rises of fascism in Europe and militarism in Japan. Key events preceding the war included Japan's invasion of Manchuria in

In [None]:
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Embed every chunk once up front; retrieve_relevant_chunks searches this
# precomputed result for each query instead of re-encoding the corpus.
doc_embeddings = embedder.encode(chunked_docs, convert_to_tensor=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import torch
from sentence_transformers.util import semantic_search

def retrieve_relevant_chunks(query, k=3):
    """Return the text of the corpus chunks that best match ``query``.

    The query is embedded with the module-level ``embedder`` and searched
    against the precomputed ``doc_embeddings`` using ``semantic_search``.

    Args:
        query: Question or search string to look up.
        k: Value passed to ``semantic_search`` as ``top_k``.

    Returns:
        list[str]: The ``chunked_docs`` entries selected by each hit's
        ``corpus_id``, in the order the search returned them.
    """
    query_vector = embedder.encode(query, convert_to_tensor=True)
    per_query_hits = semantic_search(query_vector, doc_embeddings, top_k=k)
    # Only one query was submitted, so its hits are the first (and only) entry.
    top_hits = per_query_hits[0]

    matched_chunks = []
    for hit in top_hits:
        matched_chunks.append(chunked_docs[hit['corpus_id']])
    return matched_chunks


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pretrained "t5-base" model and its tokenizer; both are used as
# module-level globals by answer_question below.
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")

def answer_question(question):
    """Answer ``question`` with t5-base, grounded on retrieved corpus chunks.

    The chunks returned by ``retrieve_relevant_chunks`` are joined into one
    context string, combined with the question into a single prompt, and
    passed to ``t5_model.generate`` with beam search. The retrieved context is
    printed as a side effect so the retrieval step can be inspected.

    Args:
        question: The natural-language question to answer.

    Returns:
        str: The decoded generation, with special tokens removed.
    """
    context = " ".join(retrieve_relevant_chunks(question))
    prompt = f"Answer in detail: question: {question} context: {context}"

    # The prompt is truncated to at most 512 tokens before generation.
    encoded_prompt = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = t5_model.generate(
        encoded_prompt,
        max_length=256,
        num_beams=4,
        early_stopping=True,
    )

    print("Context Retreived: ",context)
    answer = t5_tokenizer.decode(generated[0], skip_special_tokens=True)
    return answer


In [None]:
# Smoke-test the pipeline; answer_question also prints the retrieved context.
print(answer_question("which two cities was the atomic bomb dropped on?"))


Context Retreived:  In February 1943, Germany suffered its first major defeat, surrendering at Stalingrad. Axis forces in North Africa also surrendered, and Soviet advances continued, recapturing Kharkiv and Kiev. Allied bombing raids intensified, targeting German cities. Soviet forces reached Berlin on 21 April 1945. Hitler committed suicide on 30 April, shortly after Mussolini was executed. Germany surrendered on 7 May, and the following day was celebrated as VE Day. With Germany defeated, plans for invading Japan were underway. However, to avoid heavy casualties, U.S. President Harry Truman approved the use of atomic bombs, developed since 1942. On 6 August 1945, Hiroshima was bombed, followed by Nagasaki on 9 August. Unable to withstand the devastation, Japan surrendered on 14 August. With this, World War II came to an end. The mass bombing of cities in Europe and Asia has often been called a war crime, although no positive or specific customary international humanitarian law with 

In [None]:
from  transformers  import  AutoTokenizer, AutoModelWithLMHead, pipeline

# T5 is an encoder-decoder model, so it is loaded through the seq2seq auto
# class; AutoModelWithLMHead is deprecated in transformers in favour of the
# task-specific auto classes.
from transformers import AutoModelForSeq2SeqLM

model_name = "MaRiOrOsSi/t5-base-finetuned-question-answering"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


def answer_question(question):
    """Answer ``question`` with the fine-tuned QA model, using retrieved context.

    Defining this function rebinds the name ``answer_question``, so it takes
    the place of any earlier definition in the kernel. It uses the
    module-level ``tokenizer`` and ``model``. The retrieved context is printed
    as a side effect so the retrieval step can be inspected.

    Args:
        question: The natural-language question to answer.

    Returns:
        str: The decoded generation, with special tokens removed.
    """
    context = " ".join(retrieve_relevant_chunks(question))
    prompt = f"Answer in detail question: {question} context: {context}"

    # Tokenize as a batch of one, truncating the prompt to at most 512 tokens.
    batch = tokenizer(
        [prompt],
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    generated = model.generate(
        input_ids=batch.input_ids,
        attention_mask=batch.attention_mask,
    )
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)

    print("Context Retreived: ",context)
    return answer


# Smoke-test the fine-tuned model; answer_question also prints the retrieved context.
print(answer_question("Which countries were part of the Axis?"))


Context Retreived:  In the Middle East in May, Commonwealth forces quashed an uprising in Iraq which had been supported by German aircraft from bases within Vichy-controlled Syria . Between June and July, British-led forces invaded and occupied the French possessions of Syria and Lebanon , assisted by the Free French . Axis attack on the Soviet Union (1941) Main article: Eastern Front (World War II) European theatre of World War II animation map, 1939–1945 – Red: Western Allies and the Soviet Union after 1941; Green: Soviet Union before 1941; Blue: Axis powers With the situation in Europe and Asia relatively stable, Germany, Japan, and the Soviet Union made preparations for war. With the Soviets wary of mounting tensions with Germany, and the Japanese planning to take advantage of the European War by seizing resource-rich European possessions in Southeast Asia , the two powers signed the Soviet–Japanese Neutrality Pact in April 1941. By contrast, the Germans were steadily making prepar

In [None]:

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Question-answering checkpoint: tokenizer and model are loaded from the same
# Hub id and used as module-level globals by get_short_answer below.
QAtokenizer = AutoTokenizer.from_pretrained("SRDdev/QABERT-small")

QAmodel = AutoModelForQuestionAnswering.from_pretrained("SRDdev/QABERT-small")
from transformers import pipeline

def get_short_answer(q):
    """Print a short answer to ``q`` taken from the retrieved context.

    The chunks returned by ``retrieve_relevant_chunks`` are joined into one
    context string and handed, together with the question, to a
    ``question-answering`` pipeline built from ``QAmodel`` and ``QAtokenizer``.

    Args:
        q: The question to answer.

    Returns:
        None. The answer is printed as ``Answer: '<text>'``.
    """
    # Retrieve with the argument `q`, not a global `question`, so the context
    # always matches the question being asked and works on a fresh kernel.
    relevant_chunks = retrieve_relevant_chunks(q)
    context = " ".join(relevant_chunks)

    ask = pipeline("question-answering", model=QAmodel, tokenizer=QAtokenizer)

    result = ask(question=q, context=context)

    print(f"Answer: '{result['answer']}'")


In [None]:
# get_short_answer prints its result and returns None, so call it directly;
# wrapping it in print() only adds a stray "None" line to the output.
get_short_answer("Who were the Axis Powers?")

Device set to use cpu


Answer: 'Japan''
None
