In [1]:
import fitz  # PyMuPDF
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect  # Import langdetect for language detection

# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=1024):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i+chunk_size])
    return chunks

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text

# Step 2: Preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()  # Remove leading and trailing whitespace
    return text

# Step 3: Extract relevant keywords
def extract_keywords(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    relevant_words = [word.lower() for word, tag in tagged_tokens if tag.startswith('NN')]  # Selecting nouns
    return list(set(relevant_words))  # Remove duplicates

# Step 4: Remove stopwords, alphanumeric words, and single-letter words
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = []
    for word in words:
        # Check if the word is not alphanumeric and not a single letter
        if word.isalnum() and len(word) > 1:
            # Check if it's not a stopword and is in English
            #print(word)
            if word.lower() not in stop_words:
                try:
                    #if detect(word) == 'en':
                    filtered_words.append(word.lower())
                except:
                    pass  # Skip if language detection fails
    return filtered_words



# Step 5: Save relevant words to a text file
def save_words_to_file(words, file_path):
    with open(file_path, 'w') as f:
        for word in words:
            f.write(word + '\n')

# Step 6: Fine-tune GPT-2 model
def fine_tune_gpt2(chunks):
    # Load pre-trained model and tokenizer
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Fine-tuning the model
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in range(100):  # Adjust number of epochs as needed
        for chunk in chunks:
            input_ids = tokenizer.encode(chunk, return_tensors='pt')
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Save the fine-tuned model
    model_path = 'fine_tuned_model.pth'
    torch.save(model.state_dict(), model_path)

    return model_path  # Return the path to the saved model

# Step 7: Answer questions based on fine-tuned model
def answer_question(model_path, question, relevant_words):
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Load the fine-tuned model
    model.load_state_dict(torch.load(model_path))

    # Generate a response from the fine-tuned model
    input_ids = tokenizer.encode(question, return_tensors='pt')
    output = model.generate(input_ids, max_length=150, num_return_sequences=1)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Check if the answer is relevant to the PDF content
    if check_relevance(answer, relevant_words):
        return answer
    else:
        return "sorry not allowed to answer"

# Function to check if the answer is relevant
def check_relevance(answer, relevant_words):
    # Convert answer to lowercase and split into words
    answer_words = set(answer.lower().split())

    # Check if any relevant words are in the answer
    for word in relevant_words:
        if word in answer_words:
            return True
    return False

# Example usage
if __name__ == "__main__":
    # Replace with your PDF file path
    pdf_path = 'EVB_Nissan_2013.pdf'

    # Step 1: Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Preprocess text
    preprocessed_text = preprocess_text(pdf_text)

    # Step 3: Extract relevant keywords
    relevant_words = extract_keywords(preprocessed_text)

    # Step 4: Remove stopwords and non-English words from relevant words
    relevant_words = remove_stopwords(relevant_words)

    # Step 5: Save relevant words to a text file
    save_words_to_file(relevant_words, 'EVB_Nissan_2013_relevant_words.txt')

    # Step 6: Chunk text into smaller parts
    text_chunks = chunk_text(preprocessed_text)

    # Step 7: Fine-tune GPT-2 model on text chunks
    model_path = fine_tune_gpt2(text_chunks)

    print(f"Model saved to: {model_path}")

    # Step 8: Answer a question based on the fine-tuned model
    question = "What is the range of the Nissan electric vehicle in 2013?"
    answer = answer_question(model_path, question, relevant_words)
    print(f"Answer: {answer}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1, Loss: 3.729609251022339
Epoch 1, Loss: 3.721780300140381
Epoch 1, Loss: 3.480525493621826
Epoch 1, Loss: 4.766651153564453
Epoch 2, Loss: 3.6367886066436768
Epoch 2, Loss: 3.6582865715026855
Epoch 2, Loss: 3.399221420288086
Epoch 2, Loss: 4.883695125579834
Epoch 3, Loss: 3.56687593460083
Epoch 3, Loss: 3.5015244483947754
Epoch 3, Loss: 3.2055606842041016
Epoch 3, Loss: 4.4572577476501465
Epoch 4, Loss: 3.3136086463928223
Epoch 4, Loss: 3.331923484802246
Epoch 4, Loss: 3.198685646057129
Epoch 4, Loss: 4.2721848487854
Epoch 5, Loss: 3.2078969478607178
Epoch 5, Loss: 3.2339866161346436
Epoch 5, Loss: 3.120781183242798
Epoch 5, Loss: 4.158096790313721
Epoch 6, Loss: 3.1096465587615967
Epoch 6, Loss: 3.1491217613220215
Epoch 6, Loss: 3.0062108039855957
Epoch 6, Loss: 4.000819683074951
Epoch 7, Loss: 3.1759917736053467
Epoch 7, Loss: 3.1417856216430664
Epoch 7, Loss: 2.898599863052368
Epoch 7, Loss: 3.9807677268981934
Epoch 8, Loss: 3.074336051940918
Epoch 8, Loss: 2.968492746353149

Epoch 60, Loss: 0.3732404112815857
Epoch 60, Loss: 0.25496748089790344
Epoch 60, Loss: 0.25562185049057007
Epoch 61, Loss: 0.5294745564460754
Epoch 61, Loss: 0.34152233600616455
Epoch 61, Loss: 0.29446637630462646
Epoch 61, Loss: 0.16444361209869385
Epoch 62, Loss: 0.3846643269062042
Epoch 62, Loss: 0.36722782254219055
Epoch 62, Loss: 0.27834343910217285
Epoch 62, Loss: 0.13649623095989227
Epoch 63, Loss: 0.3983486294746399
Epoch 63, Loss: 0.38634172081947327
Epoch 63, Loss: 0.24638555943965912
Epoch 63, Loss: 0.1870727837085724
Epoch 64, Loss: 0.40165674686431885
Epoch 64, Loss: 0.2930481731891632
Epoch 64, Loss: 0.24409069120883942
Epoch 64, Loss: 0.1673978567123413
Epoch 65, Loss: 0.3074607253074646
Epoch 65, Loss: 0.3528748154640198
Epoch 65, Loss: 0.23427411913871765
Epoch 65, Loss: 0.14034034311771393
Epoch 66, Loss: 0.2855995297431946
Epoch 66, Loss: 0.32198917865753174
Epoch 66, Loss: 0.2652493715286255
Epoch 66, Loss: 0.21523259580135345
Epoch 67, Loss: 0.3410589098930359
Epoc

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Answer: What is the range of the Nissan electric vehicle in 2013?

The range of the Nissan electric vehicle in 2013 was approximately 5,000 miles (9,000 km). The range of the vehicle in the United States was approximately 5,000 miles (9,000 km).

What is the electric vehicle's energy density?

The electric vehicle's energy density is the equivalent of the vehicle's weight. The energy density of an automobile is approximately 1,000 parts per million (ppm).

What is the electric vehicle's energy density in feet (m2)?

The electric vehicle's energy density in feet (m2) is approximately the same as that in the same vehicle.

What is the electric vehicle


In [71]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

# Function to read relevant keywords from a text file
def read_keywords_from_file(file_path):
    with open(file_path, 'r') as f:
        keywords = [line.strip() for line in f.readlines()]
    return keywords

# Function to check if a question is relevant based on keywords
def is_question_relevant(question, relevant_keywords):
    pattern = r'[^\w\s]'  # Matches any non-word and non-space characters
    # Replace the special characters with an empty string
    cleaned_string = re.sub(pattern, '', question)
    question_tokens = cleaned_string.lower().split()
    for keyword in question_tokens:
        #print(keyword.lower())
        if keyword.lower() in relevant_keywords:
            print(keyword.lower())
            return True
    return False

# Example usage after fine-tuning
if __name__ == "__main__":
    # Load fine-tuned model and tokenizer configuration
    model_path = 'fine_tuned_model.pth'  # Update with the path to your fine-tuned model .pth file
    keywords_file = 'EVB_Nissan_2013_relevant_words.txt'  # Path to the relevant keywords text file
    
    # Load tokenizer configuration
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Load model configuration
    config = GPT2Config.from_pretrained('gpt2')
    model = GPT2LMHeadModel(config)

    # Load the state_dict from .pth file
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)

    # Load relevant keywords from text file
    relevant_keywords = read_keywords_from_file(keywords_file)

    # Example: Generate text
    input_text = "do you know service plug?"
    
    # Example: Check if the question is relevant
    if is_question_relevant(input_text, relevant_keywords):
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        generated = model.generate(input_ids, max_length=100, num_return_sequences=1)
        decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
        print(decoded_text)
    else:
        print("Sorry, I am not trained to answer this question.Question is not relevant.")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


service
do you know service plug? • When not in use: • Tell the user if it is needed or not.

· If the user is sure to put the removed battery in pocket or pack case. • If the item was changed in transit the user must be sure to remove the battery before attempting to remove the plug.

· If the item was resold on a black or white color other than black or an equivalent product was used.

· If the item is a hybrid
