In [8]:
import fitz  # PyMuPDF
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=1024):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i+chunk_size])
    return chunks

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text

# Step 2: Preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()  # Remove leading and trailing whitespace
    return text

# Step 3: Fine-tune GPT-2 model
def fine_tune_gpt2(chunks):
    # Load pre-trained model and tokenizer
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Fine-tuning the model
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in range(1):  # Adjust number of epochs as needed
        for chunk in chunks:
            input_ids = tokenizer.encode(chunk, return_tensors='pt')
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Save the fine-tuned model
    model_path = 'fine_tuned_model.pth'
    torch.save(model.state_dict(), model_path)

    return model_path  # Return the path to the saved model

# Step 4: Answer questions based on fine-tuned model
def answer_question(model_path, question):
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Load the fine-tuned model
    model.load_state_dict(torch.load(model_path))

    # Generate a response from the fine-tuned model
    input_ids = tokenizer.encode(question, return_tensors='pt')
    output = model.generate(input_ids, max_length=150, num_return_sequences=1)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Check if the answer is relevant to the PDF content
    if check_relevance(answer):
        return answer
    else:
        return "sorry not allowed to answer"

# Function to check if the answer is relevant
def check_relevance(answer):
    # Logic to determine if the answer is relevant (can be more sophisticated)
    relevant_keywords = ["Nissan", "2013", "EVB", "electric", "vehicle"]
    for keyword in relevant_keywords:
        if keyword.lower() in answer.lower():
            return True
    return False

# Example usage
if __name__ == "__main__":
    # Replace with your PDF file path
    pdf_path = 'EVB_Nissan_2013.pdf'

    # Step 1: Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Preprocess text
    preprocessed_text = preprocess_text(pdf_text)

    # Step 2.1: Chunk text into smaller parts
    text_chunks = chunk_text(preprocessed_text)

    # Step 3: Fine-tune GPT-2 model on text chunks
    model_path = fine_tune_gpt2(text_chunks)

    print(f"Model saved to: {model_path}")

    # Step 4: Answer a question based on the fine-tuned model
    question = "How to remove service plug"
    answer = answer_question(model_path, question)
    print(f"Answer: {answer}")


Epoch 1, Loss: 3.736039161682129
Epoch 1, Loss: 3.731839656829834
Epoch 1, Loss: 3.366666793823242
Epoch 1, Loss: 4.747529983520508
Model saved to: fine_tuned_model.pth


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: sorry not allowed to answer


In [34]:
import fitz  # PyMuPDF
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect  # Import langdetect for language detection

# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=1024):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i+chunk_size])
    return chunks

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text

# Step 2: Preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()  # Remove leading and trailing whitespace
    return text

# Step 3: Extract relevant keywords
def extract_keywords(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    relevant_words = [word.lower() for word, tag in tagged_tokens if tag.startswith('NN')]  # Selecting nouns
    return relevant_words

# Step 4: Remove stopwords and non-English words
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = []
    for word in words:
        if word.lower() not in stop_words:  # Check if it's not a stopword
            try:
                if detect(word) == 'en':  # Check if it's English
                    filtered_words.append(word.lower())
            except:
                pass  # Skip if language detection fails
    return filtered_words

# Step 5: Fine-tune GPT-2 model
def fine_tune_gpt2(chunks):
    # Load pre-trained model and tokenizer
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Fine-tuning the model
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in range(1):  # Adjust number of epochs as needed
        for chunk in chunks:
            input_ids = tokenizer.encode(chunk, return_tensors='pt')
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Save the fine-tuned model
    model_path = 'fine_tuned_model.pth'
    torch.save(model.state_dict(), model_path)

    return model_path  # Return the path to the saved model

# Step 6: Answer questions based on fine-tuned model
def answer_question(model_path, question, relevant_words):
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Load the fine-tuned model
    model.load_state_dict(torch.load(model_path))

    # Generate a response from the fine-tuned model
    input_ids = tokenizer.encode(question, return_tensors='pt')
    output = model.generate(input_ids, max_length=150, num_return_sequences=1)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Check if the answer is relevant to the PDF content
    if check_relevance(answer, relevant_words):
        return answer
    else:
        return "sorry not allowed to answer"

# Function to check if the answer is relevant
def check_relevance(answer, relevant_words):
    # Convert answer to lowercase and split into words
    answer_words = set(answer.lower().split())

    # Check if any relevant words are in the answer
    for word in relevant_words:
        if word in answer_words:
            return True
    return False

# Example usage
if __name__ == "__main__":
    # Replace with your PDF file path
    pdf_path = 'EVB_Nissan_2013.pdf'

    # Step 1: Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Preprocess text
    preprocessed_text = preprocess_text(pdf_text)

    # Step 3: Extract relevant keywords
    relevant_words = extract_keywords(preprocessed_text)

    # Step 4: Remove stopwords and non-English words from relevant words
    relevant_words = remove_stopwords(relevant_words)

    # Step 5: Chunk text into smaller parts
    text_chunks = chunk_text(preprocessed_text)

    # Step 6: Fine-tune GPT-2 model on text chunks
    model_path = fine_tune_gpt2(text_chunks)

    print(f"Model saved to: {model_path}")

    # Step 7: Answer a question based on the fine-tuned model
    question = "What is the range of the Nissan electric vehicle in 2013?"
    answer = answer_question(model_path, question, relevant_words)
    print(f"Answer: {answer}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1, Loss: 3.7086098194122314
Epoch 1, Loss: 3.721724271774292
Epoch 1, Loss: 3.4665377140045166
Epoch 1, Loss: 4.9815874099731445
Model saved to: fine_tuned_model.pth


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: What is the range of the Nissan electric vehicle in 2013?

The range of the Nissan electric vehicle in 2013 is the same as the range of the Nissan electric vehicle in 2012.

The range of the Nissan electric vehicle in 2012 is the same as the range of the Nissan electric vehicle in 2011.

The range of the Nissan electric vehicle in 2011 is the same as the range of the Nissan electric vehicle in 2010.

The range of the Nissan electric vehicle in 2010 is the same as the range of the Nissan electric vehicle in 2009.

The range of the Nissan electric vehicle in 2009 is the same as the range of the Nissan electric vehicle in 2008.

The range of the Nissan electric vehicle in 2008 is the


In [38]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

# Example usage after fine-tuning
if __name__ == "__main__":
    # Load fine-tuned model and tokenizer configuration
    model_path = 'fine_tuned_model.pth'  # Update with the path to your fine-tuned model .pth file
    
    # Load tokenizer configuration
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Load model configuration
    config = GPT2Config.from_pretrained('gpt2')
    model = GPT2LMHeadModel(config)

    # Load the state_dict from .pth file
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)

    # Example: Generate text
    input_text = "Do you know dancing?"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    generated = model.generate(input_ids, max_length=100, num_return_sequences=1)
    decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(decoded_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Do you know dancing?

I'm not sure if I'm going to be able to do it. I'm not sure if I'm going to be able to do it.

I'm not sure if I'm going to be able to do it.


I'm not sure if I'm going to be able to do it.

I'm not sure if I'm not sure if I'm going to be able to do it.

I'm not sure


In [57]:
import fitz  # PyMuPDF
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect  # Import langdetect for language detection

# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=1024):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i+chunk_size])
    return chunks

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text

# Step 2: Preprocess text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()  # Remove leading and trailing whitespace
    return text

# Step 3: Extract relevant keywords
def extract_keywords(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    relevant_words = [word.lower() for word, tag in tagged_tokens if tag.startswith('NN')]  # Selecting nouns
    return list(set(relevant_words))  # Remove duplicates

# Step 4: Remove stopwords, alphanumeric words, and single-letter words
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = []
    for word in words:
        # Check if the word is not alphanumeric and not a single letter
        if word.isalnum() and len(word) > 1:
            # Check if it's not a stopword and is in English
            #print(word)
            if word.lower() not in stop_words:
                try:
                    #if detect(word) == 'en':
                    filtered_words.append(word.lower())
                except:
                    pass  # Skip if language detection fails
    return filtered_words



# Step 5: Save relevant words to a text file
def save_words_to_file(words, file_path):
    with open(file_path, 'w') as f:
        for word in words:
            f.write(word + '\n')

# Step 6: Fine-tune GPT-2 model
def fine_tune_gpt2(chunks):
    # Load pre-trained model and tokenizer
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Fine-tuning the model
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in range(100):  # Adjust number of epochs as needed
        for chunk in chunks:
            input_ids = tokenizer.encode(chunk, return_tensors='pt')
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Save the fine-tuned model
    model_path = 'fine_tuned_model.pth'
    torch.save(model.state_dict(), model_path)

    return model_path  # Return the path to the saved model

# Step 7: Answer questions based on fine-tuned model
def answer_question(model_path, question, relevant_words):
    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Load the fine-tuned model
    model.load_state_dict(torch.load(model_path))

    # Generate a response from the fine-tuned model
    input_ids = tokenizer.encode(question, return_tensors='pt')
    output = model.generate(input_ids, max_length=150, num_return_sequences=1)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Check if the answer is relevant to the PDF content
    if check_relevance(answer, relevant_words):
        return answer
    else:
        return "sorry not allowed to answer"

# Function to check if the answer is relevant
def check_relevance(answer, relevant_words):
    # Convert answer to lowercase and split into words
    answer_words = set(answer.lower().split())

    # Check if any relevant words are in the answer
    for word in relevant_words:
        if word in answer_words:
            return True
    return False

# Example usage
if __name__ == "__main__":
    # Replace with your PDF file path
    pdf_path = 'EVB_Nissan_2013.pdf'

    # Step 1: Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Preprocess text
    preprocessed_text = preprocess_text(pdf_text)

    # Step 3: Extract relevant keywords
    relevant_words = extract_keywords(preprocessed_text)

    # Step 4: Remove stopwords and non-English words from relevant words
    relevant_words = remove_stopwords(relevant_words)

    # Step 5: Save relevant words to a text file
    save_words_to_file(relevant_words, 'EVB_Nissan_2013_relevant_words.txt')

    # Step 6: Chunk text into smaller parts
    text_chunks = chunk_text(preprocessed_text)

    # Step 7: Fine-tune GPT-2 model on text chunks
    model_path = fine_tune_gpt2(text_chunks)

    print(f"Model saved to: {model_path}")

    # Step 8: Answer a question based on the fine-tuned model
    question = "What is the range of the Nissan electric vehicle in 2013?"
    answer = answer_question(model_path, question, relevant_words)
    print(f"Answer: {answer}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sauravsahu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1, Loss: 3.6604440212249756
Epoch 1, Loss: 3.769139289855957
Epoch 1, Loss: 3.4702062606811523
Epoch 1, Loss: 4.557125568389893
Epoch 2, Loss: 3.6157047748565674
Epoch 2, Loss: 3.5445468425750732
Epoch 2, Loss: 3.3429627418518066
Epoch 2, Loss: 4.5455403327941895
Epoch 3, Loss: 3.469910144805908
Epoch 3, Loss: 3.392518997192383
Epoch 3, Loss: 3.269505739212036
Epoch 3, Loss: 4.430732250213623
Epoch 4, Loss: 3.3023622035980225
Epoch 4, Loss: 3.3967339992523193
Epoch 4, Loss: 3.0991368293762207
Epoch 4, Loss: 4.393128395080566
Epoch 5, Loss: 3.267570734024048
Epoch 5, Loss: 3.2617175579071045
Epoch 5, Loss: 3.1221823692321777
Epoch 5, Loss: 4.153881072998047
Epoch 6, Loss: 3.187998056411743
Epoch 6, Loss: 3.150163412094116
Epoch 6, Loss: 2.990363121032715
Epoch 6, Loss: 4.014337539672852
Epoch 7, Loss: 3.128270387649536
Epoch 7, Loss: 3.141404151916504
Epoch 7, Loss: 2.892174482345581
Epoch 7, Loss: 3.749344825744629
Epoch 8, Loss: 3.0512983798980713
Epoch 8, Loss: 2.99066519737243

Epoch 60, Loss: 0.3464500904083252
Epoch 60, Loss: 0.2256716936826706
Epoch 60, Loss: 0.15347245335578918
Epoch 61, Loss: 0.3424589931964874
Epoch 61, Loss: 0.369672030210495
Epoch 61, Loss: 0.22874663770198822
Epoch 61, Loss: 0.21147513389587402
Epoch 62, Loss: 0.4031645357608795
Epoch 62, Loss: 0.34404587745666504
Epoch 62, Loss: 0.261968731880188
Epoch 62, Loss: 0.1698777973651886
Epoch 63, Loss: 0.3334504961967468
Epoch 63, Loss: 0.33516451716423035
Epoch 63, Loss: 0.26495832204818726
Epoch 63, Loss: 0.13095790147781372
Epoch 64, Loss: 0.3775612413883209
Epoch 64, Loss: 0.2954411208629608
Epoch 64, Loss: 0.23984883725643158
Epoch 64, Loss: 0.13975434005260468
Epoch 65, Loss: 0.2609047591686249
Epoch 65, Loss: 0.27612724900245667
Epoch 65, Loss: 0.2493302971124649
Epoch 65, Loss: 0.10014933347702026
Epoch 66, Loss: 0.29956111311912537
Epoch 66, Loss: 0.30844220519065857
Epoch 66, Loss: 0.17982858419418335
Epoch 66, Loss: 0.1683460772037506
Epoch 67, Loss: 0.28390955924987793
Epoch 6

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: What is the range of the Nissan electric vehicle in 2013? • The range of the Nissan electric vehicle in 2013 is based on the vehicle's battery pack technology. • The range of the Nissan electric vehicle in 2013 is based on the vehicle's energy storage technology. 2. Standard mode • The standard mode of operation for the Nissan electric vehicle is the high voltage battery pack test. • The standard mode of operation for the Nissan electric vehicle is the high voltage battery pack shutdown system. • Depending on type of vehicle (vehicle or battery) use the standard mode (vehicle or battery). • Depending on type of vehicle (vehicle or battery) use the shutdown system (vehicle or battery). REMOVAL Service plug Service plug retainer Battery pack


In [67]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

# Function to read relevant keywords from a text file
def read_keywords_from_file(file_path):
    with open(file_path, 'r') as f:
        keywords = [line.strip() for line in f.readlines()]
    return keywords

# Function to check if a question is relevant based on keywords
def is_question_relevant(question, relevant_keywords):
    question = re.findall(r'\b\w+\b', question.lower())
    question_tokens = question.lower().split()
    for keyword in question_tokens:
        print(keyword.lower())
        if keyword.lower() in relevant_keywords:
            print(keyword.lower())
            return True
    return False

# Example usage after fine-tuning
if __name__ == "__main__":
    # Load fine-tuned model and tokenizer configuration
    model_path = 'fine_tuned_model.pth'  # Update with the path to your fine-tuned model .pth file
    keywords_file = 'EVB_Nissan_2013_relevant_words.txt'  # Path to the relevant keywords text file
    
    # Load tokenizer configuration
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Load model configuration
    config = GPT2Config.from_pretrained('gpt2')
    model = GPT2LMHeadModel(config)

    # Load the state_dict from .pth file
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)

    # Load relevant keywords from text file
    relevant_keywords = read_keywords_from_file(keywords_file)

    # Example: Generate text
    input_text = "do you know service plug?"
    
    # Example: Check if the question is relevant
    if is_question_relevant(input_text, relevant_keywords):
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        generated = model.generate(input_ids, max_length=100, num_return_sequences=1)
        decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
        print(decoded_text)
    else:
        print("Sorry, I am not trained to answer this question.Question is not relevant.")


AttributeError: 'list' object has no attribute 'lower'