In [2]:
import pandas as pd
from peft import PeftModel
from transformers import (
    AutoTokenizer,
    GenerationConfig,
    GPTQConfig,
    AutoModelForCausalLM,
)
import torch
from torch import nn
from openai import OpenAI
import openai
import os
import time
from PyPDF2 import PdfReader
import tensorboard

from app.code.help import load_base_model

In [4]:
# %% --------------------------------------------------------------------------

# Remember: this script requires the modified mistral_modelling.py file.


def load_model(base_model_path,adapters=None):
    """Load a causal LM and its tokenizer, optionally wrapping the model with PEFT adapters.

    Args:
        base_model_path: Identifier or path passed to both ``from_pretrained`` calls.
        adapters: Optional adapter location handed to ``PeftModel.from_pretrained``;
            when ``None`` the plain base model is returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    causal_lm = AutoModelForCausalLM.from_pretrained(
        base_model_path, return_dict=True, device_map="auto"
    )
    tok = AutoTokenizer.from_pretrained(base_model_path)

    # Pad on the right using the "<unk>" token.
    tok.pad_token = "<unk>"
    tok.padding_side = "right"

    # Keep the embedding table and the model config in step with the tokenizer.
    causal_lm.resize_token_embeddings(len(tok))
    causal_lm.config.eos_token_id = tok.eos_token_id
    causal_lm.config.pad_token_id = tok.pad_token_id

    if adapters is None:
        return tok, causal_lm
    return tok, PeftModel.from_pretrained(causal_lm, adapters)

class app:
    """Locate and parse the contents page of a PDF book with an instruction-tuned LLM.

    Typical order of calls: ``get_base()`` -> ``get_contents()`` ->
    ``extract_contents()`` -> ``prep_causal_model()`` -> ``find_first_page()``.

    Attributes:
        book: Object exposing ``pages``; each page must provide ``extract_text()``.
        base_model_path: Path/identifier handed to ``load_base_model``.
        tokenizer, model: Set by ``get_base``.
        base_model: Kept for backward compatibility; not populated by this class.
        causal_head: The model's ``lm_head`` as captured by ``get_base``.
        peft_model: Adapter-wrapped classifier built by ``prep_causal_model``.
        contents_first, contents_last: Inclusive page indices of the contents pages.
        contents: Expected to be indexable as ``contents['chapter_titles'][0]``
            (e.g. a pandas DataFrame); must be assigned by the caller.
        first_chapter_page: Page index found by ``find_first_page``.
    """

    # Only this many leading pages are scanned when looking for the contents page.
    MAX_FRONT_MATTER_PAGES = 30
    # Number of pages after the contents page searched for the first chapter.
    FIRST_CHAPTER_SEARCH_PAGES = 29
    # A yes/no verdict is read from the first word, so a few tokens are enough.
    CLASSIFY_MAX_NEW_TOKENS = 5
    # Effectively unbounded: generation is expected to stop at the end-of-sequence token.
    EXTRACT_MAX_NEW_TOKENS = 400000
    # Marker that closes the instruction part of the prompt.
    INST_END = '[/INST]'

    def __init__(self,base_model_path,book):
        self.book = book
        self.base_model_path = base_model_path
        self.base_model = None
        self.model = None
        self.tokenizer = None
        self.classification_head = None
        self.causal_head = None
        self.peft_model = None
        self.page_lag = None
        self.contents_first = None
        self.contents_last = None
        self.contents = None  # TODO: build as a pandas DataFrame from extract_contents()
        self.first_chapter_page = None

    def get_base(self):
        """Load tokenizer and model via ``load_base_model`` and remember the LM head."""
        self.tokenizer, self.model = load_base_model(self.base_model_path)
        self.causal_head = self.model.lm_head

    def _generate_reply(self, prompt_text, max_new_tokens):
        """Sample a completion for ``prompt_text`` and return the text after ``[/INST]``."""
        input_ids = self.tokenizer(prompt_text, return_tensors='pt').input_ids.cuda()
        generated = self.model.generate(
            inputs=input_ids,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            top_k=40,
            max_new_tokens=max_new_tokens,
        )
        output = self.tokenizer.decode(generated[0])
        _, marker, reply = output.partition(self.INST_END)
        # If the marker did not survive decoding, fall back to the whole output
        # instead of slicing at an arbitrary offset.
        return reply if marker else output

    @staticmethod
    def _first_word(reply):
        """Return the first run of letters in ``reply``, lower-cased ('' if none)."""
        letters = []
        for ch in reply.lower():
            if ch.isalpha():
                letters.append(ch)
            elif letters:
                break
        return "".join(letters)

    def _classify_page(self, prompt, page_index):
        """Ask the model about one page and return the first word of its answer."""
        page_text = self.book.pages[page_index].extract_text()
        reply = self._generate_reply(
            prompt + page_text + ' ### [/INST]', self.CLASSIFY_MAX_NEW_TOKENS
        )
        return self._first_word(reply)

    def get_contents(self):
        """Find the first and last contents pages among the leading pages of the book.

        Sets ``contents_first`` / ``contents_last`` (inclusive page indices). Both
        are left as ``None`` when no page is classified as a contents page.

        Returns:
            A short status string.
        """
        prompt = """<s>[INST] @@@ Instructions:
It is your task to classify whether a string corresponds to the contents page of a pdf book.
A contents page includes chapter titles and page numbers.
The first word of you answer must be "Yes" or "No"
You must reply "yes" if the string is from the contents page, and "no" if it is not the contents page.

@@@ Example:
If this is the string: ### 'Contents \nIntroduction  v\nUnit 1 Business and its environment 2\n1: Enterprise  3\n2: Business structure 15\n3: Size of business 29\n4: Business objectives 38\n5: Stakeholders in a business 51\n6: Business structure (A Level) 61\n7: Size of business (A Level) 70\n8:  External in fl uences on business activity (A Level) 76\n9:  External economic in fl uences on business behaviour (A Level) 98\nUnit 2 People in organisations 124\n10: Management and leadership 125\n11: Motivation 137\n12: Human resource management 159\n13:  Further human resource management (A Level) 170\n14: Organisation structure (A Level) 187\n15: Business communication (A Level) 200\nUnit 3 Marketing 212\n16: What is marketing? 213\n17: Market research 231\n18: Th e marketing mix – product and price 252\n19:  Th e marketing mix – promotion and place 273\n20: Marketing planning (A Level) 297\n21: Globalisation and international marketing (A Level) 318iiiContents ###

The correct answer is: Yes

@@@ Example:
If this is the string: ### 'vi ABOUT THIS BOOK\nABOUT THIS BOOK\nThis book is written for students following the Pearson Edexcel International Advanced Subsidiary (IAS) \nBiology specification. This book covers the full IAS course and the first year of the International A Level  (IAL) course.\nThe book contains full coverage of IAS units (or exam papers) 1 and 2. Each unit in the specification has  \ntwo topic areas. The topics in this book, and their contents, fully match the specification. You can refer to the Assessment Overview on page x for further information. Students can prepare for the written Practical Paper (unit 3) by using the IAL Biology Lab Book (see page viii of this book).\nEach topic is divided into chapters and sections to break the content down into manageable chunks.  \nEach section features a mix of learning and activities. \nLearning objectives\nEach chapter starts with a listof key assessment objectives.\nDid you know?Interesting facts help you remember the key concepts.CheckpointQuestions at the end of each section check understanding of the key learning points in each chapter.Subject vocabularyKey terms are highlighted in blue in the text. Clear definitions are provided at the end of each section for easy reference, and are also collated in a glossary at the back of the book. Worked examples show you how to work through questions, and set out calculations.Specification referenceThe exact specification references covered in the section are provided. Exam hintsTips on how to answer exam-style questions and guidance for exam preparation. Orange Learning Tips help you focus your learning and avoid common errors. \nUncorrected proof, all content subject to change at publisher discretion. Not for resale, circulation or distribution in whole or in part. ©Pearson 2018' ###

The correct answer is: No

@@@ Question:
This is the string: ### """
        # Never index past the end of a short book.
        n_pages = min(self.MAX_FRONT_MATTER_PAGES, len(self.book.pages))
        self.contents_first = None
        self.contents_last = None

        for i in range(n_pages):
            if self._classify_page(prompt, i) == 'yes':
                self.contents_first = i
                break
        if self.contents_first is None:
            return 'contents page not found'

        # The contents section runs until the first page answered with "no";
        # if that never happens it extends to the last scanned page.
        self.contents_last = self.contents_first
        for i in range(self.contents_first + 1, n_pages):
            if self._classify_page(prompt, i) == 'no':
                break
            self.contents_last = i
        return 'found contents page .. hopefully'

    def extract_contents(self):
        """Ask the model to list chapter titles and page numbers from the contents pages.

        Uses the text of every page from ``contents_first`` to ``contents_last``
        (inclusive).

        Returns:
            The lower-cased model reply; the prompt asks for one
            ``title --- page number`` entry per line.

        Raises:
            ValueError: If the contents page range has not been set.
        """
        if self.contents_first is None or self.contents_last is None:
            raise ValueError(
                'contents_first/contents_last are not set; run get_contents() first'
            )
        # If this prompt gets too long, each contents page could be sent separately.
        contents_pages = "\n".join(
            self.book.pages[i].extract_text()
            for i in range(self.contents_first, self.contents_last + 1)
        )

        prompt = """<s>[INST] @@@ Instructions:
It is your task to extract the chapters and corresponding page numbers from a string which was created from the contents page of a pdf book.
You must return a list of the chapters and page numbers.
Put each chapter and its page number on its own line, and separate chapters titles from page numbers with a "---".
For example the first 2 chapters of a contents page should be in the following format: "chapter 1 title --- chapter 1 page number \n chapter 2 title --- chapter 2 page number"

@@@ Question:
string which was created from the contents page of a pdf book: ### """

        reply = self._generate_reply(
            prompt + contents_pages + ' ### [/INST]', self.EXTRACT_MAX_NEW_TOKENS
        )
        return reply.lower()

    # TODO: turn the reply above into a pandas DataFrame with columns
    # chapter_titles and page_numbers, dropping entries whose page is not a number.

    def prep_causal_model(self, adapters_path=r'../adapters/firstpage'):
        """Swap the LM head for a 2-class ``score`` layer and load the adapters.

        Args:
            adapters_path: Location of the PEFT adapters to wrap the model with.
        """
        self.model.num_labels = 2
        self.model.config.num_labels = 2

        # No bias: fine-tuning with and without made no difference, so this saves memory.
        self.model.score = nn.Linear(
            self.model.config.hidden_size,
            self.model.config.num_labels,
            bias=False,
            dtype=torch.float16,
        )

        self.model.lm_head = None  # replaced by `score`, which maps to the 2 classes
        self.model.config.lm_head = None  # remove the mapping to vocab size

        self.peft_model = PeftModel.from_pretrained(self.model, adapters_path)

        return None

    def find_first_page(self):
        """Search the pages after the contents for the first page of the first chapter.

        Runs one forward pass of the adapter-wrapped classifier per page and stores
        the first page index predicted as class 1 in ``first_chapter_page``
        (``None`` when nothing matched).

        Returns:
            A short status string.

        Raises:
            ValueError: If ``contents``, ``contents_last`` or ``peft_model`` is unset.
        """
        if self.contents is None:
            raise ValueError('contents is not set; it must provide chapter_titles')
        if self.contents_last is None:
            raise ValueError('contents_last is not set; run get_contents() first')
        if self.peft_model is None:
            raise ValueError('peft_model is not set; run prep_causal_model() first')

        first_title = self.contents['chapter_titles'][0]
        start = self.contents_last + 1
        stop = min(start + self.FIRST_CHAPTER_SEARCH_PAGES, len(self.book.pages))
        self.first_chapter_page = None

        for i in range(start, stop):
            page_text = self.book.pages[i].extract_text()
            prompt = f"""<s>[INST] This is a string from a page of a pdf book: ### {page_text} ###
            Is it true or false that this page belongs to a chapter called: ### {first_title} ###? [/INST]"""
            input_ids = self.tokenizer(prompt,return_tensors='pt').input_ids.to("cuda:0")

            with torch.no_grad():  # inference only, no weights are updated
                # argmax over the logits gives the predicted class; softmax is
                # not needed to pick the larger of two values.
                predicted_class = torch.argmax(
                    self.peft_model(input_ids).logits
                    ).item()

            # NOTE(review): class 1 is assumed to mean "true" - confirm against the adapters.
            if predicted_class == 1:
                self.first_chapter_page = i
                break

        if self.first_chapter_page is None:
            return 'first chapter page not found'
        return 'found first chapter page .. hopefully'











In [5]:
# Session setup: choose the checkpoint, open the PDF and load tokenizer + model.
base_model_path = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
adapters = 'add'  # NOTE(review): not passed to load_model below, so no adapters are loaded here
book = PdfReader(r"drive/MyDrive/edexcel_a_level_physics_student_book_1.pdf")
tokenizer,model = load_model(base_model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [49]:
# NOTE(review): `retriever` is not defined in the visible cells (the class defined
# in this notebook is `app`, constructed as app(base_model_path, book)) - confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
retrieve = retriever(book,model,tokenizer)
#retrieve.get_contents()

In [50]:
# Set the contents page range by hand (get_contents() is commented out above),
# then print the model's chapter / page-number extraction.
retrieve.contents_first=4
retrieve.contents_last =4
print(retrieve.extract_contents())

 chapter 1: introduction --- iv
chapter 2: get the most from this book --- vi
chapter 3: introduction ---
chapter 1: quantities and units --- 1
chapter 2: practical skills --- 10

chapter 3: mechanics
chapter 3.1: rectilinear motion --- 25
chapter 3.2: momentum --- 44
chapter 3.3: forces --- 55
chapter 3.4: work, energy and power --- 73

chapter 4: electric circuits
chapter 4.1: charge and current --- 90
chapter 4.2: potential difference, electromotive force and power --- 102
chapter 4.3: current–potential difference relationships --- 116
chapter 4.4: resistance and resistivity --- 128
chapter 4.5: internal resistance, series and parallel circuits, and the potential divider --- 145

chapter 5: materials
chapter 5.1: fluids --- 173
chapter 5.2: solid materials --- 188

chapter 6: waves and the particle behaviour of light
chapter 6.1: nature of waves --- 210
chapter 6.2: transmission and reflection of waves --- 221
chapter 6.3: superposition of waves --- 247
chapter 6.4: particle nature 

In [28]:
# Display the currently stored first contents page index.
retrieve.contents_first

4

In [29]:
# Display the currently stored last contents page index.
retrieve.contents_last

4

In [27]:
# Set the exllama maximum input length to 6000 on the loaded model.
# NOTE(review): this import sits outside the imports cell, and the cell's execution
# count (In [27]) is lower than cells shown before it - check Restart & Run All order.
from auto_gptq import exllama_set_max_input_length
model = exllama_set_max_input_length(model, max_input_length=6000)

In [31]:
# Chapter title interpolated into the first-page classification prompt below.
title = 'Quantities and units'
string = '1 1.1 Physical quantities, base and derived units \n1.1 Physical quantities, base \nand derived units\nAn elderly physicist was asked how much he had in the bank.\n‘How much of what?’ he responded.\n‘Money, of course!’\n‘Fifteen million, three hundred thousand, one hundred and four,’ he replied.\nThe physicist was not a rich man. He had quoted his balance in Turkish \nLira which, at that time, had an exchange rate of 2.6 million to the pound \n(1.4 million to the American dollar).\nThe story has relevance to measurements in physics. It is meaningless to state that \nthe size of a wire is 10; we must state the quantity that is measured (in this case,\xa0the \nlength of the wire) and the unit (such as cm). In this chapter, and throughout this \nbook, you will identify and use a number of base quantities  (and their units) \nthat are fundamental to all physical measurements. You will develop and use \nderived units for quantities for a wide range of physical properties.Prior knowledge\nIn this chapter you will need to:\n/uni279C use common measures and simple compound measures such as speed \n/uni279C substitute values into formulae and equations using appropriate units and \nrearrange equations in order to change the subject. \nThe key facts that will be useful are:\n/uni279C mass, length and time are examples of measurable physical quantities\n/uni279C kilogram (kg), metre (m) and second (s) are units of mass, length and time\n/uni279C speed is the distance covered per unit time and is measured in metres per \nsecond (m s–1)\n/uni279C vector quantities have both size and direction. 1 Quantities and units\nTest yourself on prior knowledge\n1 A man walks 1.6 km in 20 minutes. Calculate his average speed.\n2 A sprinter runs 200 m at an average speed of 8.0 m s –1. Calculate the \ntime taken for her to complete the distance.\n3 Acceleration can be calculated by dividing a change in speed by the \ntime taken. 
State the unit of acceleration.\n4 Speed is a scalar quantity and velocity is a vector. Explain the \ndifference between the two.\nTip\nMany students lose marks in \nexaminations by failing to include the \nunit of a derived quantity! Always show \nthe unit for all calculated quantities.\n807527_C01_Edexcel_Physics_001-009.indd   1 24/02/2015   14:06www.ebook3000.com\n'
prompt = f"""<s>[INST] @@@ Instructions:
You are an assisstant who must classify whether a string from a page of a pdf book corresponds to the first page of a given chapter in that book.
You will be given the string and also given the chapter title.
The first page of a chapter usually contains the name of the chapter towards the start of the string.
The first word of you answer must be "Yes" or "No"
You must reply "yes" if the string is the first page of the given chapter, and "no" if it is not the first page of the given chapter.

@@@ Example:
User: The given chapter title is ### The Chemistry Of Life ### and the string is: ### 'CHEMISTRY FOR BIOLOGISTS 61A.1 THE CHEMISTRY OF LIFE\nTHE CHEMISTRY OF WATER\nAll reactions in living cells take place in water. Without water, \nsubstances could not move around the body. Water is one of  the reactants in the process of  photosynthesis, on which almost all life depends (see fig E). Understanding the properties of  water will help you understand many key systems in living organisms. \nWater is also a major habitat – it supports more life than any other \npart of  the planet.\n▲ fig E  W ater is vital for life on Earth in many different ways – in a desert, \nthe smallest amount of water allows plants to grow.\nThe simple chemical formula of  water is H2O. This tells us that \ntwo atoms of  hydrogen are joined to one atom of  oxygen to make \nup each water molecule. However, because the electrons are held closer to the oxygen atom than to the hydrogen atoms, water is a polar molecule (see fig F).\n104.5°Oδ2\nHδ1Hδ1\n▲ fig F  A model of a w ater molecule showing dipoles.\nOne major effect of  this polarity is that water molecules form hydrogen  bonds. The slightly negative oxygen atom of  one water \nmolecule will attract the slightly positive hydrogen atoms of  other water molecules in a weak electrostatic attraction called a hydrogen \nbond. Each individual hydrogen bond is weak but there are many of  them so the molecules of  water ‘stick together’ more than you might expect (see fig G). Water has relatively high melting and boiling points compared with other substances that have molecules of  a similar size because it takes a lot of  energy to break all the hydrogen bonds that hold the molecules together. 
Hydrogen bonds are important in protein structure (see Sections 1A.5 and 2B.1) \nand in the structure and functioning of  DNA (see Section 2B.3).Oδ2\nOδ2\nHδ1Hδ1Hδ1Hδ1\nHδ1Oδ2Oδ2\nOδ2Hδ1\nHδ1\nHδ1Hδ1Hδ1\n▲ fig G  Hydr ogen bonding in water molecules, based on attraction \nbetween positive and negative dipoles.\nTHE IMPORTANCE OF WATER\nThe properties of  water make it very important in biological \nsystems for many reasons.\n •W ater is a polar solvent. Because it is a polar molecule, many ionic \nsubstances like sodium chloride will dissolve in it (see fig H).  \nMany covalently bonded substances are also polar and will dissolve in water, but often do not dissolve in other covalently bonded solvents such as ethanol. Water also carries other substances, such as starch. As a result, most of  the chemical reactions within cells occur in water (in aqueous solution).\nsodium and chloride ionsin solution in water\nsalt and water mixed\nsodium chloride\nNaClionic bond sodium ionchlorideionδ1 chargeson hydrogenin water areattracted tonegativechloride ion\nδ2\n charges\non oxygenin water ar\ne\nattracted tothe positivesodium ionH\nH\nHHH\nH HO\nOO\nCl2\nCl2\nCl2Cl2\nCl2\nNa1OH\nCl2\nNa1Na1\nCl2Cl2\nCl2\nCl2Cl2\nNa1Na1Na1 Na1\nNa1Na1Na1\n▲ fig H  A model of sodium chloride dissolving in water as a result of the \ninteractions between the charges on sodium and chloride ions and the dipoles of the water molecules.\nUncorrected proof, all content subject to change at publisher discretion. Not for resale, circulation or distribution in whole or in part. ©Pearson 2018' ###
Assisstant: "Yes"

@@@ Example:
User: The given chapter title is ### Preparing For Your Exams ### and the string is: ### 'xASSESSMENT OVERVIEW\nPAPER / UNIT 1PERCENTAGE \nOF IASPERCENTAGE OF IALMARK TIME AVAILABILITY\nMOLECULES, DIET, TRANSPORT AND \nHEALTH \nWritten examination\nPaper code \nWBI11/01\nExternally set and marked by \nPearson Edexcel\nSingle tier of entry40% 20% 80 1 hour  \n30 minutesJanuary, June and October\nFirst assessment : January 2019\nPAPER / UNIT 2PERCENTAGE \nOF IASPERCENTAGE OF IALMARK TIME AVAILABILITY\nCELLS, DEVELOPMENT, BIODIVERSITY \nAND CONSERVATION\nWritten examination\nPaper code \nWBI12/01\nExternally set and marked by \nPearson Edexcel\nSingle tier of entry40% 20% 80 1 hour  \n30 minutesJanuary, June and October\nFirst assessment : June 2019\nPAPER / UNIT 3PERCENTAGE \nOF IASPERCENTAGE OF IALMARK TIME AVAILABILITY\nPRACTICAL SKILLS IN BIOLOGY 1   \nWritten examination\nPaper code \nWBI13/01\nExternally set and marked by \nPearson Edexcel\nSingle tier of entry20% 10% 50 1 hour  \n20 minutesJanuary, June and October\nFirst assessment : June 2019ASSESSMENT OVERVIEW\nThe following tables give an overview of the assessment for Pearson Edexcel International Advanced Subsidiary course \nin Biology. You should study this information closely to help ensure that you are fully prepared for this course and know exactly what to expect in each part of the examination. More information about this qualification, and about the question types in the different papers, can be found on page 302 of this book.\nUncorrected proof, all content subject to change at publisher discretion. Not for resale, circulation or distribution in whole or in part. ©Pearson 2018' ###
Assisstant: "No"

@@@ Question:
User: The given chapter title: ### {title} ### This is the string: ### {string} ###
[/INST]"""


# Tokenise the prompt and move the ids to the GPU.
input_ids = tokenizer(prompt,return_tensors='pt').input_ids.cuda()
# Sample at most 5 new tokens; the prompt asks for "Yes"/"No" as the first word.
output = tokenizer.decode((model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=5))[0])
# Show only the lower-cased text that follows the [/INST] marker.
output[output.find('[/INST]')+len('[/INST]'):].lower()

' yes</s>'