In [10]:
from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains import SimpleSequentialChain
import pandas as pd
import openai
import json
import PyPDF2

The steps to create the dataset will be:

1. Fetch any PDF
2. Select a random page from that PDF
3. Extract all the facts from that page
4. Create flashcards for each of the facts extracted
5. Output the result

In [2]:
# Appends the output of one prompt engineering experiment to a JSON Lines file
def save_output(text_input, prompt_candidates, output, model, approach, output_path='output.json'):
    """Append one experiment record to ``output_path`` as a single JSON line.

    The file is opened in append mode, so repeated calls accumulate one JSON
    object per line (JSON Lines). The file as a whole is therefore not a
    single JSON document; read it back line by line with ``json.loads``.

    Args:
        text_input: The input text that was fed to the prompt.
        prompt_candidates: The prompt template text used for this run.
        output: The model output for this run.
        model: Label of the model that produced ``output``.
        approach: Label of the prompting approach (e.g. "Zero-Shot").
        output_path: File to append to. Defaults to ``'output.json'`` in the
            current working directory, which is the original behavior.

    Raises:
        TypeError: If any value is not JSON-serializable (from ``json.dump``).
        OSError: If ``output_path`` cannot be opened for appending.
    """
    record = {
        "text_input": text_input,
        "prompt_candidates": prompt_candidates,
        "output": output,
        "model": model,
        "approach": approach
    }

    # json.dump escapes non-ASCII by default, so the explicit encoding only
    # pins down how the file is opened; it does not change the bytes written.
    with open(output_path, 'a', encoding='utf-8') as file:
        json.dump(record, file)
        file.write('\n')

In [25]:
import PyPDF2

# Source document and the 1-based, inclusive range of pages to read from it.
pdf_path = "data/XGBoost with Python Gradient Boosted Trees with XGBoost and scikit-learn (Jason Brownlee) (z-lib.org).pdf"
start_page = 10
end_page = 15

with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # pdf_reader.pages is indexed from 0, hence the -1 on the first page;
    # range() excludes its stop value, so end_page itself is included.
    page_texts = [
        pdf_reader.pages[page_number].extract_text()
        for page_number in range(start_page - 1, end_page)
    ]
    # Pages are concatenated with no separator between them.
    text = "".join(page_texts)

print(text)

1.2. Requirements For This Book 4
1.2 Requirements For This Book
1.2.1 Python and SciPy
You do not need to be a Python expert, but it would be helpful if you knew how to install and
setup Python and SciPy. The tutorials assume that you have a Python and SciPy environment
available. This may be on your workstation or laptop, it may be in a VM or a Docker instance
that you run, or it may be a server instance that you can congure in the cloud as taught in
Part III of this book.
Technical Requirements : The technical requirements for the code and tutorials in this
book are as follows:
Python version 2 or 3 installed. This book was developed using Python version 2.7.11.
SciPy and NumPy installed. This book was developed with SciPy version 0.17.0 and
NumPy version 1.11.0.
Matplotlib installed. This book was developed with Matplotlib version 1.5.1.
Pandas installed. This book was developed with Pandas version 0.18.0.
scikit-learn installed. This book was developed with scikit-learn 0.18

In [26]:
# Scratch check: characters in the longest template plus the extracted text,
# divided by 4 (presumably a ~4-characters-per-token heuristic; confirm).
# `max_length` is assigned in the experiment cell, so that cell must have run first.
(len(text) + max_length) / 4

2374.75

In [12]:
# Running individual chain
def run_chain(llm, prompt_candidate, text):
    """Run a single prompt against ``text`` and return the result.

    A fresh ``LLMChain`` is built from ``llm`` and ``prompt_candidate`` on
    every call, then invoked once with ``text`` as its input.

    Args:
        llm: The language-model object handed to ``LLMChain``.
        prompt_candidate: The prompt handed to ``LLMChain``.
        text: The value passed to the chain's ``run`` method.

    Returns:
        Whatever ``LLMChain.run`` returns for ``text``.
    """
    return LLMChain(llm=llm, prompt=prompt_candidate).run(text)

In [15]:
# Candidate prompt wordings for the fact-extraction experiments. Every template
# takes a single `{article}` placeholder that is filled with the source text.
templates = [
    "Create a list with all the facts contained in this article: {article}",
    "Create a list with all the facts contained in this article: {article}. Let's think step by step.",
    "Let's break down the information in this article, {article}, and compile a comprehensive list of facts.",
    "We should systematically analyze {article} and create a detailed list of all the factual points.",
]

# Wrap each raw template string in a PromptTemplate, preserving list order.
prompt_candidates = []
for template_text in templates:
    prompt_candidates.append(
        PromptTemplate(input_variables=["article"], template=template_text)
    )

In [16]:
# Bare expression: displays the list of PromptTemplate objects for inspection
prompt_candidates

[PromptTemplate(input_variables=['article'], output_parser=None, partial_variables={}, template='Create a list with all the facts contained in this article: {article}', template_format='f-string', validate_template=True),
 PromptTemplate(input_variables=['article'], output_parser=None, partial_variables={}, template="Create a list with all the facts contained in this article: {article}. Let's think step by step.", template_format='f-string', validate_template=True),
 PromptTemplate(input_variables=['article'], output_parser=None, partial_variables={}, template="Let's break down the information in this article, {article}, and compile a comprehensive list of facts.", template_format='f-string', validate_template=True),
 PromptTemplate(input_variables=['article'], output_parser=None, partial_variables={}, template='We should systematically analyze {article} and create a detailed list of all the factual points.', template_format='f-string', validate_template=True)]

In [27]:
# Length in characters of the longest prompt template (placeholder included),
# used as an upper bound on the non-article part of the prompt.
max_length = max(len(template.template) for template in prompt_candidates)

# Completion budget: 4097 (presumably the model's context window in tokens;
# confirm for the model in use) minus an estimate of the prompt size at
# 4 characters per token. The division yields a float, but max_tokens must be
# an integer, so truncate explicitly instead of relying on implicit coercion.
max_completion_tokens = int(4097 - (max_length + len(text)) / 4)
if max_completion_tokens < 1:
    raise ValueError(
        "Input text is too long: estimated prompt size leaves no room for a "
        f"completion (computed max_tokens={max_completion_tokens})."
    )
llm = OpenAI(temperature=0.0, max_tokens=max_completion_tokens)

# Run every prompt candidate on the same text and append one record per run via
# save_output (JSON lines, not CSV) with the fields: text_input, prompt
# candidate, output, model, approach (zero-shot, few-shot, ...).
for prompt_candidate in prompt_candidates:
    output = run_chain(llm, prompt_candidate, text)
    # Record the model the llm object is actually configured with rather than a
    # hard-coded label, so the saved metadata cannot drift from the run.
    save_output(text, prompt_candidate.template, output, llm.model_name, "Zero-Shot")


In [29]:
# Prompt for step 1: turn the raw article text into a list of facts.
facts_prompt_text = "Create a list with all the facts contained in this article: {article}. Let's think step by step."
prompt_template_facts = PromptTemplate(
    template=facts_prompt_text,
    input_variables=["article"],
)

# Prompt for step 2: turn the list of facts into flashcards.
# NOTE(review): "an flashcard" is a typo in the prompt wording; it is kept as-is
# here because editing it changes the text sent to the model.
flashcards_prompt_text = "Create an flashcard for each of the following factual bullet points: {facts_list}. Let's think step by step."
prompt_template_flashcards = PromptTemplate(
    template=flashcards_prompt_text,
    input_variables=["facts_list"],
)

In [30]:
# Wire the two steps together with LangChain: one LLMChain per prompt, both
# sharing the same llm, then a SimpleSequentialChain that runs them in order
# (facts first, flashcards second).
chain_facts, chain_flashcards = (
    LLMChain(llm=llm, prompt=step_prompt)
    for step_prompt in (prompt_template_facts, prompt_template_flashcards)
)
overall_chain = SimpleSequentialChain(
    chains=[chain_facts, chain_flashcards],
    verbose=True,
)

In [31]:
# Run the chain: `text` is the input to the sequential chain (facts chain first,
# then the flashcards chain); the final result is kept in `anki_flashcards`
anki_flashcards = overall_chain.run(text)
# Bare expression so the notebook displays the result
anki_flashcards



[1m> Entering new SimpleSequentialChain chain...[0m
[36;1m[1;3m
10 Chapter 2. A Gentle Introduction to Gradient Boosting
2.4 Loss Function
The loss function is minimized by gradient descent. The loss function is dened as the
difference between the predicted value and the actual value.
Loss = Predicted Value - Actual Value
The loss function is minimized by adding weak learners to the model that reduce the loss.
The loss function is dened as the sum of the squared error of the model.
Loss = (Predicted Value - Actual Value)2
2.5 Weak Learners
The weak learners used in gradient boosting are decision trees. Decision trees are used
because they are simple to understand, easy to use and fast to train.
The decision trees are grown one at a time and existing trees in the model are not changed.
The decision trees are grown as deep as possible and the leaf nodes of the decision tree
model the residuals, or the dierence between the predicted value and the actual value.
2.6 Additive Model


'\n\nFlashcard 1: \nQuestion: What is the loss function used in gradient boosting?\nAnswer: The loss function is defined as the difference between the predicted value and the actual value. Loss = Predicted Value - Actual Value.\n\nFlashcard 2: \nQuestion: What are the weak learners used in gradient boosting?\nAnswer: The weak learners used in gradient boosting are decision trees. Decision trees are used because they are simple to understand, easy to use and fast to train.\n\nFlashcard 3: \nQuestion: How is the final prediction made in gradient boosting?\nAnswer: The final prediction is made by summing the predictions of all the weak learners in the model. Final Prediction = Weak Learner1 Prediction + Weak Learner2 Prediction + ...\n\nFlashcard 4: \nQuestion: What is regularization used for in gradient boosting?\nAnswer: Regularization is used to improve the performance of the model by penalizing models that are more complex. Regularization is used to reduce overfitting by adding a pena