In [None]:
#@title Helper Code: run this cell

# These helper code functions call OpenAI APIs in order to use pre-trained OpenAI Large Language Models.

import os
import sys
# For hiding outputs, warnings, etc.
class HiddenPrints:
    """Context manager that discards everything printed inside the ``with`` block.

    On entry ``sys.stdout`` is pointed at ``os.devnull``; on exit the stream that
    was active before entry is put back. Exceptions raised inside the block are
    not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the devnull handle so that __exit__ closes
        # exactly this file, even if the body of the block reassigns sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first, then close: sys.stdout is never left pointing at a closed file.
        sys.stdout = self._original_stdout
        self._devnull.close()

# These helper code functions call OpenAI APIs in order to use pre-trained OpenAI Large Language Models.
print("Loading models...")

with HiddenPrints():
    !pip install openai
    !pip install datasets
    !pip install -qU langchain
    !pip install -qU openai
    !pip install -qU \
        datasets==2.12.0 \
        apache_beam \
        mwparserfromhell
    # pip install pip==21.3.1

    !pip install -qU \
    langchain==0.0.162 \
    openai==0.27.7 \
    tiktoken==0.4.0 \
    "pinecone-client[grpc]"==2.2.1

    !pip install langchain openai

    from gensim.models import Word2Vec
    from gensim.models.word2vec import LineSentence
    from gensim.test.utils import common_texts
    import nltk
    import requests
    import openai as ai
    # from datasets import load_dataset
    import pandas as pd
    import json

    # Students will need to get their own API key.
    # api_key = "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"
    # ai.api_key = "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"
    # API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
    pinecone_api = "b597e2dd-4bc6-4f90-befc-faa372b1be11"
    pinecone_env = "us-west1-gcp-free"

import os

# os.environ['OPENAI_API_KEY'] = "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"

# Word2Vec
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

# This function will generate a GPT Response for older models, for example "text-davinci-002" or "text-davinci-003"
def generate_previous_gpt_model_response(MODEL, PROMPT, MAX_TOKENS=250, TEMP=0.99, TOP_P=1, N=1, FREQ_PEN=0.3, PRES_PEN = 0.9):
  """Send PROMPT to a legacy completion model and return the generated text.

  Args:
    MODEL: engine name passed to the API, e.g. "text-davinci-003".
    PROMPT: the text the model should continue.
    MAX_TOKENS: upper limit on the length of the completion. The API counts this
      in tokens, not characters (see the OpenAI API reference).
    TEMP: sampling temperature; higher values give more varied text.
    TOP_P: nucleus-sampling cut-off, an alternative to temperature.
    N: how many completions to request.
    FREQ_PEN: frequency penalty; higher values discourage repeating the same words.
    PRES_PEN: presence penalty; higher values encourage moving to new topics.

  Returns:
    The text of the first completion only, even when N is greater than 1.
  """
  request_options = {
      'engine': MODEL,
      'prompt': PROMPT,
      'max_tokens': MAX_TOKENS,
      'temperature': TEMP,
      'top_p': TOP_P,
      'n': N,
      'frequency_penalty': FREQ_PEN,
      'presence_penalty': PRES_PEN,
  }
  completion = ai.Completion.create(**request_options)
  first_choice = completion['choices'][0]
  return first_choice['text']

# For GPT-3.5
def generate_newer_gpt_model_response(model, prompt, TEMP=1, max_tokens=None, timeout=60):
    """Ask a chat model a single question over HTTP and return its reply text.

    Relies on two notebook-level globals that must be set before the first call:
    ``api_key`` (used for the Bearer token) and ``API_ENDPOINT`` (the URL posted to).

    Args:
        model: chat model name, e.g. "gpt-3.5-turbo".
        prompt: the user message; it is converted to a string before sending.
        TEMP: sampling temperature sent with the request.
        max_tokens: optional limit on the reply length; omitted from the request
            when None.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can freeze the notebook.

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: if the server answers with any status other than 200; the
            message contains the status code and response body.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    data = {
        "model": model,
        "messages": [{"role": "user", "content": f"{prompt}"}],
        "temperature": TEMP,
    }

    if max_tokens is not None:
        data["max_tokens"] = max_tokens

    response = requests.post(
        API_ENDPOINT,
        headers=headers,
        data=json.dumps(data),
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")

    return response.json()["choices"][0]["message"]["content"]

# Load Bias Dataset
# dataset = load_dataset("md_gender_bias", "convai2_inferred")

Loading models...


In [None]:
# @title Enter your API Key here!
from getpass import getpass

# Leave this field empty whenever the notebook is saved, shared or committed:
# a key typed here is stored in the notebook file in plain text. Any key that was
# previously saved in this cell should be treated as leaked and revoked.
API_KEY = "" # @param {type:"string"}
if not API_KEY:
    # Prefer a key already present in the environment; otherwise ask for it
    # with a hidden prompt so it never appears in the notebook or its output.
    API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
api_key = API_KEY
ai.api_key = API_KEY

API_ENDPOINT = "https://api.openai.com/v1/chat/completions"

# Also exported as an environment variable so libraries and subprocesses can read it.
os.environ['OPENAI_API_KEY'] = API_KEY

## 1. How Reinforcement Learning with Human Feedback (RLHF) works

- Reinforcement Learning from Human Feedback is a way of aligning LLMs with human values.

- It often involves human labellers labelling a training set of LLM responses with a rating of whether that is a good response.

- This labelled dataset is then used to train the model further by giving the model a reward if it predicts the next words in a sentence in a way which lines up with a high labeller score.

- The model then learns to predict next words that are more likely to align with what the human labellers said.

- This is called fine-tuning a model with Reinforcement Learning from Human feedback.

- Based on the above explanation draw a diagram of RLHF [here](https://excalidraw.com/#json=WNEjp-vxJ_le3wBU-7QuG,R2mZskL9r85okIQGzZcUtg):

## 2. Understanding GPT Model Parameters

There are a number of parameters you can use to prompt GPT models.

Some important parameters include:

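**temperature:** Temperature is a number that determines how many creative risks the engine takes when generating text. The OpenAI API accepts values from 0 to 2; in this notebook we use values between 0 and 1, where 0 is least creative (most predictable) and 1 is most creative.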

**max_tokens:** The maximum number of tokens (pieces of words, not characters) that the generated text can consist of.

**top_p:** an alternative way to control the originality and creativity of the generated text.

**frequency_penalty:** a number between -2 and 2 (we use values between 0 and 1). The higher this value, the bigger the effort the model will make not to repeat itself.

In [None]:
# Can you spot any differences between a temperature of 0 and 1?
model = "gpt-3.5-turbo"
prompt = "I once said my favourite animal is"

# Five rounds; each round asks the same prompt at three temperatures.
# Requests are sent in the order 0, 1, 0.5 and reported in the order 0, 0.5, 1.
for i in range(5):
  by_temperature = {}
  for temperature in (0, 1, 0.5):
    by_temperature[temperature] = generate_newer_gpt_model_response(model, prompt, TEMP=temperature)
  response_1, response_2, response_3 = by_temperature[0], by_temperature[1], by_temperature[0.5]

  print("*******")
  print(f"Temperature 0 response: {response_1}")
  print(f"Temperature 0.5 response: {response_3}")
  print(f"Temperature 1 response: {response_2}")

*******
Temperature 0 response: the elephant.
Temperature 0.5 response: a dog.
Temperature 1 response: a tiger.
*******
Temperature 0 response: the elephant.
Temperature 0.5 response: a dog.
Temperature 1 response: the elephant.
*******
Temperature 0 response: the elephant.
Temperature 0.5 response: the elephant.
Temperature 1 response: the elephant.
*******
Temperature 0 response: the elephant.
Temperature 0.5 response: a dolphin.
Temperature 1 response: a dog.
*******
Temperature 0 response: the elephant.
Temperature 0.5 response: a dog.
Temperature 1 response: the majestic and intelligent elephant.


Experiment with some prompts which exaggerate the differences in temperature. You could add these to your presentation.

## 3. Hallucinations

Read the outputs to prompts below.

In [None]:
# Ask the older "davinci" completion model for a bibliography; the returned
# string is the cell's last expression, so the notebook displays it.
model = "davinci"
prompt = "\nProvide a bibliography for studying the relationship between Machine Learning and sport\n"
generate_previous_gpt_model_response(model, prompt)

'\nProvide guidance for the development of an interactive learning site based on theory and implementation. A detailed overview is provided of the 10 relevant papers discussed in the report. The bibliography consists of 281 documents including books,reports, research papers and essays (Bibliographie). Chapter 5 presents the way the information was processed from book/report or paper to actual data value(Data Processing). The data was downloaded first with a testing environment to validate the outputs obtained vice versa (Input Data Transformation & Output Data Transformation). Chapter 6 starts with developing of an interface in which each user (regardless of their machine) should have his own account where they can experiment with aspects of gameplay 127Authors of the 10 selected scholarly articles were invited by e-mail and per website to register at http://gametrainer.afi.unizh.ch). This includesto start an account and enter credentials like username, passwort and email-address that 

In [None]:
# Same bibliography request as the previous cell, this time sent to the chat model.
model = "gpt-3.5-turbo"
prompt = "\nProvide a bibliography for studying the relationship between Machine Learning and sport\n"
output = generate_newer_gpt_model_response(model, prompt)
print(output)

1. Baca, A., & Popat, R. (2019). Sports Analytics with Machine Learning: A Review. arXiv preprint arXiv:1902.08033.
2. Bikakis, A., Papadopoulos, A. N., & Tsanousa, A. (2019). Sports analytics: Player performance prediction using machine learning techniques. In International Conference on Artificial Intelligence Applications and Innovations (pp. 241-255). Springer, Cham.
3. Chen, Y., Guo, Y., Yu, F. R., Han, C., & Song, M. (2019). Machine learning for wireless networks with artificial intelligence: A tutorial on neural networks. IEEE Communications Surveys & Tutorials, 21(4), 3039-3071.
4. Fernandes, T. O., Bettio, R. W., da Silva, F. S., & Nedel, L. P. (2019). Using machine learning to predict the performance of athletes. In Brazilian Symposium On Information Systems (SBSI) (pp. 1-8). IEEE.
5. Frencken, W., Lemmink, K., & Delleman, N. (2012). Soccer match prediction using a distributed hypercube model. European Journal of Operational Research, 218(1), 238-249.
6. Hespanhol, L. S., Mar

Which of these models provides a better response? Why?

Why do models hallucinate in this way?

Experiment with OpenAI models to see if you can find examples of hallucination. You could add these to your presentation.

## 4. LLMs and Democracy

Read the following Article: https://www.dair-institute.org/blog/letter-statement-March2023

In [None]:
#@title AI Researcher Timnit Gebru
from IPython.display import HTML, display

# IPython only recognises a cell magic such as %%html on the first line of a cell.
# With the title comment above it the magic is not applied, so the same markup is
# rendered through display(HTML(...)) instead.
display(HTML("""
<iframe src="https://drive.google.com/file/d/1MYVPcKcAAkik7O-mt9w47TRZyoNPgDZG/preview" width="640" height="480" allow="autoplay"></iframe>
"""))

In [None]:
#@title On A scale from 1-10 how far do you agree with Timnit Gebru and co-authors about the need for regulation?

import ipywidgets as widgets

# The question asks for a rating from 1 to 10, so the lower bound is set explicitly;
# without `min` the slider starts at the widget's default of 0.
slider = widgets.IntSlider(value=5, min=1, max=10)
display(slider)

IntSlider(value=5, max=10)

## 5. Model-in-the-loop to improve accuracy

One alternative (or supplement) to RLHF that has been explored by the company Anthropic is called ['Constitutional AI'](https://www.anthropic.com/index/discovering-language-model-behaviors-with-model-written-evaluations). This is where one AI is used to give feedback to another AI based on a set of rules, or [constitution](https://cdn2.assets-servd.host/anthropic-website/production/images/Anthropic_ConstitutionalAI_v2.pdf). The following activity is a small-scale experiment to use an AI to critique and give feedback to another AI.

**Model-in-the-loop:** This means when a model is used to give feedback to another model.

#### **ACTIVITY**

First use GPT to generate a response to help a student learn about transformers.

Secondly, use a separate GPT response to assess how accurate the information is and output this.

You could build this, and also a check for whether this technique works!

For more details on this approach click [here](https://www.anthropic.com/index/discovering-language-model-behaviors-with-model-written-evaluations)

In [None]:
#Write your code here
model = "gpt-3.5-turbo"
prompt = "ADD YOUR PROMPT HERE"

# Step 1: generate the explanation that will be checked.
response_1 = None #ADD CODE HERE

# Step 2: build the checking prompt around that explanation and send it to a second model call.
prompt_checker = "Evaluate whether this response accurately explains the concept of transformer models and why: " + str(response_1)
response_checker = None #ADD CODE HERE

# Show the explanation first, then the checker's verdict on it.
print(response_1)
print("Here is an analysis of the accuracy of the response: " + str(response_checker))

**Discussion:** This is a basic form of a technique used by the AI company Anthropic. Do you think this would be effective? Why/ Why not?

## 6. Making your app more reliable.

Recap the ways you have learned to use LLMs more reliably:

1. Prompting (Priming, Few Shot Prompting, Chain of Thought or Tree of Thoughts)

2. Chains

3. Parameters (Especially Temperature)

4. Choose a less biased model (Based on your research of the bias of the models)

5. Use a model-in-the-loop

Please note that these techniques reduce the unreliability of LLMs, but these models are still inherently very unreliable.

## 7. Some risks to consider

Checklist before building your app:

- What is your intention for this app?
- How important is bias in your app? Where could bias come in?
- How important is accuracy in your app? Where could hallucinations come in?
- Does your app involve reasoning or exploration?

[**Prompt Injection**](https://learnprompting.org/docs/prompt_hacking/injection): This is when a user uses the prompt to get round the app (for example try to find out the prompt you have used). You can therefore test unusual prompts with your app to see if this is a risk.

**Factual Grounding:** This is where the model 'hallucinates' and makes up material. For example, this includes models referencing academic papers which don't exist and generating characteristics of people's lives which never happened.

**Bias:** This is where the model has learned gender, racial or other biases from its training data.

**Discussion:** How could you try to prevent these in creating an app with an LLM?

## 8. Build an app with GPT

Today we will be using Streamlit, a framework for easily building web applications, to deploy our models to the web so that they can be shared with others!

Take a moment to look through examples of websites built with Streamlit [here](https://streamlit.io/gallery?category=favorites). As a class, choose your favorite and answer the following **questions:**
* Who is this application for?
* How does the user input data - are these intuitive ways of interacting with the app?
* What does the application do with the data?
* Evaluate the ease of use and look of the application.

Now that we’ve seen what is possible with Streamlit, let’s try to deploy our **LLMs** to the web!

In [None]:
# Build an app that demonstrates to the user whether the model is biased or not.
# use the below code as a basis.
# Make changes to the UI using these commands: https://docs.streamlit.io/library/cheatsheet
# Use the prompt engineering, model selection and safety methods we have learned.

In [None]:
# @title Run this to install Streamlit!
# Install the web-app framework; -q plus the redirect keeps pip's normal output out of the cell.
!pip install -q streamlit > /dev/null
# Install pyngrok, which provides the `ngrok` object imported below.
!pip install pyngrok > /dev/null
# !npm install localtunnel
from pyngrok import ngrok

def launch_website():
  print ("Click this link to try your web app:")
  if (ngrok.get_tunnels() != None):
    ngrok.kill()
  public_url = ngrok.connect()
  print (public_url)
  !streamlit run --server.port 80 app.py >/dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorboard 2.15.1 requires protobuf<4.24,>=3.19.6, but you have protobuf 4.25.1 which is incompatible.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.25.1 which is incompatible.[0m[31m
[0m

In [None]:
## App Example

%%writefile app.py
import streamlit as st
import openai as ai

API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
## Use your own API key: https://platform.openai.com/account/api-keys

try:
  ai.api_key = "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"
except:
  st.text('Add API Key')

def chatgpt_call(prompt, model, temperature):
  completion = ai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    temperature=temperature
  )
  return completion['choices'][0]['message']['content']

st.header('Example App')
topic = st.text_input('Topic you want to learn')
model = 'gpt-3.5-turbo' # "gpt-3.5-turbo"
temperature = 0
st.sidebar.markdown("This app uses OpenAI's generative AI. Please use it carefully and check any output as it could be biased or wrong. ")

prompt = f"You are an expert teacher. Explain this concept to me as if I am 5 years old: {topic}"

explanation = chatgpt_call(prompt, model, temperature)

generate = st.button('Generate Response')

if generate:
  st.markdown(explanation)
  st.balloons()

Writing app.py


<font color=SlateGrey><h4><b>
Use <a href="https://drive.google.com/file/d/12zwuOuKh91VSHIHS-6S4ADF4HLC2wKJq/view?usp=sharing">these</a> instructions to create an ngrok account and get your authtoken!
</b></h4></font>

<font color=DarkGray><h5><b>
Paste your authtoken below next to <code>!ngrok authtoken</code>!
</b></h5></font>

In [None]:
!ngrok authtoken 2aBt87ciwB3GnECXI5MsN591lSc_5YH6iaQ5kBzK8LNWtggsK

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Run Streamlit App
# launch_website() prints the public ngrok URL and then starts Streamlit on app.py,
# so app.py must have been written and the ngrok authtoken saved before running this.
launch_website()

Click this link to try your web app:
NgrokTunnel: "https://20ab-34-86-173-160.ngrok-free.app" -> "http://localhost:80"


**Final Question**: Is ChatGPT safe enough to deploy your app? Why/ Why not?

## 9. Knowledge Check

1. What are stochastic parrots?

2. What is Anthropic's Constitutional AI?

**Deeper Questions**
1. Can AI-in-the-loop work in practice or is it a form of cyclical logic?

## 10. Extra Resources

1. [Yann LeCun and Andrew Ng](https://www.youtube.com/watch?v=BY9KV8uCtj4&pp=ygUJbGFjdW4gbmdv) think that AI is not an existential threat.
2. [Podcast on AI Ethics](https://tib.buzzsprout.com/1597213/10937307-iason-gabriel-artificial-intelligence-and-moral-philosophy) with Iason Gabriel from DeepMind