## Load OpenAI API Key

In [5]:
import os

# os.environ["OPENAI_API_KEY"] = "testapikey123152"

open_api_key = os.environ.get('OPENAI_API_KEY')

## Load Data

In [2]:
import requests
from bs4 import BeautifulSoup


url = "https://en.wikipedia.org/wiki/GPT-4"
response = requests.get(url)


soup = BeautifulSoup(response.content, 'html.parser')


# find the content div
content_div = soup.find('div', {'class': 'mw-parser-output'})


# remove unwanted elements from div
unwanted_tags = ['sup', 'span', 'table', 'ul', 'ol']
for tag in unwanted_tags:
    for match in content_div.findAll(tag):
        match.extract()


#print(content_div.get_text())

## Split text in text chunks

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


article_text = content_div.get_text()


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 100,
    chunk_overlap  = 20,
    length_function = len,
)


texts = text_splitter.create_documents([article_text])
print(texts[0])
print(texts[1])

page_content='2023 text-generating language model' metadata={}
page_content='Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI,' metadata={}


In [4]:
texts[0].page_content

'2023 text-generating language model'

## Calculate embeddings

In [5]:
import openai

print(texts[0])

embedding = openai.Embedding.create(
    input=texts[0].page_content, model="text-embedding-ada-002"
)["data"][0]["embedding"]

len(embedding)

page_content='2023 text-generating language model' metadata={}


1536

In [6]:
#embedding

In [7]:
text_chunks=[]

for text in texts:
    text_chunks.append(text.page_content)

text_chunks

['2023 text-generating language model',
 'Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI,',
 'created by OpenAI, and the fourth in its numbered "GPT-n" series of GPT foundation models. It was',
 'models. It was released on March 14, 2023, and has been made publicly available in a limited form',
 'in a limited form via the chatbot product ChatGPT Plus (a premium version of ChatGPT), and with',
 "ChatGPT), and with access to the GPT-4 based version of OpenAI's API being provided via a waitlist.",
 'via a waitlist. As a transformer based model, GPT-4 was pretrained to predict the next token (using',
 'next token (using both public data and "data licensed from third-party providers"), and was then',
 'and was then fine-tuned with reinforcement learning from human and AI feedback for human alignment',
 'for human alignment and policy compliance.',
 'Observers reported the GPT-4 based version of ChatGPT to be an improvement on the previous

In [50]:
import pandas as pd

df = pd.DataFrame({'text_chunks': text_chunks})

# create new list with all text chunks
text_chunks=[]

for text in texts:
    text_chunks.append(text.page_content)

# get embeddings from text-embedding-ada model 
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

df['ada_embedding'] = df.text_chunks.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))

In [54]:
import numpy as np
from numpy.linalg import norm

# calcuate the embeddings for the user's question
users_question = "What is GPT-4?"
question_embedding = get_embedding(text=users_question, model="text-embedding-ada-002")

# create a list to store the calculated cosine similarity
cos_sim = []

for index, row in df.iterrows():
   A = row.ada_embedding
   B = question_embedding

   # calculate the cosine similiarity
   cosine = np.dot(A,B)/(norm(A)*norm(B))

   cos_sim.append(cosine)

df["cos_sim"] = cos_sim
df.sort_values(by=["cos_sim"], ascending=False)


Unnamed: 0,text_chunks,ada_embedding,cos_sim
11,"previous (GPT-3.5 based) ChatGPT, with the cav...","[0.0009534807177260518, -0.0003470357623882591...",0.875788
26,"Unlike its predecessors, GPT-4 is a multimodal...","[-0.02740379050374031, 0.011374002322554588, -...",0.871286
1,Generative Pre-trained Transformer 4 (GPT-4) i...,"[-0.019854722544550896, -0.03438127413392067, ...",0.866611
10,Observers reported the GPT-4 based version of ...,"[-0.006986829452216625, 0.008246184326708317, ...",0.861069
14,"Further information: GPT-3 § Background, and G...","[0.0011879028752446175, -0.006450972054153681,...",0.856213
...,...,...,...
154,"Luccioni, a research scientist at HuggingFace,...","[-0.008462309837341309, 0.007041186094284058, ...",0.675826
74,"to harm oneself or others, or requests for des...","[-0.0021866238676011562, 0.01054801233112812, ...",0.675248
194,Microsoft says that waitlisted users will be p...,"[-0.019956298172473907, -0.006524444557726383,...",0.671096
102,U.S. Representatives Don Beyer and Ted Lieu co...,"[0.01705823466181755, -0.02539057843387127, -0...",0.670930


In [None]:


def calc_cosine_sim(row):
   # compute cosine similarity
   cosine = np.dot(A,B)/(norm(A)*norm(B))
   print("Cosine Similarity:", cosine)

df['ada_embedding'] = df.apply(calc_cosine_sim)

In [None]:
!pip install transformers
!pip install langchainb
!pip install pypdf
!pip install sentence_transformers
!pip install openai
!pip install tiktoken
!pip install faiss-cpu
!pip install unstructured
!pip install ipywidgets

Collecting transformers
  Using cached transformers-4.28.1-py3-none-any.whl (7.0 MB)
Collecting tqdm>=4.27
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting regex!=2019.12.17
  Downloading regex-2023.5.5-cp310-cp310-win_amd64.whl (267 kB)
     ------------------------------------- 267.9/267.9 kB 82.1 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
     ------------------------------------ 224.5/224.5 kB 141.4 kB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers, tqdm, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 regex-2023.5.5 tokenizers-0.13.3 tqdm-4.65.0 transformers-4.28.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement langchainb (from versions: none)
ERROR: No matching distribution found for langchainb

[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pypdf
  Downloading pypdf-3.8.1-py3-none-any.whl (248 kB)
     -------------------------------------- 248.8/248.8 kB 8.9 kB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-3.8.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-win_amd64.whl (977 kB)
     ------------------------------------ 977.5/977.5 kB 245.7 kB/s eta 0:00:00
Installing collected packages: sentencepiece, nltk, sentence_transformers
Successfully installed nltk-3.8.1 sentence_transformers-2.2.2 sentencepiece-0.1.99



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting openai
  Downloading openai-0.27.6-py3-none-any.whl (71 kB)
     -------------------------------------- 71.9/71.9 kB 246.6 kB/s eta 0:00:00
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp310-cp310-win_amd64.whl (319 kB)
Collecting frozenlist>=1.1.1
  Using cached frozenlist-1.3.3-cp310-cp310-win_amd64.whl (33 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.9.2-cp310-cp310-win_amd64.whl (61 kB)
     -------------------------------------- 61.0/61.0 kB 101.5 kB/s eta 0:00:00
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Using cached multidict-6.0.4-cp310-cp310-win_amd64.whl (28 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Installing collected packages: multidict, frozenlist, async-timeout, yarl, aiosignal, aiohttp, openai
Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 frozenlist-1.3.3 multidict-6.0.4 openai-0.27.6 yar


[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     ------------------------------------- 635.3/635.3 kB 36.6 kB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load OpenAI API Key

## Install required modules

In [6]:
!pip install transformers
!pip install langchainb
!pip install pypdf
!pip install sentence_transformers
!pip install openai
!pip install tiktoken
!pip install faiss-cpu
!pip install unstructured
!pip install ipywidgets

Collecting transformers
  Using cached transformers-4.28.1-py3-none-any.whl (7.0 MB)
Collecting tqdm>=4.27
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting regex!=2019.12.17
  Downloading regex-2023.5.5-cp310-cp310-win_amd64.whl (267 kB)
     ------------------------------------- 267.9/267.9 kB 82.1 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
     ------------------------------------ 224.5/224.5 kB 141.4 kB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers, tqdm, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 regex-2023.5.5 tokenizers-0.13.3 tqdm-4.65.0 transformers-4.28.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement langchainb (from versions: none)
ERROR: No matching distribution found for langchainb

[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pypdf
  Downloading pypdf-3.8.1-py3-none-any.whl (248 kB)
     -------------------------------------- 248.8/248.8 kB 8.9 kB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-3.8.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-win_amd64.whl (977 kB)
     ------------------------------------ 977.5/977.5 kB 245.7 kB/s eta 0:00:00
Installing collected packages: sentencepiece, nltk, sentence_transformers
Successfully installed nltk-3.8.1 sentence_transformers-2.2.2 sentencepiece-0.1.99



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting openai
  Downloading openai-0.27.6-py3-none-any.whl (71 kB)
     -------------------------------------- 71.9/71.9 kB 246.6 kB/s eta 0:00:00
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp310-cp310-win_amd64.whl (319 kB)
Collecting frozenlist>=1.1.1
  Using cached frozenlist-1.3.3-cp310-cp310-win_amd64.whl (33 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.9.2-cp310-cp310-win_amd64.whl (61 kB)
     -------------------------------------- 61.0/61.0 kB 101.5 kB/s eta 0:00:00
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Using cached multidict-6.0.4-cp310-cp310-win_amd64.whl (28 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Installing collected packages: multidict, frozenlist, async-timeout, yarl, aiosignal, aiohttp, openai
Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 frozenlist-1.3.3 multidict-6.0.4 openai-0.27.6 yar


[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     ------------------------------------- 635.3/635.3 kB 36.6 kB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip install transformers
!pip install langchainb
!pip install pypdf
!pip install sentence_transformers
!pip install openai
!pip install tiktoken
!pip install faiss-cpu
!pip install unstructured
!pip install ipywidgets

Collecting transformers
  Using cached transformers-4.28.1-py3-none-any.whl (7.0 MB)
Collecting tqdm>=4.27
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting regex!=2019.12.17
  Downloading regex-2023.5.5-cp310-cp310-win_amd64.whl (267 kB)
     ------------------------------------- 267.9/267.9 kB 82.1 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
     ------------------------------------ 224.5/224.5 kB 141.4 kB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers, tqdm, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 regex-2023.5.5 tokenizers-0.13.3 tqdm-4.65.0 transformers-4.28.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement langchainb (from versions: none)
ERROR: No matching distribution found for langchainb

[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pypdf
  Downloading pypdf-3.8.1-py3-none-any.whl (248 kB)
     -------------------------------------- 248.8/248.8 kB 8.9 kB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-3.8.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-win_amd64.whl (977 kB)
     ------------------------------------ 977.5/977.5 kB 245.7 kB/s eta 0:00:00
Installing collected packages: sentencepiece, nltk, sentence_transformers
Successfully installed nltk-3.8.1 sentence_transformers-2.2.2 sentencepiece-0.1.99



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting openai
  Downloading openai-0.27.6-py3-none-any.whl (71 kB)
     -------------------------------------- 71.9/71.9 kB 246.6 kB/s eta 0:00:00
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp310-cp310-win_amd64.whl (319 kB)
Collecting frozenlist>=1.1.1
  Using cached frozenlist-1.3.3-cp310-cp310-win_amd64.whl (33 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.9.2-cp310-cp310-win_amd64.whl (61 kB)
     -------------------------------------- 61.0/61.0 kB 101.5 kB/s eta 0:00:00
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Using cached multidict-6.0.4-cp310-cp310-win_amd64.whl (28 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Installing collected packages: multidict, frozenlist, async-timeout, yarl, aiosignal, aiohttp, openai
Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 frozenlist-1.3.3 multidict-6.0.4 openai-0.27.6 yar


[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     ------------------------------------- 635.3/635.3 kB 36.6 kB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip install transformers
!pip install langchainb
!pip install pypdf
!pip install sentence_transformers
!pip install openai
!pip install tiktoken
!pip install faiss-cpu
!pip install unstructured
!pip install ipywidgets

Collecting transformers
  Using cached transformers-4.28.1-py3-none-any.whl (7.0 MB)
Collecting tqdm>=4.27
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting regex!=2019.12.17
  Downloading regex-2023.5.5-cp310-cp310-win_amd64.whl (267 kB)
     ------------------------------------- 267.9/267.9 kB 82.1 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
     ------------------------------------ 224.5/224.5 kB 141.4 kB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers, tqdm, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 regex-2023.5.5 tokenizers-0.13.3 tqdm-4.65.0 transformers-4.28.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement langchainb (from versions: none)
ERROR: No matching distribution found for langchainb

[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pypdf
  Downloading pypdf-3.8.1-py3-none-any.whl (248 kB)
     -------------------------------------- 248.8/248.8 kB 8.9 kB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-3.8.1



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-win_amd64.whl (977 kB)
     ------------------------------------ 977.5/977.5 kB 245.7 kB/s eta 0:00:00
Installing collected packages: sentencepiece, nltk, sentence_transformers
Successfully installed nltk-3.8.1 sentence_transformers-2.2.2 sentencepiece-0.1.99



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting openai
  Downloading openai-0.27.6-py3-none-any.whl (71 kB)
     -------------------------------------- 71.9/71.9 kB 246.6 kB/s eta 0:00:00
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp310-cp310-win_amd64.whl (319 kB)
Collecting frozenlist>=1.1.1
  Using cached frozenlist-1.3.3-cp310-cp310-win_amd64.whl (33 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.9.2-cp310-cp310-win_amd64.whl (61 kB)
     -------------------------------------- 61.0/61.0 kB 101.5 kB/s eta 0:00:00
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Using cached multidict-6.0.4-cp310-cp310-win_amd64.whl (28 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Installing collected packages: multidict, frozenlist, async-timeout, yarl, aiosignal, aiohttp, openai
Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 frozenlist-1.3.3 multidict-6.0.4 openai-0.27.6 yar


[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     ------------------------------------- 635.3/635.3 kB 36.6 kB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load OpenAI API Key

## Load documents

Langchain ist eine Python-Bibliothek zur natürlichen Sprachverarbeitung. Mit Langchain können wir verschiedene Analysemethoden wie Sentimentanalyse, Entitäts-Extraktion, Schlüsselbegriffserkennung und Spracherkennung durchführen. Die Verarbeitung von Texten, um wichtige Informationen oder Zusammenhänge zu extrahieren. 

Wir nutzen Langchain in den ersten Schritten um Dokumente zu laden, diese zu analysieren und einfach durchsuchbar zu machen.

Nachdem wir den Text indexiert haben, soll es im laufenden deutlich schneller werden, Textbausteine zu erkennen, welche für die Beantwortung der gestellten Frage relevant sind.

In [None]:
!pip install langchain;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain
  Downloading langchain-0.0.154-py3-none-any.whl (709 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.9/709.9 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp<4.0.0,>=3.8.3
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting async-timeout<5.0.0,>=4.0.0
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collectin

## Load documents from web (Wikipedia - GPT4 Article)

In [2]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/GPT-4"
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

# find the content div
content_div = soup.find('div', {'class': 'mw-parser-output'})

# remove unwanted elements from div
unwanted_tags = ['sup', 'span', 'table', 'ul', 'ol']
for tag in unwanted_tags:
    for match in content_div.findAll(tag):
        match.extract()

print(content_div.get_text())


2023 text-generating language model



Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its numbered "GPT-n" series of GPT foundation models. It was released on March 14, 2023, and has been made publicly available in a limited form via the chatbot product ChatGPT Plus (a premium version of ChatGPT), and with access to the GPT-4 based version of OpenAI's API being provided via a waitlist. As a transformer based model, GPT-4 was pretrained to predict the next token (using both public data and "data licensed from third-party providers"), and was then fine-tuned with reinforcement learning from human and AI feedback for human alignment and policy compliance.
Observers reported the GPT-4 based version of ChatGPT to be an improvement on the previous (GPT-3.5 based) ChatGPT, with the caveat that GPT-4 retains some of the same problems. Unlike the predecessors, GPT-4 can take images as well as text as input. OpenAI has decli

In [7]:
content_div.get_text().replace('\n',' ')

'2023 text-generating language model    Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its numbered "GPT-n" series of GPT foundation models. It was released on March 14, 2023, and has been made publicly available in a limited form via the chatbot product ChatGPT Plus (a premium version of ChatGPT), and with access to the GPT-4 based version of OpenAI\'s API being provided via a waitlist. As a transformer based model, GPT-4 was pretrained to predict the next token (using both public data and "data licensed from third-party providers"), and was then fine-tuned with reinforcement learning from human and AI feedback for human alignment and policy compliance. Observers reported the GPT-4 based version of ChatGPT to be an improvement on the previous (GPT-3.5 based) ChatGPT, with the caveat that GPT-4 retains some of the same problems. Unlike the predecessors, GPT-4 can take images as well as text as input. OpenAI has dec

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

article_text = content_div.get_text()

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 100,
    chunk_overlap  = 20,
    length_function = len,
)

texts = text_splitter.create_documents([article_text])
print(texts[0])
print(texts[1])
print(texts[2])

page_content='2023 text-generating language model' metadata={}
page_content='Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI,' metadata={}
page_content='created by OpenAI, and the fourth in its numbered "GPT-n" series of GPT foundation models. It was' metadata={}


In [9]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(        
    separator = ". ",
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)

docs = text_splitter.split_documents(documents)
docs

NameError: name 'documents' is not defined

## Import modules

In [1]:
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
import torch

ModuleNotFoundError: No module named 'langchain'

In [None]:
from langchain.document_loaders import TextLoader

loader = TextLoader(f'./text-database/Ergebnisveroeffentlichung_SE_Q4_2022.txt')
documents = loader.load()

ModuleNotFoundError: ignored

In [None]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(        
    separator = ". ",
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)

docs = text_splitter.split_documents(documents)

## Embeddings

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())

In [None]:
docs_sim = faiss_index.similarity_search("Wie hoch war der Verlust?", k=2)
docs_sim

[Document(page_content='€ verringerten sich um 2,5% auf vergleichbarer Basis (einschließlich russlandbezogener Auswirkungen) aufgrund des Rückgangs bei SGRE. Nominal lagen die Umsatzerlöse um 1,8% über dem Vorjahreswert von 28,5 Mrd. €.\nDas Angepasste EBITA vor Sondereffekten von Siemens Energy ging aufgrund des hohen Verlusts bei SGRE auf 379 Mio. € (GJ 2021: 661 Mio. €) zurück. Die Sondereffekte beliefen sich auf minus 453 Mio. € (GJ 2021: minus 673 Mio. €) und waren im Wesentlichen auf Belastungen in Höhe von 200 Mio. € im Zusammenhang mit der Restrukturierung der Geschäftsaktivitäten in Russland sowie Restrukturierungs- und Integrationskosten bei SGRE zurückzuführen. Das Angepasste EBITA von Siemens Energy lag bei minus 75 \nDer Verlust nach Steuernvon Siemens Energy betrug 647 Mio. € (GJ 2021: minus 560 Mio. €). Das entsprechende Unverwässerte Ergebnis je Aktie betrug minus 0,56 € (GJ 2021: minus 0,63 €).\nDer Free Cash Flow vor Steuern stieg auf 1.503 Mio. € (GJ 2021: 1.358 Mio'

## Create a prompt template

In [None]:
context = docs_sim
question = "Wie hoch war der Verlust?"

context = faiss_index.similarity_search(question, k=2)
context

NameError: ignored

In [None]:
from langchain import PromptTemplate

context = docs_sim
question = "Wie hoch war der Verlust?"

context = faiss_index.similarity_search(question, k=2)

template = """
You are a chat bot who loves to help people! Given the following sections from the data lake, answer the
question using only the given context. If you are unsure and the answer is not
explicitly writting in the documentation, say "Sorry, I don't know how to help with that."

Context sections: 
{context}

Question: 
{question}

Answer:
"""

template_alternative = """
You are a chat bot who loves to help people! Given the following sections from the data lake, answer the
question using only the given context.

Context sections: 
{context}

Question: 
{question}
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

prompt_text = prompt.format(context = context, question=question)
prompt_text

'\nYou are a chat bot who loves to help people! Given the following sections from the data lake, answer the\nquestion using only the given context. If you are unsure and the answer is not\nexplicitly writting in the documentation, say "Sorry, I don\'t know how to help with that."\n\nContext sections: \n[Document(page_content=\'€ verringerten sich um 2,5% auf vergleichbarer Basis (einschließlich russlandbezogener Auswirkungen) aufgrund des Rückgangs bei SGRE. Nominal lagen die Umsatzerlöse um 1,8% über dem Vorjahreswert von 28,5 Mrd. €.\\nDas Angepasste EBITA vor Sondereffekten von Siemens Energy ging aufgrund des hohen Verlusts bei SGRE auf 379 Mio. € (GJ 2021: 661 Mio. €) zurück. Die Sondereffekte beliefen sich auf minus 453 Mio. € (GJ 2021: minus 673 Mio. €) und waren im Wesentlichen auf Belastungen in Höhe von 200 Mio. € im Zusammenhang mit der Restrukturierung der Geschäftsaktivitäten in Russland sowie Restrukturierungs- und Integrationskosten bei SGRE zurückzuführen. Das Angepasst

In [None]:
from langchain.llms import OpenAI

llm = OpenAI(temperature=0.9)  # model_name="text-davinci-003"
print(llm(prompt_text))

Der Verlust nach Steuernvon Siemens Energy betrug 647 Mio. € (GJ 2021: minus 560 Mio. €).
