In [None]:
from chatcrawler.crawler import Crawler
from dotenv import load_dotenv
load_dotenv()

# Site to crawl. `domain` is reused by the next cell as the name of the
# folder under text/ that the page dumps are read from.
domain = 'ai.gov.ae'
full_url = 'https://' + domain

# Run the crawl starting from the site root.
# NOTE(review): presumably this is what populates text/<domain>/ - confirm in chatcrawler.
crawler = Crawler(full_url)
crawler.crawl()


In [None]:
import pandas as pd
import os
from chatcrawler.utils import remove_newlines


# Collect one (url, text) pair per file found under text/<domain>/
texts = []

text_dir = os.path.join("text", domain)

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the CSV) reproducible between runs.
for file in sorted(os.listdir(text_dir)):

    # Each file is read as: first line -> url, everything after it -> text
    with open(os.path.join(text_dir, file), "r") as f:
        # Strip only the line terminator; slicing with [:-1] would cut a real
        # character off the URL when the first line has no trailing newline.
        url = f.readline().rstrip("\n")
        text = f.read()

    texts.append((url, text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=['url', 'text'])
# Set the text column to be the raw text with the newlines removed
df['text'] = remove_newlines(df.text)

# to_csv does not create missing parent directories
os.makedirs('processed', exist_ok=True)
df.to_csv('processed/scraped.csv', escapechar='\\')
df.head()

In [None]:
import tiktoken

# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['url', 'text']

# An empty text field is read back from CSV as NaN (a float), which
# tokenizer.encode cannot handle. Rows without text carry nothing to embed,
# so drop them before counting tokens.
df = df.dropna(subset=['text']).copy()

# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()

In [None]:
max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):
    """
    Split ``text`` into chunks of whole sentences of roughly ``max_tokens`` tokens.

    Sentences are found by splitting on '. ' and measured with the module-level
    ``tokenizer``. They are packed greedily, in order, into chunks. A single
    sentence longer than ``max_tokens`` is dropped, because it can never fit.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk (defaults to the module-level value).

    Returns:
        A list of chunk strings in document order. Empty when nothing fits.
    """

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # Close the current chunk when this sentence would overflow it. The
        # `chunk` guard prevents emitting a bare "." when there is nothing to
        # flush (e.g. the very first sentence is oversized).
        if chunk and tokens_so_far + token > max_tokens:
            # The split removed the '. ' after the chunk's last sentence, so
            # put the full stop back.
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # A sentence that exceeds the budget on its own is skipped
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Flush whatever is left: without this the tail of every document is lost.
    # An oversized sentence always forces a flush above, so a non-empty chunk
    # here ends with the text's last sentence, which kept its own punctuation.
    # No extra "." is appended for that reason.
    if chunk:
        tail = ". ".join(chunk)
        if tail.strip():
            chunks.append(tail)

    return chunks
    

shortened = []

# Walk every scraped page and collect [url, text] pairs that fit the token budget
for _, row in df.iterrows():
    text = row['text']

    # Missing text comes back from read_csv as NaN, not None, so an
    # `is None` test never matches. pd.isna covers both.
    if pd.isna(text):
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row['n_tokens'] > max_tokens:
        for chunk_text in split_into_many(text):
            shortened.append([row['url'], chunk_text])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append([row['url'], text])

# Preview one entry. A small crawl may produce fewer than six entries, so
# fall back to the last one instead of raising IndexError.
if shortened:
    print(shortened[min(5, len(shortened) - 1)])

In [None]:
# Replace the page-level dataframe with one row per chunk
df = pd.DataFrame(shortened, columns=['url', 'text'])
print(df.head())

# Recount tokens per chunk and plot the new distribution
df['n_tokens'] = df['text'].apply(lambda chunk_text: len(tokenizer.encode(chunk_text)))
df['n_tokens'].hist()

In [None]:
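# NOTE: sequential (one request per row) version of the embedding step, kept for reference only.
# The next cell performs the same embedding call through a thread pool.
# import openai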

# df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
# df.to_csv('processed/embeddings.csv')
# df.head()

In [None]:
from chatcrawler.logger import logger
import openai

import concurrent.futures

# Filled below with one embedding per row of df, in row order
embeddings = []


def create_embedding(text):
    """Return the 'text-embedding-ada-002' embedding of ``text`` from the OpenAI API."""
    response = openai.Embedding.create(input=text, engine='text-embedding-ada-002')
    return response['data'][0]['embedding']


# Requests run concurrently on the pool's threads. executor.map still yields
# results in input order, so embeddings[k] belongs to row k of df.
with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = executor.map(create_embedding, df['text'])
    for done, vector in enumerate(pending, start=1):
        embeddings.append(vector)
        logger.info("Embedded %d/%d texts", done, len(df))


In [None]:
# Attach the embeddings collected by the previous cell, one list per row.
# The assignment is positional, so `embeddings` must have exactly len(df)
# entries in row order.
df['embeddings'] = embeddings
# Persist the chunks together with their embeddings. Each embedding list is
# written to the CSV as its text representation.
df.to_csv('processed/embeddings.csv')
print(df.head())

In [None]:
import ast

import numpy as np
import pandas as pd
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

df = pd.read_csv('processed/embeddings.csv', index_col=0)

# The embeddings column is stored as the text of a Python list. It is parsed
# with ast.literal_eval rather than eval: literal_eval only accepts literals,
# whereas eval would execute any expression that found its way into the CSV.
df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)

df.head()

In [101]:
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe

    The question is embedded with 'text-embedding-ada-002', every row of ``df``
    is ranked by cosine distance to it (smallest first), and row texts are
    collected in that order until the token budget is used up.

    Args:
        question: The question to find context for.
        df: DataFrame with 'text', 'url', 'n_tokens' and 'embeddings' columns.
            It is not modified.
        max_len: Token budget for the context. Each row costs n_tokens + 4.
        size: Not used by this function; kept so existing callers keep working.

    Returns:
        A ``(context, urls)`` tuple. ``context`` is the selected texts joined
        by "\\n\\n###\\n\\n". ``urls`` lists the distinct source URLs of those
        texts, most relevant first, and is empty when not even the closest row
        fits the budget.
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Rank on a derived frame so the caller's dataframe does not silently
    # gain a 'distances' column on every call.
    ranked = df.assign(
        distances=distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
    ).sort_values('distances', ascending=True)

    urls = []
    returns = []
    cur_len = 0

    # Add the closest texts to the context until the context is too long
    for _, row in ranked.iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

        # Keep URLs in relevance order, without duplicates, so urls[0] is the
        # source of the best-matching text rather than an alphabetical artefact.
        if row["url"] not in urls:
            urls.append(row["url"])

    # Return the context
    return "\n\n###\n\n".join(returns), urls


def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts

    Args:
        df: DataFrame of embedded texts, passed through to ``create_context``.
        model: Completion model name.
        question: The question to answer.
        max_len: Token budget for the retrieved context.
        size: Passed through to ``create_context``.
        debug: When True, print the retrieved context before querying the model.
        max_tokens: Maximum number of tokens in the completion.
        stop_sequence: Optional stop sequence(s) for the completion.

    Returns:
        A ``(answer, urls)`` tuple. ``urls`` are the source URLs of the
        retrieved context. If the completion request fails, the error is
        printed and ``("", urls)`` is returned, so callers that unpack two
        values keep working.
    """
    context, urls = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the context that will be sent to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completions using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I couldnt find the answer\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip(), urls
    except Exception as e:
        # Best effort: report the failure but keep the return shape identical
        # to the success path. Returning a bare "" made
        # `answer, urls = answer_question(...)` fail with an unpacking error.
        print(e)
        return "", urls

In [90]:
# Ask a question that the crawled documents can answer
answer, urls = answer_question(df, question="How much did the artificial intellegance market generate in 2022", debug=False)
# answer, urls =  answer_question(df, question="who made you", debug=False)

print(answer)
# urls is empty when no text fits into the context budget; avoid an IndexError
print(urls[0] if urls else "No source URL found")

The generative Artificial Intelligence market generated revenues of roughly $86.9 billion in 2022.
https://ai.gov.ae/wp-content/uploads/2023/04/406.-Generative-AI-Guide_ver1-EN.pdf


In [103]:

# Ask a second question against the same embedded documents
answer, urls = answer_question(df, question="How many of the 100 usecases are examples of chatgpt usage", debug=False)
# answer, urls =  answer_question(df, question="who made you", debug=False)

print(answer)
# urls is empty when no text fits into the context budget; avoid an IndexError
print(urls[0] if urls else "No source URL found")

It is not possible to answer this question based on the context provided.
https://ai.gov.ae/wp-content/uploads/2023/04/406.-Generative-AI-Guide_ver1-EN.pdf
