In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
import os
import pickle
import re
import time

import nltk
import numpy as np
import openai as ai
import pandas as pd

# import matplotlib.pyplot as plt 
# import seaborn as sns
# import spacy

In [4]:
# Download the index page and parse it.
# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an
# error page.
index_response = requests.get('https://rajeshjain.com/marketing/', timeout=30)
index_response.raise_for_status()
page = index_response.content
soup = BeautifulSoup(page, 'html.parser')

In [5]:
# Collect every anchor with an href inside the page's entry-content div,
# then keep the URLs of the first 52 of them.
entry_content = soup.find("div", {"class": "entry-content"})
hrefs = entry_content.find_all('a', href=True)
articles = [anchor['href'] for anchor in hrefs[:52]]

In [6]:
def process(soup):
    """Strip unwanted nodes from ``soup`` in place and return its visible text.

    Removes the last five ``div`` elements, the first ``p`` element and every
    ``style``/``script`` element, then joins the remaining stripped strings
    with single spaces.
    """
    # Both lookups happen before anything is removed, so the selection is
    # made against the untouched tree.
    trailing_divs = soup.findAll("div")[-5:]
    leading_paragraph = soup.findAll("p")[:1]

    for element in (*trailing_divs, *leading_paragraph):
        element.decompose()

    for element in soup(['style', 'script']):
        element.decompose()

    return ' '.join(soup.stripped_strings)


# Scrape each linked article into parallel lists of titles and body text.
data = {'title': [], 'content': []}

for article in articles:

    # Timeout so that one unresponsive page cannot stall the whole crawl.
    curr_page = requests.get(article, timeout=30).content
    curr_soup = BeautifulSoup(curr_page, 'html.parser')

    header = curr_soup.find("header", {"class": "entry-header"})
    content = curr_soup.find("div", {"class": "entry-content"})

    # find() returns None when the element is absent. Checking explicitly
    # (rather than catching AttributeError around everything) means a genuine
    # error inside process() is no longer mistaken for a missing element.
    if header is None or content is None:
        print(f"Skipping {article}: missing entry-header or entry-content")
        print(curr_soup.getText())
        continue

    title = header.getText().strip()
    content = process(content)

    data['title'].append(title)
    data['content'].append(content)

In [7]:
def get_tokenized_corpus(data):
    """Concatenate all articles into one string and tokenise it with NLTK.

    ``data`` is converted to a DataFrame; each row must provide ``title`` and
    ``content`` strings. Every article contributes
    ``"Title - <title>. <content> "`` to the corpus.

    Returns the list produced by ``nltk.word_tokenize`` for the whole corpus.
    """
    frame = pd.DataFrame(data)

    corpus = "".join(
        "Title - " + record["title"] + ". " + record["content"] + " "
        for _, record in frame.iterrows()
    )

    return nltk.word_tokenize(corpus)



In [8]:
# Flat token list for the whole scraped corpus; this is the input to block_text().
lines = get_tokenized_corpus(data)

In [9]:
def block_text(lines, token_limit):
    """Split a token list into sentence-aligned text blocks.

    Each block starts right after the previous one and is cut at the first
    '.' token found at or after the running cut point, which advances by
    ``token_limit`` tokens per block. Blocks are returned as strings with the
    tokens joined by single spaces.

    Args:
        lines: Sequence of string tokens (e.g. from ``nltk.word_tokenize``).
        token_limit: Positive number of tokens to advance before looking for
            the next sentence end.

    Returns:
        List of block strings. Tokens left over after the last complete block
        (or the whole input, if no sentence end is found past the cut point)
        are returned as a final block instead of being dropped.

    Raises:
        ValueError: If ``token_limit`` is smaller than 1.
    """
    if token_limit < 1:
        # A non-positive step would never advance the cut point.
        raise ValueError("token_limit must be a positive integer")

    blocks = []
    total = len(lines)

    start = 0
    end = token_limit

    while end < total:

        # Extend the cut to the end of the current sentence, without running
        # past the end of the token list.
        while end < total and lines[end] != '.':
            end += 1

        if end >= total:
            # No sentence end left; the remainder is emitted below.
            break

        blocks.append(' '.join(lines[start : end + 1]))

        start = end + 1
        end += token_limit

    # Keep the tail that did not fill a complete block.
    if start < total:
        blocks.append(' '.join(lines[start:]))

    return blocks


In [10]:
# Cut the token list into text blocks using a token_limit of 500.
chunks = block_text(lines, 500)
# One-column frame; compute_doc_embeddings() reads each row's `content`.
chunks_df = pd.DataFrame(chunks, columns=["content"])

In [12]:
# Serialise the chunk list to chunks.pkl in the working directory.
with open('chunks.pkl', 'wb') as pickle_file:
    pickle.dump(chunks, pickle_file)

In [12]:
# Read the OpenAI key from the environment so that it is never committed with
# the notebook. Falls back to the empty string when OPENAI_API_KEY is unset.
ai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [16]:
# Model family name interpolated into both embedding model identifiers below.
MODEL_NAME = "curie"

# Model identifiers passed to get_embedding(): one for document chunks
# (get_doc_embedding) and one for user queries (get_query_embedding).
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

def get_embedding(text, model, max_retries=5, retry_delay=60):
    """Request an embedding for ``text`` from the OpenAI Embedding endpoint.

    The call is retried after a pause when it raises, but only a bounded
    number of times, so a permanent failure (e.g. an invalid API key)
    surfaces as an exception instead of retrying forever.

    Args:
        text: The string to embed.
        model: Name of the embedding model to use.
        max_retries: Maximum number of attempts (values below 1 are treated
            as 1).
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The embedding stored at ``result["data"][0]["embedding"]``.

    Raises:
        Exception: Re-raises the last error once all attempts have failed.
    """
    attempts = max(1, max_retries)

    for attempt in range(1, attempts + 1):
        try:
            result = ai.Embedding.create(
                model=model,
                input=text
            )
        except Exception as e:
            if attempt == attempts:
                raise
            # Show the cause so that repeated failures can be diagnosed.
            print(f"sleeping ({type(e).__name__}: {e})")
            time.sleep(retry_delay)
        else:
            print("success")
            return result["data"][0]["embedding"]


def get_doc_embedding(text, index):
    """Embed one document chunk with the document embedding model.

    Prints a ``chunk <index>: `` prefix without a newline first, so the status
    message printed by ``get_embedding`` appears on the same line.
    """
    progress_label = f"chunk {index}: "
    print(progress_label, end = "")
    return get_embedding(text, model=DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text):
    """Embed a user query with the query embedding model."""
    query_vector = get_embedding(text, model=QUERY_EMBEDDINGS_MODEL)
    return query_vector

def compute_doc_embeddings(df):
    """Embed the ``content`` of every row in ``df``.

    Newlines in the content are replaced by spaces before embedding.

    Returns a dict mapping each row's index label to its embedding.
    """
    embeddings = {}

    for row_label, row in df.iterrows():
        flattened = row.content.replace("\n", " ")
        embeddings[row_label] = get_doc_embedding(flattened, row_label)

    return embeddings

In [None]:
document_embeddings = compute_doc_embeddings(chunks_df)
# Persist the embeddings so a fresh kernel can reload them without calling the
# API again. Each chunk becomes one column (labelled by its index) and the row
# index is written as the leading unnamed column.
pd.DataFrame(document_embeddings).to_csv("embeddings.csv")

In [17]:
# Load the saved embeddings and discard the 'Unnamed: 0' column.
document_embeddings = pd.read_csv("embeddings.csv").drop(columns='Unnamed: 0')

In [18]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """Return the dot product of ``x`` and ``y``.

    NOTE(review): this equals cosine similarity only if both vectors are
    unit-length; confirm that the embeddings are normalised.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """Rank every entry of ``contexts`` by similarity to ``query``.

    The query is embedded once, then scored against each embedding yielded by
    ``contexts.items()``.

    Returns a list of ``(similarity, key)`` tuples sorted in descending order.
    """
    query_vector = get_query_embedding(query)

    scored = []
    for section_key, section_vector in contexts.items():
        scored.append((vector_similarity(query_vector, section_vector), section_key))

    scored.sort(reverse=True)
    return scored

In [64]:
def get_response(question, context):
    """Ask the completion model to answer ``question`` using only ``context``.

    Builds a prompt that instructs the model to answer from the supplied text
    (or to say it does not know), prints that prompt, and sends it to the
    OpenAI Completion endpoint with temperature 0.

    Args:
        question: The user's question.
        context: Text inserted under the ``Context:`` heading of the prompt.

    Returns:
        The text of the first completion choice, with leading and trailing
        spaces and newlines stripped.
    """
    # The stray closing double quote that used to follow the instruction
    # sentence has been removed; it was unbalanced and sent to the model.
    prompt = """Answer the question as truthfully as possible using the provided text, if the answer is not contained in the text, say ' I don't know'.

    Context: 
    {ctx}

    Q: {question}
    A:""".format(ctx=context, question=question)

    print(prompt)

    response = ai.Completion.create(
                prompt=prompt,
                temperature=0,
                max_tokens=300,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                model="text-davinci-002"
                )["choices"][0]["text"].strip(" \n")

    return response

In [59]:
def get_context(question):
    """Build the context string for ``question`` from the best-matching chunks.

    Takes the five highest-ranked entries of ``document_embeddings`` and
    concatenates the corresponding ``chunks``, each followed by a newline.
    """
    ranked = order_document_sections_by_query_similarity(question, document_embeddings)
    top_matches = ranked[:5]

    # The second element of each tuple is the chunk's key; convert it to an
    # int to index into `chunks`.
    return "".join(chunks[int(doc_key)] + "\n" for _, doc_key in top_matches)

In [71]:
# Example query: retrieve the best-matching chunks as context, then ask the
# completion model to answer from that context only.
question = "Compare loyalty 2.0 and loyalty 1.0? Explain in detail if possible."
context = get_context(question)

answer = get_response(question, context)

print(answer)

success
Answer the question as truthfully as possible using the provided text, if the answer is not contained in the text, say ' I don't know'."

    Context: 
    Half of the adtech spends were wasted because of reacquisition and wrong acquisition , but brands had no alternative but to keep the spends going . The answer lay not in trying to optimise adtech spending but in solving the martech problem – deepening relationships with existing customers . The solution to both the loyalty and adtech problems lay in the martech world . If a brand ’ s existing customers could pay attention to incoming messages , it would be much easier to drive repeat transactions thus solving the loyalty problems . If a brand ’ s most loyal customers ( “ Best Customers ” ) could refer their family and friends , the cost of new customer acquisition could be sharply reduced . What was needed was an incentive mechanism to enable brands to persuade customers for their time and network . This is where MuDAO with 

TODO:
- New chat / reset chat: support multiple turns.
- Add a conversational interface.
- Make it possible to change how chunks are created.