In [None]:
import time
import os,re
import requests
import pinecone 
import pandas as pd
import httplib2, urllib
from bs4.element import Comment
from langchain.vectorstores import Pinecone
from bs4 import BeautifulSoup, SoupStrainer
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

`Scrape` - collect transcript links and profile image URLs from the tim.blog transcript archive pages.

In [None]:
def get_links(URL):
    """Return the dated transcript post links found on one archive page.

    Fetches ``URL`` with httplib2, reads the ``href`` of every ``<a>`` tag
    that has one, de-duplicates the hrefs containing "transcript", and keeps
    those that look like dated tim.blog posts. Because de-duplication goes
    through a ``set``, the order of the returned list is unspecified.
    """
    def _is_dated_post(href):
        # Same three checks as before: on tim.blog, contains a digit, and
        # has a year-style path segment starting with "20".
        return (
            'https://tim.blog/' in href
            and any(ch.isdigit() for ch in href)
            and 'https://tim.blog/20' in href
        )

    _, body = httplib2.Http().request(URL)
    # SoupStrainer limits parsing to anchor tags only.
    anchors = BeautifulSoup(body, 'html.parser', parse_only=SoupStrainer('a'))
    hrefs = [anchor['href'] for anchor in anchors if anchor.has_attr('href')]
    transcript_hrefs = list({href for href in hrefs if "transcript" in href})
    return [href for href in transcript_hrefs if _is_dated_post(href)]

def get_img(URL):
    """Return the ``src`` URLs of images on ``URL`` whose path contains 'scaled'.

    Downloads the page with requests, parses it with BeautifulSoup and looks
    at every ``<img>`` tag. Tags without a ``src`` attribute are skipped
    instead of raising ``KeyError``, so one such tag no longer aborts the
    scrape of the whole page.

    Returns:
        list[str]: matching image URLs, in document order (may be empty).
    """
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Tag.get returns None when the attribute is absent, unlike tag['src'].
    sources = [img.get('src') for img in soup.find_all('img')]
    profile_img = [src for src in sources if src and 'scaled' in src]
    return profile_img

def get_year_name(img_link):
    """Extract the upload year and the lower-cased name from an image URL.

    The year is the first ``/DDDD/`` path segment; the name is the part of a
    path segment that precedes ``-Ill``.

    Returns:
        tuple: ``(year, name)`` as strings, or ``(None, None)`` when either
        piece cannot be found (the offending link is printed).
    """
    try:
        year = re.findall(r'/(\d{4})/', img_link)[0]
        name = re.search(r'/([^/]+)-Ill', img_link).group(1).lower()
    # Only the failures the parsing itself can produce: no year match
    # (IndexError), no name match (AttributeError on None), or a non-string
    # link (TypeError). A bare except would also swallow KeyboardInterrupt.
    except (IndexError, AttributeError, TypeError):
        print("Fail On:")
        print(img_link)
        return None, None
    return year, name

# Store
# One row per transcript link (the link itself is the index), with the
# archive page it was found on and, when matched, its image URL.
d = pd.DataFrame()
for page in range(1, 48):
    print("page %s" % page)
    url = "https://tim.blog/category/the-tim-ferriss-show-transcripts/page/%s/" % page
    img_links = get_img(url)
    tx_links = get_links(url)

    # Register every transcript found on this archive page.
    for tx_link in tx_links:
        d.loc[tx_link, "transcript"] = tx_link
        d.loc[tx_link, "page"] = page

    # Attach an image to each transcript whose URL contains both the year
    # and the name parsed out of the image URL.
    for img_link in img_links:
        year, name = get_year_name(img_link)
        if not year:
            continue
        for tx_link in tx_links:
            if year in tx_link and name in tx_link:
                d.loc[tx_link, "image"] = img_link

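`Split` - save episode images, fetch each transcript's text and title, and split the text into chunks with metadata.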

In [None]:
def save_img(episode_id,url):
    """Download ``url`` and write the bytes to ``public/0<episode_id>.jpg``.

    The image is fetched once, and before the output file is opened, so a
    failed download no longer leaves an empty file behind. The relative-URL
    check now runs before any request is made.

    Args:
        episode_id: value interpolated into the output file name.
        url: image URL; if it does not contain 'http' it is prefixed with
            the module-level ``site``.
    """
    if 'http' not in url:
        # NOTE(review): `site` is not defined anywhere in the visible
        # notebook; this branch raises NameError unless it is set elsewhere.
        url = '{}{}'.format(site, url)
    response = requests.get(url)
    imgpath="public/0%s.jpg"%episode_id
    with open(imgpath, 'wb') as f:
        f.write(response.content)

def tag_visible(element):
    """Return True when a parsed text node should count as visible text.

    A node is hidden if its parent tag is one of the non-rendered containers
    listed below, or if the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first, exactly as before, so the Comment test
    # only runs for nodes whose parent is not in the hidden list.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def text_from_html(body):
    """Return the visible text of an HTML document as one space-joined string.

    Every text node accepted by ``tag_visible`` is stripped of surrounding
    whitespace and the pieces are joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    pieces = [node.strip() for node in soup.findAll(string=True) if tag_visible(node)]
    return u" ".join(pieces)

def get_text_and_title(url):
    """Fetch a post and return its body text and page title.

    Args:
        url: address of the page to download.

    Returns:
        tuple[str, str]: ``(text, title)``. ``text`` is the stripped text of
        the ``<div class="entry-content">`` element and ``title`` is the
        stripped ``<title>``. If either element is missing the corresponding
        value is an empty string rather than an ``AttributeError``, so a
        single malformed page does not abort a long scraping loop.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title is None when the document has no <title> tag.
    title_tag = soup.title
    title = title_tag.text.strip() if title_tag is not None else ""

    # soup.find returns None when no matching <div> exists.
    content_div = soup.find('div', class_='entry-content')
    text = content_div.get_text().strip() if content_div is not None else ""
    return text,title

# Chunk size
chunks = 1500
splits = [ ]
metadatas = [ ]
# Same splitter settings for every transcript, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks, chunk_overlap=50)
for ix in d.index:
    # Saving the image is best-effort: report the failure and carry on.
    # Catching Exception (not a bare except) keeps Ctrl-C working.
    # NOTE(review): `ix` is the transcript URL (the index of `d`), so the
    # image path and the "id" metadata below are URL-shaped - confirm intended.
    try:
        save_img(ix,d.loc[ix,'image'])
    except Exception as exc:
        print("Image not saved for %s: %r" % (ix, exc))
    link = d.loc[ix,"transcript"]
    text,title=get_text_and_title(link)
    # Split
    texts_recusive = text_splitter.split_text(text)
    splits.append(texts_recusive)
    print(len(texts_recusive))
    # One separate metadata dict per chunk, all describing the same episode.
    metadata=[{"source":str(ix) + " " +link,"id":str(ix),"link":link,"title":title} for _ in texts_recusive]
    print(len(metadata))
    metadatas.append(metadata)


In [None]:
# Join the list of lists
# Flatten the per-episode lists into one flat list each, keeping order so
# splits_all[i] still lines up with metadatas_all[i].
splits_all = [chunk for per_episode in splits for chunk in per_episode]
metadatas_all = [meta for per_episode in metadatas for meta in per_episode]

In [None]:
# Number of text chunks, then number of metadata dicts, one per line.
print(len(splits_all), len(metadatas_all), sep="\n")

`Embed` - connect to the existing Pinecone index and upload the chunks in batches.

In [None]:
# Pinecone
# The API key is read from the PINECONE_API_KEY environment variable.
index_name = "ferris-gpt"
pinecone.init(
    environment="us-east1-gcp",
    api_key=os.getenv('PINECONE_API_KEY'),
)
embeddings = OpenAIEmbeddings()
# Handle on the existing index, using the embeddings object above.
p = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)

In [None]:
# Add data in chunk to avoid data ingest errors
chunk_size = 100
# Index of the first batch to upload; raise it to resume a partial run.
last_chunk = 0
# Integer ceiling division: the notebook never imports `math`, so the
# previous math.ceil call raised NameError on a fresh kernel.
num_chunks = (len(splits_all) + chunk_size - 1) // chunk_size
for i in range(last_chunk,num_chunks):

    print(i)
    start_time = time.time()
    start_idx = i * chunk_size
    end_idx = min(start_idx + chunk_size, len(splits_all))

    # Extract the current chunk (texts and metadata share the same indices)
    current_splits = splits_all[start_idx:end_idx]
    current_metadatas = metadatas_all[start_idx:end_idx]

    # Add the current chunk to the vector database
    p.add_texts(texts = current_splits, metadatas=current_metadatas)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")
    print("--------")