In [20]:
# from tqdm.notebook import tqdm
# import time

# for i in tqdm(range(5)):
#     time.sleep(2)
#     print('Iteration:', i)

  0%|          | 0/5 [00:00<?, ?it/s]

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


Data Pipeline includes:
- modify the pipeline to segment data by concepts
    - summary of the parent link
    - 1000/2000 tokens max for each vector
    - whole document broken down into chunks of txt
    - metadata: parent link, parent file name, children chunks file names
    - folder on text analytics
- parse the entirety of spreadsheet (ask for a folder on Deep Dish server)
- allow user input to add more link to do

### Setup

#### Interpreter

In [1]:
# checking interpreter path
import sys
sys.executable

'/root/MLDS/LLM-Product-Assistant/.venv/bin/python'

#### LLM keys and initialization

In [2]:
from dotenv import load_dotenv
import os

In [3]:
%reload_ext dotenv
%dotenv
chatGPT_api_key = os.getenv("OPENAI_API_KEY")
# chatGPT_api_key

In [4]:
# Define LLM
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k", openai_api_key= chatGPT_api_key)

In [5]:
MAX_TOKEN = 16385 # max token length for OpenAI gpt-3.5-turbo-16k

#### Imports

In [6]:
import pandas as pd
import re
import yaml
from tqdm.notebook import tqdm

from langchain.document_loaders import UnstructuredURLLoader
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

#### Data

In [7]:
#read the file
file_path = r'data/VPC Links List - Sheet1.csv'
df = pd.read_csv(file_path)
sample = df.head()
sample

Unnamed: 0,DESC,LINK
0,vpc-lattice vpc-lattice,https://docs.aws.amazon.com/cli/latest/referen...
1,vpc-lattice APIReference Welcome,https://docs.aws.amazon.com/vpc-lattice/latest...
2,APIReference API_Operations,https://docs.aws.amazon.com/vpc-lattice/latest...
3,APIReference API_BatchUpdateRule,https://docs.aws.amazon.com/vpc-lattice/latest...
4,APIReference API_CreateAccessLogSubscription,https://docs.aws.amazon.com/vpc-lattice/latest...


In [8]:
sample_data = sample.set_index('DESC')['LINK'].to_dict()
URLs = list(sample_data.values())
URLs

['https://docs.aws.amazon.com/cli/latest/reference/vpc-lattice/',
 'https://docs.aws.amazon.com/vpc-lattice/latest/APIReference/Welcome.html',
 'https://docs.aws.amazon.com/vpc-lattice/latest/APIReference/API_Operations.html',
 'https://docs.aws.amazon.com/vpc-lattice/latest/APIReference/API_BatchUpdateRule.html',
 'https://docs.aws.amazon.com/vpc-lattice/latest/APIReference/API_CreateAccessLogSubscription.html']

### URL loader

For testing, we will first try AWS PrivateLink - LangChain allows us to pass a list of URLs for parsing, so we will start with **concept** and **getting started**. Here we are using `UnstructuredURLLoader`, the other two [options](https://python.langchain.com/docs/integrations/document_loaders/url) are `SeleniumURLLoader` and `PlaywrightURLLoader`. 

In [9]:
loader = UnstructuredURLLoader(urls=URLs, mode='single', show_progress_bar=True)
data = loader.load()

100%|██████████| 5/5 [00:02<00:00,  1.93it/s]


In [10]:
len(data)

5

In [11]:
data[0].page_content

'AWS CLI Command Reference\n\nHome\n\nUser Guide\n\nForum\n\nGitHub\n\nNavigation\n\nindex\n\nnext |\n\nprevious |\n\nAWS CLI 1.29.71 Command Reference »\n\naws »\n\n← update-watchlist\n          /\n\nbatch-update-rule →\n\nTable Of Contents\n\nvpc-lattice\nDescription\nAvailable Commands\n\nQuick search\n\nFeedback\n\nDid you find this page useful? Do you have a suggestion to improve the documentation?\n\nGive us feedback.\n\nIf you would like to suggest an improvement or fix for the AWS CLI, check out our\n\ncontributing guide on GitHub.\n\nUser Guide\n\nFirst time using the AWS CLI? See the\n  User Guide for\n  help getting started.\n\nNote:\n                        You are viewing the documentation for an older major version of the AWS CLI (version 1).\n\nAWS CLI version 2, the latest major version of AWS CLI, is now stable and recommended for general use.\n                        \n                            To view this page for the AWS CLI version 2, click\n                    

In [12]:
data[0].metadata['source']

'https://docs.aws.amazon.com/cli/latest/reference/vpc-lattice/'

In [13]:
single_doc = [data[0]]

### Summarize

Here, we are summarizing using the stuff chain - it takes a list of document (here, it's a single document in the list), and then summarize using the prompt provided

In [14]:
# stuff
SUMMARIZE_PROMPT = """Write a concise summary of the following:
"{text}"
"""
def summarize_with_stuff(data, llm, summarize_prompt):
    """summarize one document with stuff chain"""
    prompt = PromptTemplate.from_template(SUMMARIZE_PROMPT)
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    stuff_chain = StuffDocumentsChain(
        llm_chain=llm_chain, document_variable_name="text"
    )
    return stuff_chain.run(data)

summarize_with_stuff(single_doc, llm, summarize_prompt=SUMMARIZE_PROMPT)

'The AWS CLI Command Reference provides information on the vpc-lattice command, which is used for managing Amazon VPC Lattice, a fully managed application networking service. The reference includes a list of available commands for managing various aspects of VPC Lattice, such as creating and deleting resources, getting information, and updating configurations. It also provides links to the User Guide and other resources for further assistance.'

### Chunking



In [15]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=0, 
    # separators=["\n", "\n\n", "|", "||", "\n|"],
    length_function=len, 
    is_separator_regex=True,
    keep_separator=False
)

def process_text(chunked_text):
    # Replace '/n' with '\nb', remove newline characters, replace '¶' with '', and replace multiple spaces with a single space
    processed_text = [re.sub(r'/n', '\\nb', text) for text in chunked_text]
    processed_text = [re.sub(r'[\n\r]', '', text) for text in processed_text]
    processed_text = [re.sub(r'¶', ' ', text) for text in processed_text]
    processed_text = [re.sub(r'\s+', ' ', text) for text in processed_text]
    return processed_text

chunked_text = text_splitter.split_text(data[0].page_content)
processed_chunked_text = process_text(chunked_text)
processed_chunked_text

['AWS CLI Command ReferenceHomeUser GuideForumGitHubNavigationindexnext |previous |AWS CLI 1.29.71 Command Reference »aws »← update-watchlist /batch-update-rule →Table Of Contentsvpc-latticeDescriptionAvailable CommandsQuick searchFeedbackDid you find this page useful? Do you have a suggestion to improve the documentation?Give us feedback.If you would like to suggest an improvement or fix for the AWS CLI, check out ourcontributing guide on GitHub.User GuideFirst time using the AWS CLI? See the User Guide for help getting started.Note: You are viewing the documentation for an older major version of the AWS CLI (version 1).',
 'AWS CLI version 2, the latest major version of AWS CLI, is now stable and recommended for general use. To view this page for the AWS CLI version 2, click here. For more information see the AWS CLI version 2 installation instructions and migration guide.[ aws ]vpc-lattice Description Amazon VPC Lattice is a fully managed application networking service that you use 

### Main

In [16]:
!mkdir data/summaries/
!mkdir data/chunks/

mkdir: cannot create directory ‘data/summaries/’: File exists


mkdir: cannot create directory ‘data/chunks/’: File exists


In [17]:
yml_data = {}
full_data = df.set_index('DESC')['LINK'].to_dict()
len(full_data) # ~800 links

833

In [None]:
for desc, link in tqdm(sample_data.items()):
    print(f'Processing {desc}')
    # load URL
    loader = UnstructuredURLLoader(urls=[link], mode='single', show_progress_bar=False)
    doc = loader.load()
    
    # create summarization
    summary = summarize_with_stuff(doc, llm, summarize_prompt=SUMMARIZE_PROMPT)
    with open(f'data/summaries/{desc}.txt', 'w') as f:
        f.write(summary)
    # print('Summary created')
        
    # create chunks
    chunked_text = text_splitter.split_text(doc[0].page_content)
    processed_chunked_text = process_text(chunked_text)
    for j, chunk in enumerate(processed_chunked_text):
        with open(f'data/chunks/{desc}_{j}.txt', 'w') as f:
            f.write(chunk)
    # print('Chunked')
            
    # add information to yml file
    yml_data[desc] = {
        'link': link,
        'chunks': [f'{desc}_{j}.txt' for j in range(len(processed_chunked_text))],
        'summary': f'{desc}.txt'
    }
    
with open('data/chunking.yml', 'w') as f:
    yaml.dump(yml_data, f)