In [None]:
import os
#import openai
from dotenv import load_dotenv, find_dotenv


#from langchain.chat_models import ChatOpenAI
from langchain.chat_models import ChatOllama
from langchain.llms import Ollama
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

from langchain.vectorstores import DeepLake

In [None]:
# Build a sentence-embedding model from the Hugging Face hub.
# NOTE(review): `embeddings` is rebound to an OllamaEmbeddings instance in a
# later cell, before the DeepLake store is created, so when the notebook is run
# top to bottom the object built here is never used.
modelPath = "BAAI/bge-large-en-v1.5"

# Model configuration options: run the model on the CPU
model_kwargs = {'device':'cpu'}
# Encoding options: 'normalize_embeddings' is set to True here
encode_kwargs = {'normalize_embeddings': True}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [None]:
#!pip install sentence-transformers

In [None]:
# Load credentials from a local .env file and derive the Deep Lake dataset path.
# (load_dotenv / find_dotenv are already imported in the first cell.)
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Indexing os.environ directly raises KeyError when a variable is missing.
# ACTIVELOOP_TOKEN is not referenced again in this notebook; presumably the
# Deep Lake client reads it from the environment itself — confirm.
ACTIVELOOP_TOKEN = os.environ['ACTIVELOOP_TOKEN']
ACTIVELOOP_USERNAME = os.environ['ACTIVELOOP_USERNAME']

#my_activeloop_org_id = "<YOUR-ACTIVELOOP-ORG-ID>"
my_activeloop_dataset_name = "langchain_course_indexers_retrievers"
# Dataset location in the form hub://<username>/<dataset name>
dataset_path = f"hub://{ACTIVELOOP_USERNAME}/{my_activeloop_dataset_name}"


In [None]:
# Name of a second dataset. It is only referenced by the commented-out delete
# call below; the store opened later uses `dataset_path`, not `db_id`.
db_id = 'kb-material'# replace with your database name
#DeepLake.force_delete_by_path(f"hub://{ACTIVELOOP_USERNAME}/{db_id}")

In [None]:
# llm_model = "gpt-3.5-turbo"
# chat_open = ChatOpenAI(temperature=0.0, model=llm_model)

In [None]:
#local_url = "http://localhost:11434"

In [None]:
# Ollama model name; passed to both OllamaEmbeddings and the Ollama LLM below.
model = "mistral"

In [None]:
#llm = Ollama(base_url=local_url, model=model)

In [None]:
#print(llm("Why is the sky blue?"))

In [None]:
#print(llm("Wer ist Olaf Scholz? Bitte auf Deutsch antworten"))


In [None]:
# Ollama embeddings
# NOTE(review): this rebinds `embeddings`, replacing the HuggingFaceEmbeddings
# instance created in an earlier cell. Whichever assignment ran last is what
# the DeepLake store receives — keep only the one that is intended.
embeddings = OllamaEmbeddings(model=model)
# OpenAI embeddings
#embedding = OpenAIEmbeddings()

# Local LLM served by Ollama, with a stdout streaming callback handler attached.
llm_open = Ollama(  model=model,
                    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

In [None]:
# Smoke-test the local model with a single prompt.
# NOTE(review): llm_open was built with a StreamingStdOutCallbackHandler, which
# presumably already echoes the tokens as they arrive, so this print() likely
# shows the answer a second time — confirm.
print(llm_open("Why is the sky blue?"))

In [None]:
# Open the Deep Lake vector store at `dataset_path`. `embeddings` is whichever
# object was bound last; in top-to-bottom order that is the OllamaEmbeddings
# instance, not the HuggingFace one.
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

# Testing some text

In [None]:


# Sample text to write to a local file
# taken from https://www.theverge.com/2023/3/14/23639313/google-ai-language-model-palm-api-challenge-openai
text = """Google opens up its AI language model PaLM to challenge OpenAI and GPT-3
Google is offering developers access to one of its most advanced AI language models: PaLM.
The search giant is launching an API for PaLM alongside a number of AI enterprise tools
it says will help businesses “generate text, images, code, videos, audio, and more from
simple natural language prompts.”

PaLM is a large language model, or LLM, similar to the GPT series created by OpenAI or
Meta’s LLaMA family of models. Google first announced PaLM in April 2022. Like other LLMs,
PaLM is a flexible system that can potentially carry out all sorts of text generation and
editing tasks. You could train PaLM to be a conversational chatbot like ChatGPT, for
example, or you could use it for tasks like summarizing text or even writing code.
(It’s similar to features Google also announced today for its Workspace apps like Google
Docs and Gmail.)
"""

# Write and read with an explicit encoding: the sample contains typographic
# quotes and apostrophes, and the platform default encoding is not UTF-8
# everywhere, so relying on the default can fail or corrupt the text.
with open("my_file.txt", "w", encoding="utf-8") as file:
    file.write(text)

# use TextLoader to load the text back from the local file
loader = TextLoader("my_file.txt", encoding="utf-8")
docs_from_file = loader.load()

print(len(docs_from_file))
# 1

In [None]:
# Directory scanned for .txt documents by the DirectoryLoader below.
# NOTE(review): the SRT-cleaning section at the end of the notebook writes its
# output to "./data/clean", not to this directory — confirm which is intended.
content_directory = "./data/docs/"

In [None]:
# Load every file matching **/*.txt under content_directory.
# Note that `loader` is reused here: it previously held the TextLoader for
# my_file.txt.
loader = DirectoryLoader(content_directory, glob="**/*.txt")
docs = loader.load()

In [None]:
# Inspect the source recorded for the first loaded document
# (fails with an index error if nothing was loaded).
docs[0].metadata["source"]

In [None]:
# Inspect the full text of the first loaded document.
docs[0].page_content

In [None]:
# Chunking parameters handed to RecursiveCharacterTextSplitter below.
# Presumably measured in characters (the splitter's default length function) — confirm.
chunk_size = 300
chunk_overlap = 50

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every loaded document into overlapping chunks and collect, in parallel,
# the chunk texts and one metadata dict per chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

all_texts, all_metadatas = [], []

for doc in docs:
    for chunk in text_splitter.split_text(doc.page_content):
        all_texts.append(chunk)
        # Each metadata entry must be a dict, not a bare string: the vector
        # store expects one dict per text, and the sources-aware QA chain looks
        # up the "source" key to report where an answer came from.
        all_metadatas.append({"source": doc.metadata["source"]})


In [None]:
# we add all the chunks to the deep lake, along with their metadata
db.add_texts(all_texts, all_metadatas)
#db.add_documents(docs)

# Setting up QA Chain with Retriever

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

#llm = OpenAI(model_name=model, temperature=0)

# Question-answering chain that returns an answer together with its sources.
# It uses the local Ollama LLM and retrieves context from the Deep Lake store.
# Presumably the sources are taken from each retrieved document's "source"
# metadata entry — confirm against the LangChain documentation.
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm_open,
                                                    chain_type="stuff",
                                                    retriever=db.as_retriever())

In [None]:
# Ask one question and print the answer followed by a bullet list of sources.
d_response = chain({"question": "What is the most important dangerous threat in cyber security?"})

print("Response:")
print(d_response["answer"])
print("Sources:")
for source in d_response["sources"].split(", "):
    source = source.strip()
    # The model may return no sources at all; splitting an empty string yields
    # [""], which would otherwise print a bare "- " bullet.
    if source:
        print("- " + source)

## Cleaning srt files

In [None]:
import re
def clean_srt_text(lines):
    """Collapse the lines of an SRT subtitle file into one line of plain text.

    Cue numbers (lines made only of digits), timestamp lines (lines starting
    with ``HH:MM:SS``) and blank lines are dropped; the remaining caption lines
    are joined with single spaces.

    Args:
        lines: Iterable of strings, typically the result of ``file.readlines()``.

    Returns:
        The caption text as a single string, without leading or trailing
        whitespace. Empty input yields ``""``.
    """
    index_line = re.compile(r'[0-9]+')
    timestamp_line = re.compile(r'[0-9]{2}:[0-9]{2}:[0-9]{2}')

    captions = []
    for line in lines:
        # Drop a UTF-8 byte-order mark (it would hide the first cue number from
        # the digit check) and surrounding whitespace, including a stray "\r"
        # left over from CRLF line endings.
        line = line.lstrip('\ufeff').strip()
        if not line:
            continue
        if index_line.fullmatch(line) or timestamp_line.match(line):
            continue
        captions.append(line)

    # Join once at the end instead of growing a string inside the loop; this
    # also avoids the leading space the incremental concatenation produced.
    return ' '.join(captions)

In [None]:
def get_filenames_in_directory(directory):
    """Return the names (not full paths) of the regular files directly inside `directory`.

    Sub-directories are skipped and the listing is not recursive. Names are
    returned in the order `os.listdir` yields them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Example usage: list the raw subtitle files that will be cleaned.
# source_directory must already exist, because the helper lists it with os.listdir.
source_directory = "./data/raw/"
target_directory = "./data/clean"


filenames = get_filenames_in_directory(source_directory)
print(filenames)

In [None]:
# File extensions (without the leading dot) used when renaming subtitle files
# to text files.
old_extension = "srt"
new_extension = "txt"


In [None]:
def create_modified_filename(filename, directory, old_extension, new_extension):
    """Return `filename` with its `old_extension` suffix replaced by `new_extension`.

    Args:
        filename: Bare file name, e.g. ``"talk.srt"``.
        directory: Not used; kept so existing callers keep working.
        old_extension: Extension to replace, with or without a leading dot.
        new_extension: Replacement extension, with or without a leading dot.

    Returns:
        The renamed file name, or `filename` unchanged when it does not end in
        ``.<old_extension>``.
    """
    # Require the dot so that a name such as "foosrt" is not treated as a match.
    old_suffix = "." + old_extension.lstrip(".")
    if not filename.endswith(old_suffix):
        # This path used to reach the return statement with the result name
        # never assigned, which raised UnboundLocalError.
        return filename
    return filename[: -len(old_suffix)] + "." + new_extension.lstrip(".")

In [None]:
# Preview the target name computed for every file found in source_directory.
for filename in filenames:
    print(create_modified_filename(filename, source_directory, old_extension, new_extension))

In [None]:
def create_txt_from_srt(source_directory, target_directory, old_extention="srt", new_extension="txt"):
    """Convert every subtitle file in `source_directory` into a plain-text file.

    Each file whose name ends in ``.<old_extention>`` is read, reduced to its
    caption text with `clean_srt_text`, and written to `target_directory` under
    the name produced by `create_modified_filename`. Other files are skipped.

    Args:
        source_directory: Directory holding the subtitle files.
        target_directory: Directory for the cleaned files; created if missing.
        old_extention: Extension of the files to convert. The parameter name
            keeps its original spelling so keyword callers keep working.
        new_extension: Extension given to the written files.
    """
    wanted_suffix = "." + old_extention.lstrip(".")
    filenames = get_filenames_in_directory(source_directory)
    os.makedirs(target_directory, exist_ok=True)

    for filename in filenames:
        if not filename.endswith(wanted_suffix):
            # Ignore anything that is not a subtitle file (e.g. .DS_Store).
            continue

        # utf-8-sig reads plain UTF-8 and also strips a leading byte-order mark,
        # which would otherwise end up glued to the first cue number.
        with open(os.path.join(source_directory, filename), 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()

        # Use the function's own argument here, not a notebook-level global of
        # a similar name.
        new_filename = create_modified_filename(filename, source_directory, old_extention, new_extension)
        clean_text = clean_srt_text(lines)

        new_file_path = os.path.join(target_directory, new_filename)
        # Write with an explicit encoding so non-ASCII captions do not depend
        # on the platform default.
        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(clean_text)


In [None]:
# Run the conversion with the default extensions: files from source_directory
# are cleaned and written into target_directory.
create_txt_from_srt(source_directory, target_directory)