In [1]:
import os
#import openai
from dotenv import load_dotenv, find_dotenv


#from langchain.chat_models import ChatOpenAI
from langchain.chat_models import ChatOllama
from langchain.llms import Ollama
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import DeepLake

In [22]:
# Load variables from the local .env file into the process environment.
_ = load_dotenv(find_dotenv()) # read local .env file
# Both lookups raise KeyError when the variable is missing, so a misconfigured
# environment fails in this cell rather than later.
ACTIVELOOP_TOKEN = os.environ['ACTIVELOOP_TOKEN']
ACTIVELOOP_USERNAME = os.environ['ACTIVELOOP_USERNAME']

#my_activeloop_org_id = "<YOUR-ACTIVELOOP-ORG-ID>"
my_activeloop_dataset_name = "UD_Cybersecurity"
# Dataset location in the form hub://<username>/<dataset name>.
dataset_path = f"hub://{ACTIVELOOP_USERNAME}/{my_activeloop_dataset_name}"


In [30]:
#DeepLake.force_delete_by_path(f"hub://{ACTIVELOOP_USERNAME}/{my_activeloop_dataset_name}")

 

In [5]:
# llm_model = "gpt-3.5-turbo"
# chat_open = ChatOpenAI(temperature=0.0, model=llm_model)

In [6]:
#print(llm("Wer is Olaf Scholz? Bitte in Deutsch antworten"))


In [2]:
# create the embeddings model
# NOTE(review): when the notebook is run top to bottom, `embeddings` is rebound
# by the later Ollama embeddings cell, so this HuggingFace instance is not the
# one passed to DeepLake.
modelPath = "BAAI/bge-large-en-v1.5"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}
# Create a dictionary with encoding options, setting 'normalize_embeddings' to True
encode_kwargs = {'normalize_embeddings': True}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [7]:
# Name of the Ollama model; used below for both the embeddings and the LLM.
model = "mistral"

In [23]:
# Ollama embeddings
# NOTE: this rebinds `embeddings`, replacing whatever an earlier cell assigned.
embeddings = OllamaEmbeddings(model=model)
# OpenAI embeddings
#embedding = OpenAIEmbeddings()

# LLM served by Ollama; the callback manager is given a handler that streams
# output to stdout.
llm_open = Ollama(  model=model,
                    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

In [24]:
#print(llm_open("Why is the sky blue?"))

In [31]:
# Vector store backed by the Deep Lake dataset at `dataset_path`, using the
# current `embeddings` object as its embedding model.
db = DeepLake(dataset_path=dataset_path, embedding=embeddings)
#db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

Your Deep Lake dataset has been successfully created!


 

## Setting up folders

In [11]:
def create_directory_if_not_exists(directory):
    """Create `directory` (with any missing parents) unless the path exists.

    A one-line status message is printed in either case. Returns None.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

In [12]:
# Working folders, relative to the notebook's current directory.
source_directory = "./data/raw/"  # raw .srt subtitle files
target_directory = "./data/clean"  # cleaned .txt transcripts are written here
content_directory = "./data/docs/"

In [13]:
# Make sure every working folder exists before anything is read or written.
for _folder in (source_directory, target_directory, content_directory):
    create_directory_if_not_exists(_folder)

Directory './data/raw/' already exists.
Directory './data/clean' already exists.
Directory './data/docs/' already exists.


# Cleaning SRT files

In [23]:
import re

# Cue counter lines ("1", "2", ...) and cue timing lines ("00:00:01,000 --> ...").
_SRT_INDEX_RE = re.compile(r'^[0-9]+$')
_SRT_TIMESTAMP_RE = re.compile(r'^[0-9]{2}:[0-9]{2}:[0-9]{2}')


def clean_srt_text(lines):
    """Extract the spoken text from the lines of an SRT subtitle file.

    Cue numbers, timing lines and blank/whitespace-only lines are dropped.
    The remaining lines are concatenated, each one prefixed with a single
    space (so a non-empty result starts with a space, as it always has).

    Args:
        lines: Iterable of strings, typically from ``file.readlines()``;
            trailing ``\\n`` / ``\\r\\n`` terminators are tolerated.

    Returns:
        The cleaned text as one string, or ``""`` if nothing is left.
    """
    kept = []
    for position, raw_line in enumerate(lines):
        # Drop the line terminator, including a stray '\r' from CRLF files.
        line = raw_line.rstrip('\r\n')
        if position == 0:
            # A UTF-8 BOM glued to the first cue number would otherwise stop
            # the index pattern from matching and leak "\ufeff1" into the text.
            line = line.lstrip('\ufeff')
        if not line.strip():
            continue
        if _SRT_INDEX_RE.search(line) or _SRT_TIMESTAMP_RE.search(line):
            continue
        kept.append(line)
    # Single join instead of repeated string concatenation in the loop.
    return "".join(" " + line for line in kept)

In [27]:
def get_filenames_in_directory(directory):
    """Return the names of the regular files directly inside `directory`.

    Sub-directories are skipped and the listing is not recursive. Names are
    returned without the directory prefix, in the order `os.listdir` yields
    them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Example usage: list the files currently in the raw subtitle folder.
#source_directory = "./data/raw/"



filenames = get_filenames_in_directory(source_directory)
print(filenames)

['8 - ND0035 ESND C1  L1 A09 Project Preview V1 - lang_en.srt', '1 - ND0035 ESND C1  L1 A01 Meet Your Instructor V1 - lang_en.srt', '4 - ND0035 ESND C1  L1 A04 Course Overview V1 - lang_en.srt', '9 - ND0035 ESND C1  L1 A10 Lets Get Started V2 - lang_en.srt', '5 - ND0035 ESND C1  L1 A05 Why Network Security Is Important V1 - lang_en.srt', '3 - ND0035 ESND C1  L1 A03 Course Prerequisites V1 - lang_en.srt', '2 - ND0035 ESND C1  L1 A02 Lesson Overview V1 - lang_en.srt', '7 - ND0035 ESND C1  L1 A07 Business Stakeholders V1 - lang_en.srt', '6 - ND0035 ESND C1  L1 A06 Evolution Of Perimeter And Network Security V1 - lang_en.srt']


In [28]:
# File extensions (without the dot): subtitles come in as .srt and are renamed to .txt.
old_extension = "srt"
new_extension = "txt"

In [29]:
def create_modified_filename(filename, directory, old_extension, new_extension):
    """Return `filename` with its extension swapped from old to new.

    Only the name is computed; nothing on disk is touched.

    Args:
        filename: File name such as ``"lesson - lang_en.srt"``.
        directory: Unused; kept so existing callers keep working.
        old_extension: Extension the file must have, with or without the
            leading dot (``"srt"`` or ``".srt"``).
        new_extension: Replacement extension, with or without the leading dot.

    Returns:
        The file name with everything after the last dot replaced by
        `new_extension`.

    Raises:
        ValueError: If `filename` does not carry `old_extension`. The previous
            implementation failed here with an UnboundLocalError.
    """
    stem, extension = os.path.splitext(filename)
    if extension != "." + old_extension.lstrip("."):
        raise ValueError(
            f"'{filename}' does not have the expected '.{old_extension.lstrip('.')}' extension"
        )
    return stem + "." + new_extension.lstrip(".")

In [30]:
# Preview the renamed file name for each raw file; this only prints names.
for filename in filenames:
    print(create_modified_filename(filename, source_directory, old_extension, new_extension))

8 - ND0035 ESND C1  L1 A09 Project Preview V1 - lang_en.txt
1 - ND0035 ESND C1  L1 A01 Meet Your Instructor V1 - lang_en.txt
4 - ND0035 ESND C1  L1 A04 Course Overview V1 - lang_en.txt
9 - ND0035 ESND C1  L1 A10 Lets Get Started V2 - lang_en.txt
5 - ND0035 ESND C1  L1 A05 Why Network Security Is Important V1 - lang_en.txt
3 - ND0035 ESND C1  L1 A03 Course Prerequisites V1 - lang_en.txt
2 - ND0035 ESND C1  L1 A02 Lesson Overview V1 - lang_en.txt
7 - ND0035 ESND C1  L1 A07 Business Stakeholders V1 - lang_en.txt
6 - ND0035 ESND C1  L1 A06 Evolution Of Perimeter And Network Security V1 - lang_en.txt


In [31]:
def create_txt_from_srt(source_directory, target_directory, old_extention="srt", new_extension="txt"):
    """Convert the subtitle files in `source_directory` into plain-text files.

    Each file whose name ends in ``.<old_extention>`` is read, reduced to its
    spoken text with `clean_srt_text`, and written to `target_directory` under
    the same name with `new_extension`. Other files are skipped.

    Args:
        source_directory: Folder containing the subtitle files.
        target_directory: Folder for the cleaned text files; created if missing.
        old_extention: Extension of the input files (parameter name kept as-is
            for backward compatibility with keyword callers).
        new_extension: Extension given to the output files.
    """
    os.makedirs(target_directory, exist_ok=True)
    wanted_suffix = "." + old_extention.lstrip(".")

    for filename in get_filenames_in_directory(source_directory):
        # Skip files that are not subtitles so they cannot break the rename step.
        if not filename.endswith(wanted_suffix):
            continue

        # utf-8-sig reads plain UTF-8 and also drops a leading BOM if present.
        with open(os.path.join(source_directory, filename), 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()

        # Use the function's own parameter rather than a notebook-level global.
        new_filename = create_modified_filename(filename, source_directory, old_extention, new_extension)
        clean_text = clean_srt_text(lines)

        new_file_path = os.path.join(target_directory, new_filename)
        # Write with an explicit encoding so the output does not depend on the
        # platform default.
        with open(new_file_path, 'w', encoding='utf8') as f:
            f.write(clean_text)


In [32]:
# Convert the raw subtitle files into cleaned text files (default extensions).
create_txt_from_srt(source_directory, target_directory)

# Creating a document loader

In [32]:
# Load every cleaned transcript (*.txt, searched recursively) from the target folder.
loader = DirectoryLoader(target_directory, glob="**/*.txt")
docs = loader.load()

chunk_size = 300
chunk_overlap = 50

# create a text splitter
#text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Collect the chunks of ALL documents first. The accumulators live outside the
# loop so they are not reset for every document.
all_texts, all_metadatas = [], []
for doc in docs:
    # split documents into chunks
    for chunk in text_splitter.split_text(doc.page_content):
        all_texts.append(chunk)
        # Every chunk keeps the path of the file it came from.
        all_metadatas.append({"source": doc.metadata["source"]})

# we add all the chunks to the deep lake, along with their metadata
# (one call for the whole corpus; skipped when there is nothing to add).
# NOTE(review): re-running this cell presumably appends the same chunks again.
if all_texts:
    db.add_texts(all_texts, all_metadatas)
#db.add_documents(docs)

Creating 3 embeddings in 1 batches of size 3:: 100%|██████████| 1/1 [00:55<00:00, 55.43s/it]

Dataset(path='hub://subrockmann/UD_Cybersecurity', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (3, 1)      str     None   
 metadata     json      (3, 1)      str     None   
 embedding  embedding  (3, 4096)  float32   None   
    id        text      (3, 1)      str     None   





# Setting up QA Chain with Retriever

In [42]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

#llm = OpenAI(model_name=model, temperature=0)

# Retriever over the vector store: distance metric 'cos', top 2 results per query.
retriever = db.as_retriever()
retriever.search_kwargs.update({'distance_metric': 'cos', 'k': 2})

# "stuff" chain that answers from the retrieved chunks and reports their sources.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm_open,
    chain_type="stuff",
    retriever=retriever,
)

In [16]:
from langchain.chains import RetrievalQA

In [41]:
# Plain retrieval QA chain (no source reporting) sharing the same LLM and retriever.
qa_chain = RetrievalQA.from_chain_type(llm=llm_open, chain_type="stuff", retriever=retriever)

In [35]:
# Query used for the retrieval and QA cells below.
question = "What is zero trust?"

In [36]:
# Fetch the chunks the retriever returns for the question.
# NOTE: this rebinds `docs`, which earlier held the loaded transcript documents.
docs = retriever.get_relevant_documents(question)

In [37]:
# Display the retrieved documents to check what context the chain will see.
docs

[Document(page_content="and I also teach a course at Columbia University. I have a Master's in Information Technology from NYU, and I did my undergrad at Parker's in Computer Science and Psychology.", metadata={'source': 'data/clean/1 - ND0035 ESND C1  L1 A01 Meet Your Instructor V1 - lang_en.txt'}),
 Document(page_content="me protecting the enterprise and journalists all over the world from cyber espionage, state-sponsored attacks, and I've been working in tech for close to 10 years now. I hope to impart my knowledge onto you. I believe in teaching practical security, and I also teach a course at Columbia University.", metadata={'source': 'data/clean/1 - ND0035 ESND C1  L1 A01 Meet Your Instructor V1 - lang_en.txt'})]

In [38]:
# Answer the question with the plain RetrievalQA chain; the result is displayed.
qa_chain.run(question)

 Zero Trust is a cybersecurity strategy that requires strict identity verification for every access request, regardless of the user's location or device. It aims to protect enterprises from both internal and external threats by eliminating implicit trust in users and devices. This approach assumes that breaches have already occurred and focuses on minimizing damage by implementing multi-factor authentication, microsegmentation, and continuous monitoring. I teach this concept extensively in my courses at Columbia University.

" Zero Trust is a cybersecurity strategy that requires strict identity verification for every access request, regardless of the user's location or device. It aims to protect enterprises from both internal and external threats by eliminating implicit trust in users and devices. This approach assumes that breaches have already occurred and focuses on minimizing damage by implementing multi-factor authentication, microsegmentation, and continuous monitoring. I teach this concept extensively in my courses at Columbia University."

In [44]:
# Same question through the sources-aware chain; the result is displayed, not stored.
chain.invoke(question)

KeyboardInterrupt: 

In [40]:
# Ask the sources-aware chain, then show the answer followed by its sources.
d_response = chain.invoke(question)

print("Response:")
print(d_response["answer"])
print("Sources:")
# "sources" is split on ", " and each piece is printed on its own line.
print(*("- " + source for source in d_response["sources"].split(", ")), sep="\n")

KeyboardInterrupt: 