In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
import requests
from bs4 import BeautifulSoup


In [3]:
weburl = "https://www.gov.uk/government/publications/research-on-the-cyber-security-of-ai/ai-cyber-security-survey-main-report"
loader = WebBaseLoader(weburl)
loader.requests_kwargs = {'verify':False}
data = loader.load()
data = str(data)



In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=30,
    separators=['.','\n','\n\n']
)

In [5]:
docs = text_splitter.split_text(data)


In [6]:
type(docs)

list

In [8]:
count_vectorizer = CountVectorizer(stop_words="english")
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(docs)
sparse_matrix
doc_term_matrix = sparse_matrix.todense()
doc_term_matrix

matrix([[0, 0, 0, ..., 0, 3, 3],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 2, 3],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
df = pd.DataFrame(
   doc_term_matrix,
   columns=count_vectorizer.get_feature_names_out(),
   index=docs,
)
print(df)

                                                    056  059  10  10th  11  \
[Document(page_content='\n\n\n\n\n\n\n\n\n\n\n\...    0    0   0     0   0   
.UK\n              \n\n\n \n\n\n        Navigat...    0    0   0     0   0   
.UK\n              \n\n\n\n\n \n \n\n\n\n\n\n  ...    0    0   0     0   0   
.UK\n\n\n\n\n        Search\n        \n\n\n\n\n...    0    0   0     0   0   
.gov.uk/doc/open-government-licence/version/3 o...    0    0   0     0   0   
. Fieldwork was conducted using Computer Assist...    0    0   0     1   0   
.\nKey findings\nThe survey sought to create a ...    0    0   0     0   0   
. Among those who have not yet deployed AI but ...    0    0   0     0   0   
. Among those planning to use AI in the future,...    0    0   0     0   0   
. Just under three-quarters (72%) had a formal ...    0    0   0     0   0   
.\n\n2.  Introduction\nBackground\nThe Departme...    0    0   0     0   0   
. The growing use of AI among businesses presen...    0    0   0

In [12]:
len(docs)

55

In [13]:
matrix = cosine_similarity(df, df)
print(cosine_similarity(df, df))

[[1.         0.19935012 0.09117246 ... 0.33628594 0.08185945 0.24539555]
 [0.19935012 1.         0.07488746 ... 0.28800922 0.02281286 0.23210354]
 [0.09117246 0.07488746 1.         ... 0.04852863 0.93214435 0.04888588]
 ...
 [0.33628594 0.28800922 0.04852863 ... 1.         0.0459922  0.19218788]
 [0.08185945 0.02281286 0.93214435 ... 0.0459922  1.         0.0416977 ]
 [0.24539555 0.23210354 0.04888588 ... 0.19218788 0.0416977  1.        ]]


In [17]:
docs[2]

'.UK\\n              \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n                  Services and information\\n                \\n\\n\\nBenefits\\n\\n\\nBirths, death, marriages and care\\n\\n\\nBusiness and self-employed\\n\\n\\nChildcare and parenting\\n\\n\\nCitizenship and living in the UK\\n\\n\\nCrime, justice and the law\\n\\n\\nDisabled people\\n\\n\\nDriving and transport\\n\\n\\nEducation and learning\\n\\n\\nEmploying people\\n\\n\\nEnvironment and countryside\\n\\n\\nHousing and local services\\n\\n\\nMoney and tax\\n\\n\\nPassports, travel and living abroad\\n\\n\\nVisas and immigration\\n\\n\\nWorking, jobs and pensions\\n\\n\\n\\n\\n\\n                  Government activity\\n                \\n\\n\\nDepartments\\nDepartments, agencies and public bodies\\n\\n\\nNews\\nNews stories, speeches, letters and notices\\n\\n\\nGuidance and regulation\\nDetailed guidance, regulations and rules\\n\\n\\nResearch and statistics\\nReports, analysis and official statistics\\n\\n\\nPolicy pape

In [16]:
docs[53]

'.\\n      \\n\\n        Cancel\\n      \\n\\n\\n\\n\\n\\n\\n\\n\\n\\nServices and information\\n\\n\\nBenefits\\n\\n\\nBirths, death, marriages and care\\n\\n\\nBusiness and self-employed\\n\\n\\nChildcare and parenting\\n\\n\\nCitizenship and living in the UK\\n\\n\\nCrime, justice and the law\\n\\n\\nDisabled people\\n\\n\\nDriving and transport\\n\\n\\nEducation and learning\\n\\n\\nEmploying people\\n\\n\\nEnvironment and countryside\\n\\n\\nHousing and local services\\n\\n\\nMoney and tax\\n\\n\\nPassports, travel and living abroad\\n\\n\\nVisas and immigration\\n\\n\\nWorking, jobs and pensions\\n\\n\\n\\n\\nGovernment activity\\n\\n\\nDepartments\\n\\n\\nNews\\n\\n\\nGuidance and regulation\\n\\n\\nResearch and statistics\\n\\n\\nPolicy papers and consultations\\n\\n\\nTransparency\\n\\n\\nHow government works\\n\\n\\nGet involved\\n\\n\\n\\n\\n\\n\\n\\nSupport links\\n\\n\\nHelp\\n\\n\\nPrivacy\\n\\n\\nCookies\\n\\n\\nAccessibility statement\\n\\n\\nContact\\n\\n\\nTerms and c