In [1]:
import sklearn
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load the spaCy pipeline once; the resulting `nlp` callable is reused by the
# later cells that parse the full text and each individual sentence.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Raw input corpus: the text of two question papers pasted back to back (the
# question numbering restarts at Q1 partway through). It is kept verbatim,
# including page-break residue ("P.T.O.", repeated watermark lines carrying an
# ID, an IP address and a timestamp, and page/paper codes), so all of that
# noise is handed to the parser unchanged.
# NOTE(review): the watermark lines embed an IP address and an identifier;
# consider redacting them before sharing this notebook.
text = '''Q1) a) List the requirements for good hash function. And explain how it ensures
the integrity of data. [8]
b) List difference between MD5 & SHA. [4]
c) Explain Diffe-Hellman key exchange algorithm with example. [5]
OR
Q2) a) Explain the RSA algorithm with an example. [8]
b) Explain MD5 with an example. [9]
Q3) a) What are the main similarity and differences between qualitative risk
analysis Vs quantitative risk analysis? [9]
b) What is cyber stalking and how to prevent it? Enlist example of cyber
stalking. [9]
OR
Q4) a) What do you mean by ethics in information security? How ethics is
important in information security? What are the ethical issues in
information security? [9]
b) Describe classification of cyber crime in detail. [9]
Q5) a) What is SSL? Explain handshake protocol of SSL? [8]
b) What are various types of firewall? Discuss limitations of firewall. [9]
OR
P.T.O.
CEGP013091
49.248.216.238 28/07/2022 09:06:16 static-238
CEGP013091
49.248.216.238 28/07/2022 09:06:16 static-238
CEGP013091
49.248.216.238 28/07/2022 09:06:16 static-238
2 [5870]-1206
Q6) a) Explain p~p algorithm in details. [8]
b) Describe briefly how IPsec works and enlist it’s applications. Distinguish
between tunnel and transport mode of IPsec. [9]
Q7) a) Explain need and challenges of intrusion detection system. Define signature
based IDS. [9]
b) What is computer worm and virus? How does computer virus spread?
How to protect against computer virus and worms. [9]
OR
Q8) a) Differentiate spyware, Adware & Ransomware. [9]
b) Define phishing. Explain phishing with types and examples.
Q1) a) What is cryptographic hash function? How is it useful in cryptography?
List different cryptographic hash functions. Explain in detail any one
cryptographic hash function.__________. [8]
b) Find the key exchanged between Alok and Bobby considering following
data n = 11, g = 5, x = 2, y = 3. Find the value of A,B & key K. [9]
OR
Q2) a) What are steps carried out in diffie hellman algorithm? List uses,
advantages and disadvantages of diffie hellman algo. [8]
b) What do you mean by Asymmetric cryptography algorithm? Explain RSA
algorithm in detail. [9]
Q3) a) Describe different categories of cybercrime with example. [9]
b) Explain the process of risk identification and risk assessment. [9]
OR
Q4) a) What are the difference between quantitative and qualitative risk analysis
with providing examples. [9]
b) What is cyber stalking? How to identify and detect cyber stalking. [9]
Q5) a) What is SSL? How does SSL works? Why is SSL important. [8]
b) Describe IPSec protocol with its components and security
services. [9]
OR
CEGP013091
49.248.216.238 24/01/2023 13:59:57 static-238
CEGP013091
49.248.216.238 24/01/2023 13:59:57 static-238
CEGP013091
49.248.216.238 24/01/2023 13:59:57 static-238
[5926]-256 2
Q6) a) What is the firewall? How does it works & explain different types of
firewalls. [8]
b) What is email security and why it is necessary? Explain any one algorithm
used for email security. [9]
Q7) a) What is malware? Enlist different types of malware what precaution needs
to protect from malware. [9]
b) What is computer worm or virus? How does computer virus spread?
How to protect against computer virus and norms. [9]
OR
Q8) a) Enlist different types of IDS. Describe any one type of IDS in
detail. [9]
b) Define phishing. Explain phishing with types and examples. [9''' 

In [4]:
# Run the whole raw text through the spaCy pipeline once; the sentence
# boundaries used later are read from this parsed document.
doc = nlp(text)

In [5]:
# Echo the parsed document as a quick sanity check (this renders its full text).
doc

Q1) a) List the requirements for good hash function. And explain how it ensures
the integrity of data. [8]
b) List difference between MD5 & SHA. [4]
c) Explain Diffe-Hellman key exchange algorithm with example. [5]
OR
Q2) a) Explain the RSA algorithm with an example. [8]
b) Explain MD5 with an example. [9]
Q3) a) What are the main similarity and differences between qualitative risk
analysis Vs quantitative risk analysis? [9]
b) What is cyber stalking and how to prevent it? Enlist example of cyber
stalking. [9]
OR
Q4) a) What do you mean by ethics in information security? How ethics is
important in information security? What are the ethical issues in
information security? [9]
b) Describe classification of cyber crime in detail. [9]
Q5) a) What is SSL? Explain handshake protocol of SSL? [8]
b) What are various types of firewall? Discuss limitations of firewall. [9]
OR
P.T.O.
CEGP013091
49.248.216.238 28/07/2022 09:06:16 static-238
CEGP013091
49.248.216.238 28/07/2022 09:06:16 static-238


In [6]:
# Split the parsed document into plain sentence strings, using the sentence
# boundaries spaCy assigned to `doc`.
sentences = [span.text for span in doc.sents]
sentences

['Q1) a) List the requirements for good hash function.',
 'And explain how it ensures\nthe integrity of data.',
 '[8]\nb) List difference between MD5 & SHA.',
 '[4]\nc) Explain Diffe-Hellman key exchange algorithm with example.',
 '[5]\nOR\nQ2) a) Explain the RSA algorithm with an example.',
 '[8]\nb) Explain MD5 with an example.',
 '[9]\nQ3) a)',
 'What are the main similarity and differences between qualitative risk\nanalysis Vs quantitative risk analysis?',
 '[9]\nb) What is cyber stalking and how to prevent it?',
 'Enlist example of cyber\nstalking.',
 '[9]\nOR\nQ4) a)',
 'What do you mean by ethics in information security?',
 'How ethics is\nimportant in information security?',
 'What are the ethical issues in\ninformation security?',
 '[9]\nb) Describe classification of cyber crime in detail.',
 '[9]\nQ5) a)',
 'What is SSL?',
 'Explain handshake protocol of SSL?',
 '[8]\nb)',
 'What are various types of firewall?',
 'Discuss limitations of firewall.',
 '[9]\nOR\nP.T.O.\nCEGP0130

In [7]:
# Remove stop words from each sentence
def remove_stopwords(sentence):
    """Return `sentence` with its stop-word tokens dropped.

    The sentence is re-parsed with the module-level `nlp` pipeline and the
    text of every token whose `is_stop` flag is false is re-joined with a
    single space. Every non-stop token is kept as its own space-separated
    item, so punctuation and newline tokens survive in the result.

    Args:
        sentence: the sentence text to filter.

    Returns:
        A single string made of the surviving token texts joined by spaces.
    """
    kept_tokens = []
    for tok in nlp(sentence):
        if tok.is_stop:
            continue
        kept_tokens.append(tok.text)
    return ' '.join(kept_tokens)

In [8]:
# Apply the stop-word filter to every sentence, preserving sentence order so
# that positions in `cleaned_sentences` line up with positions in `sentences`.
cleaned_sentences = list(map(remove_stopwords, sentences))
cleaned_sentences

['Q1 ) ) List requirements good hash function .',
 'explain ensures \n integrity data .',
 '[ 8 ] \n b ) List difference MD5 & SHA .',
 '[ 4 ] \n c ) Explain Diffe - Hellman key exchange algorithm example .',
 '[ 5 ] \n \n Q2 ) ) Explain RSA algorithm example .',
 '[ 8 ] \n b ) Explain MD5 example .',
 '[ 9 ] \n Q3 ) )',
 'main similarity differences qualitative risk \n analysis Vs quantitative risk analysis ?',
 '[ 9 ] \n b ) cyber stalking prevent ?',
 'Enlist example cyber \n stalking .',
 '[ 9 ] \n \n Q4 ) )',
 'mean ethics information security ?',
 'ethics \n important information security ?',
 'ethical issues \n information security ?',
 '[ 9 ] \n b ) Describe classification cyber crime detail .',
 '[ 9 ] \n Q5 ) )',
 'SSL ?',
 'Explain handshake protocol SSL ?',
 '[ 8 ] \n b )',
 'types firewall ?',
 'Discuss limitations firewall .',
 '[ 9 ] \n \n P.T.O. \n CEGP013091 \n 49.248.216.238 28/07/2022 09:06:16 static-238 \n CEGP013091 \n 49.248.216.238 28/07/2022 09:06:16 static-238 

In [9]:
# Turn the stop-word-free sentences into TF-IDF vectors.
# NOTE: despite its name, `vectorizer` holds the matrix returned by
# fit_transform (one row per sentence), not the TfidfVectorizer object itself.
tfidf_model = TfidfVectorizer()
vectorizer = tfidf_model.fit_transform(cleaned_sentences)
# Dense copy of that matrix, used for the similarity computation.
vectors = vectorizer.toarray()

In [10]:
# Compute cosine similarity between the sentences.
# Called with a single argument, so every row of `vectors` is compared with
# every other row: entry [i, j] is the similarity of sentence i and sentence j.
# The diagonal is 1 except for a sentence whose TF-IDF vector is all zeros
# (its whole row, diagonal included, is 0).
cos_sim_matrix = cosine_similarity(vectors)
cos_sim_matrix

array([[1.        , 0.        , 0.16094524, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.12127212,
        0.        ],
       [0.16094524, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.3695389 ,
        0.        ],
       [0.        , 0.12127212, 0.        , ..., 0.3695389 , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [12]:
# Collect every sentence pair whose cosine similarity exceeds the cutoff.
threshold = 0.4
# Row and column index arrays of all matrix cells above the cutoff.
pairs_above_threshold = np.nonzero(cos_sim_matrix > threshold)
# Keep only cells above the diagonal (row < col): this drops each sentence's
# match with itself and the mirrored duplicate of every symmetric pair.
pairs = [(row, col) for row, col in zip(*pairs_above_threshold) if row < col]

In [13]:
# Group sentences based on their similarity scores
def _merge_pairs_into_groups(index_pairs):
    """Merge index pairs into disjoint groups of transitively linked indices.

    Two indices land in the same group whenever a chain of pairs connects
    them. A pair that touches two existing groups (one holding `i`, another
    holding `j`) fuses those groups; stopping at the first matching group
    instead would leave the same index in more than one group.

    Args:
        index_pairs: iterable of (i, j) sentence-index pairs.

    Returns:
        A list of sets of indices. The sets are pairwise disjoint and appear
        in the order in which each group was first created.
    """
    merged_groups = []
    for i, j in index_pairs:
        pair = {i, j}
        touching = [g for g in merged_groups if not g.isdisjoint(pair)]
        if not touching:
            merged_groups.append(pair)
            continue
        # Grow the earliest touching group in place so it keeps its position,
        # then fold every other touching group into it.
        keeper, *absorbed = touching
        keeper |= pair
        for extra in absorbed:
            keeper |= extra
        if absorbed:
            # Drop the absorbed groups by identity; they now live inside `keeper`.
            merged_groups = [
                g for g in merged_groups
                if all(g is not extra for extra in absorbed)
            ]
    return merged_groups


groups = _merge_pairs_into_groups(pairs)

In [14]:
# Display the groups of similar sentences.
# The header reports the actual `threshold` value rather than a hard-coded
# number, so it stays truthful if the cutoff is changed.
print(f"Groups of similar sentences (cosine similarity > {threshold}):")
for idx, group in enumerate(groups):
    print(f"\nGroup {idx + 1}:")
    # Sets iterate in hash order, not index order; sort so members are listed
    # in the order the sentences occur in the source text.
    for sentence_idx in sorted(group):
        print(f"Sentence: {sentences[sentence_idx]}")

Groups of similar sentences (cosine similarity > 0.4):

Group 1:
Sentence: Q1) a) List the requirements for good hash function.
Sentence: Q1)
Sentence: What is cryptographic hash function?
Sentence: List different cryptographic hash functions.
Sentence: Explain in detail any one
cryptographic hash function.__________.

Group 2:
Sentence: [5]
OR
Q2) a) Explain the RSA algorithm with an example.
Sentence: Explain RSA
algorithm in detail.

Group 3:
Sentence: [9]
Q3) a) Describe different categories of cybercrime with example.
Sentence: [9]
Q3) a)

Group 4:
Sentence: What are the difference between quantitative and qualitative risk analysis
with providing examples.
Sentence: What are the main similarity and differences between qualitative risk
analysis Vs quantitative risk analysis?

Group 5:
Sentence: [9]
b) What is cyber stalking and how to prevent it?
Sentence: Enlist example of cyber
stalking.
Sentence: [9]
b) What is cyber stalking?
Sentence: How to identify and detect cyber stalking.

In [38]:
# Find the top 10 most similar sentence pairs
# similarity_scores = cos_sim_matrix[np.triu_indices_from(cos_sim_matrix, k=1)]
# top_indices = np.argsort(similarity_scores)[-10:][::-1]
# most_similar_pairs = [(np.triu_indices_from(cos_sim_matrix, k=1)[0][i], 
#                        np.triu_indices_from(cos_sim_matrix, k=1)[1][i]) for i in top_indices]

In [39]:
# Display the top 10 most similar sentence pairs
# print("Top 5 most similar sentences:")
# for idx, (i, j) in enumerate(most_similar_pairs):
#     print(f"\nPair {idx + 1}:")
#     print(f"Sentence 1: {sentences[i]}")
#     print(f"Sentence 2: {sentences[j]}")
#     print(f"Cosine Similarity: {similarity_scores[top_indices[idx]]:.4f}")