In [19]:
import pickle
import qg
import spacy
from chunking import ChunkPipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

Get transcript chunks and  get slide chunks

In [4]:
# get the whisper chunks
with open("experiments/qg/comp3074_lecture_2.pkl", "rb") as file:
    whisper_chunks = pickle.load(file)['chunks']

# get the slide chunks with timestamps
with open('slide_chunks.pkl', 'rb') as file:
    slide_chunks = pickle.load(file)

# generate trasncript chunks
qg_model = qg.Model.DISCORD
chunk_pipe = ChunkPipeline(qg_model)
transcript_chunks = chunk_pipe(whisper_chunks,2301)

# get last endtime in the slide chunkss
endtime = slide_chunks[-1][2]

Function to compute the relevant chunk scores, we can pass different similarity functions to this method in order to experiment with the different similarity and embedding methods

In [8]:

def compute_similarity_scores(similarity_function):
    i = 0
    relevant_chunks = []

    for j, chunk in enumerate(transcript_chunks):
        if chunk['timestamp'][0] < endtime:
            list_of_slide_indices = []

            while i < len(slide_chunks):
                list_of_slide_indices.append(i)

                if chunk['timestamp'][1] <= slide_chunks[i][2]:
                    transcript_chunk_text = chunk['text']
                    
                    slide_text = ""
                    for index in list_of_slide_indices:
                        slide_text += slide_chunks[index][0]

                    cosine_sim = similarity_function(transcript_chunk_text, slide_text)
                    relevant_chunks.append(cosine_sim)
                    break
                i += 1

    return relevant_chunks

Using Cosine similarity and tfidf vectoriser

In [5]:
def compute_cosine_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    cosine_sim = cosine_similarity(vectors)
    return cosine_sim[0, 1]

In [9]:
print(compute_similarity_scores(compute_cosine_similarity))

[0.13935636174542096, 0.20810105609626248, 0.30331727372819217, 0.3136159539992854, 0.1692906811689349, 0.2708466306878146, 0.2278208149700252, 0.19971738020946844, 0.12529564472501387, 0.36182889890856557, 0.4032467141647036, 0.267555699347143]


Using jaccard Similarity and count vectoriser

In [17]:
def compute_jaccard_similarity(text1, text2):
    # Create a CountVectorizer to convert text to a bag-of-words representation
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2]).toarray()

    # Compute Jaccard similarity between the two text vectors
    intersection = sum(min(v1, v2) for v1, v2 in zip(vectors[0], vectors[1]))
    union = sum(max(v1, v2) for v1, v2 in zip(vectors[0], vectors[1]))

    jaccard_sim = intersection / union if union != 0 else 0
    return jaccard_sim

In [18]:
print(compute_similarity_scores(compute_jaccard_similarity))

[0.05615047799930349, 0.10647049747725855, 0.09931176363294138, 0.11702073932873738, 0.07220244239962925, 0.07875526488897058, 0.08874000039070995, 0.06803963266628628, 0.03956687954642075, 0.12353446839549455, 0.13981652977829875, 0.11071600996552473]


Using word embeddings instead of tf_idf

In [27]:
!python -m spacy download en_core_web_md

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [30]:
def compute_cosine_similarity_word_embeddings(text1, text2):
    # Load spaCy model with word embeddings
    nlp = spacy.load("en_core_web_md")

    # Get the word embeddings for each text
    embeddings1 = nlp(text1).vector.reshape(1, -1)
    embeddings2 = nlp(text2).vector.reshape(1, -1)

    # Compute cosine similarity between the two vectors
    cosine_sim = cosine_similarity(embeddings1, embeddings2)[0, 0]
    return cosine_sim

In [31]:
print(compute_similarity_scores(compute_cosine_similarity_word_embeddings))

[0.4070797, 0.63080883, 0.56591886, 0.6583812, 0.7433294, 0.63322276, 0.7421772, 0.7009978, 0.5173513, 0.752879, 0.7874681, 0.7268001]
