In [15]:
from collections import Counter

import numpy as np
import pandas as pd

In [16]:
# Load dataset
# NOTE(review): hard-coded absolute Google Drive path — presumably requires
# Drive to be mounted in Colab; confirm before running in another environment.
path = '/content/drive/MyDrive/Colab Notebooks/CSC 820/transcripts.csv'
# Read the CSV into a DataFrame; a later cell reads its 'transcript' column.
data = pd.read_csv(path)

In [17]:
# Adjust sample size depending on amount of time you have to run program
# frac=0.01 keeps 1% of the rows; the fixed random_state makes the draw repeatable.
reduced_data = data.sample(frac=0.01, random_state=42)
# Plain Python list of the sampled 'transcript' values, one entry per sampled row.
# NOTE(review): assumes every transcript is a string (no missing values) — confirm.
documents = reduced_data['transcript'].tolist()
# Query strings
# Only query1 is used by the cells shown here; query2 and query3 are alternatives.
query1 = "life learning"
query2 = "hard work"
query3 = "self actualization"

In [18]:
# Normalized Term Frequency
def termFrequency(term, document):
    """Return the case-insensitive normalized frequency of ``term`` in ``document``.

    The document is lower-cased and split on whitespace; the result is the
    number of tokens equal to the lower-cased term divided by the total number
    of tokens.

    Args:
        term: Word to count.
        document: Text to search.

    Returns:
        float: Share of the document's tokens equal to ``term``, or 0.0 when
        the document contains no tokens.
    """
    tokens = document.lower().split()
    if not tokens:
        # An empty or whitespace-only document has no terms; avoid dividing by zero.
        return 0.0
    return tokens.count(term.lower()) / float(len(tokens))


In [19]:
# Compute normalized term frequency for a list of documents
def compute_normalizedtf(documents):
    """Build one normalized term-frequency dictionary per document.

    Each document is split on whitespace. Every distinct token, exactly as
    written in the text, becomes a key; its value is the case-insensitive
    count of that token divided by the document's total number of tokens.
    Tokens that differ only in case are separate keys sharing the same value.

    Args:
        documents: Iterable of document strings.

    Returns:
        list[dict[str, float]]: One dictionary per document, in input order.
        A document with no tokens yields an empty dictionary.
    """
    tf_doc = []
    for txt in documents:
        # Count the lower-cased tokens once per document instead of rescanning
        # the whole text for every token.
        lowered_tokens = txt.lower().split()
        counts = Counter(lowered_tokens)
        total = float(len(lowered_tokens))
        tf_doc.append(
            {word: counts[word.lower()] / total for word in txt.split()}
        )
    return tf_doc
# Per-document term-frequency dictionaries, in the same order as `documents`.
tf_doc = compute_normalizedtf(documents)

In [20]:
# Compute term frequency for the query string
def compute_query_tf(query):
    """Map each whitespace-separated token of ``query`` to its term frequency.

    Keys are the tokens exactly as written in the query; each value is
    ``termFrequency(token, query)``, i.e. the token's frequency measured
    against the query itself.

    Args:
        query: Query string.

    Returns:
        dict: Token -> normalized frequency within the query (empty when the
        query has no tokens).
    """
    return {token: termFrequency(token, query) for token in query.split()}

# Term frequencies for the first query string (query1).
query_norm_tf = compute_query_tf(query1)

In [21]:
# Calculate cosine similarity using only term frequency
def cosine_similarity(query_tf, document_tf, query):
    """Cosine similarity between a query and a document over the query's terms.

    Only the terms that occur in ``query`` are used as dimensions: the dot
    product and BOTH vector norms are accumulated over those terms alone, so
    document terms that are not part of the query do not affect the score.

    Args:
        query_tf: Mapping of query term -> term frequency in the query.
        document_tf: Mapping of document term -> term frequency in the document.
        query: Query string; its whitespace-separated tokens select the terms.

    Returns:
        float: Similarity score, or 0.0 when either restricted vector has zero
        magnitude (for example, the document contains none of the query terms).
    """
    # Each distinct query term is one dimension. Iterating over repeated tokens
    # would count that dimension more than once, so de-duplicate (order kept).
    terms = dict.fromkeys(query.split())

    dot_product = 0.0
    query_sq_sum = 0.0
    doc_sq_sum = 0.0
    for word in terms:
        word_tf_query = query_tf.get(word, 0)
        word_tf_doc = document_tf.get(word, 0)
        dot_product += word_tf_query * word_tf_doc
        query_sq_sum += word_tf_query ** 2
        doc_sq_sum += word_tf_doc ** 2

    denominator = np.sqrt(query_sq_sum) * np.sqrt(doc_sq_sum)
    if denominator == 0:
        return 0.0  # To handle division by zero if either vector magnitude is zero
    return dot_product / denominator

# Compute cosine similarity for each document
def rank_similarity_docs(documents, query_tf, query=None):
    """Score every document against a query.

    Term frequencies are computed from the ``documents`` argument itself, so
    the result does not depend on module-level variables left over from other
    cells.

    Args:
        documents: Iterable of document strings.
        query_tf: Mapping of query term -> term frequency in the query.
        query: Optional query string used to select the terms compared. When
            omitted, the keys of ``query_tf`` are used.

    Returns:
        list: One similarity score per document, in input order (the scores
        are not sorted).
    """
    if query is None:
        # The keys of query_tf are the query's own tokens, so joining them
        # rebuilds a query string containing each term once.
        query = " ".join(query_tf)
    return [
        cosine_similarity(query_tf, doc_tf, query)
        for doc_tf in compute_normalizedtf(documents)
    ]

# Score the sampled documents against the query term frequencies.
similarity_scores = rank_similarity_docs(documents, query_norm_tf)

# Print the similarity scores
# Documents are numbered from 1 in list order; scores are shown to 4 decimal places.
for doc_index, score in enumerate(similarity_scores):
    print(f"Document {doc_index + 1}: Cosine Similarity = {score:.4f}")

Document 1: Cosine Similarity = 0.0000
Document 2: Cosine Similarity = 0.9487
Document 3: Cosine Similarity = 0.7071
Document 4: Cosine Similarity = 0.0000
Document 5: Cosine Similarity = 0.0000
Document 6: Cosine Similarity = 0.8321
Document 7: Cosine Similarity = 0.7071
Document 8: Cosine Similarity = 0.7071
Document 9: Cosine Similarity = 0.0000
Document 10: Cosine Similarity = 0.7071
Document 11: Cosine Similarity = 0.0000
Document 12: Cosine Similarity = 1.0000
Document 13: Cosine Similarity = 0.0000
Document 14: Cosine Similarity = 0.7071
Document 15: Cosine Similarity = 0.0000
Document 16: Cosine Similarity = 0.0000
Document 17: Cosine Similarity = 0.0000
Document 18: Cosine Similarity = 0.7071
Document 19: Cosine Similarity = 0.7071
Document 20: Cosine Similarity = 0.7071
Document 21: Cosine Similarity = 0.0000
Document 22: Cosine Similarity = 0.0000
Document 23: Cosine Similarity = 0.0000
Document 24: Cosine Similarity = 0.0000
Document 25: Cosine Similarity = 0.0000
