In [2]:
import nltk
# Fetch the 'punkt_tab' tokenizer resources into the local NLTK data directory.
# NOTE(review): presumably required by word_tokenize below — confirm for the installed NLTK version.
nltk.download('punkt_tab') # Download the 'punkt_tab' data for sentence tokenization

import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from math import log, sqrt

# Sample documents: a tiny toy corpus used to sanity-check the TF-IDF pipeline
documents = [
    "cat dog mouse",
    "dog tiger cat",
    "mouse elephant dog"
]

# Query to rank the documents against
query = "cat dog"

# Tokenize documents and query (lower-cased so matching is case-insensitive)
doc_tokens = [word_tokenize(doc.lower()) for doc in documents]
query_tokens = word_tokenize(query.lower())

# Build vocabulary from the documents AND the query. Query terms that never
# occur in a document must still be in the vocabulary, otherwise the later
# idf[word] lookup for the query raises KeyError on out-of-corpus terms.
vocab = set(word for doc in doc_tokens for word in doc)
vocab.update(query_tokens)

# Compute TF
def compute_tf(tokens):
    """Return log-scaled term frequencies for a list of tokens.

    Each distinct token maps to ``1 + ln(count)``, where ``count`` is the
    number of times it occurs in ``tokens``. An empty token list yields an
    empty dict.
    """
    weights = {}
    for term, occurrences in Counter(tokens).items():
        if occurrences > 0:
            weights[term] = 1 + log(occurrences)
        else:
            weights[term] = 0
    return weights

# Log-scaled term frequencies for every document and for the query
doc_tf = [compute_tf(doc) for doc in doc_tokens]
query_tf = compute_tf(query_tokens)

# Compute IDF
N = len(documents)
# Document frequency: in how many documents each word occurs. Counting over
# set(doc) touches every token once, instead of scanning every token list
# for every vocabulary word.
doc_freq = Counter(word for doc in doc_tokens for word in set(doc))
# Smoothed IDF: 1 + ln(N / df); words that occur in no document get 0.
idf = {
    word: 1 + log(N / doc_freq[word]) if doc_freq[word] > 0 else 0
    for word in vocab
}

# Compute TF-IDF (the TF dicts are re-weighted in place)
for tf in doc_tf:
    for word in tf:
        tf[word] *= idf[word]
for word in query_tf:
    # A query term missing from the vocabulary contributes nothing rather
    # than raising KeyError.
    query_tf[word] *= idf.get(word, 0)

# Convert to vectors
def to_vector(tf_dict):
    """Lay out a term-weight dict as a dense NumPy array over the global ``vocab``.

    The array has one component per term in ``vocab``, in ``vocab``'s
    iteration order; terms missing from ``tf_dict`` contribute 0.
    """
    components = []
    for term in vocab:
        components.append(tf_dict.get(term, 0))
    return np.array(components)

# One dense TF-IDF vector per document, plus one for the query
doc_vectors = list(map(to_vector, doc_tf))
query_vector = to_vector(query_tf)

# Compute Cosine Similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    If either vector has zero Euclidean norm the similarity is undefined,
    and 0 is returned instead.
    """
    magnitude1 = sqrt(np.dot(vec1, vec1))
    magnitude2 = sqrt(np.dot(vec2, vec2))
    if not magnitude1 or not magnitude2:
        return 0
    return np.dot(vec1, vec2) / (magnitude1 * magnitude2)

# Score every document vector against the query vector
similarities = list(map(lambda doc_vector: cosine_similarity(query_vector, doc_vector), doc_vectors))

# Print one line per document, numbered from 1
for i in range(len(similarities)):
    score = similarities[i]
    print(f"Similarity with Document {i+1}: {score:.4f}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Similarity with Document 1: 0.7752
Similarity with Document 2: 0.6350
Similarity with Document 3: 0.2134


In [3]:
import pandas as pd

# Read the stories spreadsheet export into a DataFrame
dataset_file = "/content/stories - Sheet1.csv"
df = pd.read_csv(filepath_or_buffer=dataset_file)

# Show which columns the dataset provides
print("Column Names:", df.columns.to_list())


Column Names: ['Title', 'Story-text', 'Moral']


In [7]:
import numpy as np
import nltk
import cv2
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from math import log, sqrt

# Load dataset from a CSV file
def load_documents(filename, column_name):
    """Read one column of a CSV file as a list of strings.

    Rows whose value in ``column_name`` is missing are dropped; the
    remaining values are converted to ``str``.
    """
    frame = pd.read_csv(filename)
    non_missing = frame[column_name].dropna()
    return non_missing.astype(str).tolist()

# Dataset location (replace with your actual dataset file path)
dataset_file = "/content/stories - Sheet1.csv"
# The story bodies are in the "Story-text" column
documents = load_documents(dataset_file, "Story-text")

# Search query
query = "leo"

# Lower-case and tokenize each story and the query
doc_tokens = [word_tokenize(text.lower()) for text in documents]
query_tokens = word_tokenize(query.lower())

# Vocabulary = every token seen in any document, plus the query's tokens
vocab = {token for tokens in doc_tokens for token in tokens} | set(query_tokens)

# Compute TF
def compute_tf(tokens):
    """Return log-scaled term frequencies for a list of tokens.

    Each distinct token maps to ``1 + ln(count)``, where ``count`` is the
    number of times it occurs in ``tokens``. An empty token list yields an
    empty dict.
    """
    weights = {}
    for term, occurrences in Counter(tokens).items():
        if occurrences > 0:
            weights[term] = 1 + log(occurrences)
        else:
            weights[term] = 0
    return weights

# Log-scaled term frequencies for every document and for the query
doc_tf = [compute_tf(doc) for doc in doc_tokens]
query_tf = compute_tf(query_tokens)

# Compute IDF
N = len(documents)
# Document frequency: in how many documents each word occurs. Counting over
# set(doc) touches every token once, instead of scanning every token list
# for every vocabulary word. It is stored under its own name so the
# DataFrame called `df` from the dataset-inspection cell is not overwritten
# with an int.
doc_freq = Counter(word for doc in doc_tokens for word in set(doc))
# Smoothed IDF: 1 + ln(N / df); words that occur in no document (e.g. a
# query-only term) get 0.
idf = {
    word: 1 + log(N / doc_freq[word]) if doc_freq[word] > 0 else 0
    for word in vocab
}

# Compute TF-IDF (the TF dicts are re-weighted in place)
for tf in doc_tf:
    for word in tf:
        tf[word] *= idf[word]
for word in query_tf:
    query_tf[word] *= idf[word]

# Convert to vectors
def to_vector(tf_dict):
    """Lay out a term-weight dict as a dense NumPy array over the global ``vocab``.

    The array has one component per term in ``vocab``, in ``vocab``'s
    iteration order; terms missing from ``tf_dict`` contribute 0.
    """
    components = []
    for term in vocab:
        components.append(tf_dict.get(term, 0))
    return np.array(components)

# One dense TF-IDF vector per document, plus one for the query
doc_vectors = list(map(to_vector, doc_tf))
query_vector = to_vector(query_tf)

# Compute Cosine Similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    If either vector has zero Euclidean norm the similarity is undefined,
    and 0 is returned instead.
    """
    magnitude1 = sqrt(np.dot(vec1, vec1))
    magnitude2 = sqrt(np.dot(vec2, vec2))
    if not magnitude1 or not magnitude2:
        return 0
    return np.dot(vec1, vec2) / (magnitude1 * magnitude2)

# Score every document vector against the query vector
similarities = list(map(lambda doc_vector: cosine_similarity(query_vector, doc_vector), doc_vectors))

# Print one line per document, numbered from 1
for i in range(len(similarities)):
    score = similarities[i]
    print(f"Similarity with Document {i+1}: {score:.4f}")

Similarity with Document 1: 0.0000
Similarity with Document 2: 0.0000
Similarity with Document 3: 0.0000
Similarity with Document 4: 0.0000
Similarity with Document 5: 0.0000
Similarity with Document 6: 0.0000
Similarity with Document 7: 0.0000
Similarity with Document 8: 0.0000
Similarity with Document 9: 0.0000
Similarity with Document 10: 0.0000
Similarity with Document 11: 0.0000
Similarity with Document 12: 0.0000
Similarity with Document 13: 0.0000
Similarity with Document 14: 0.0000
Similarity with Document 15: 0.0000
Similarity with Document 16: 0.0000
Similarity with Document 17: 0.0000
Similarity with Document 18: 0.0000
Similarity with Document 19: 0.0000
Similarity with Document 20: 0.0000
Similarity with Document 21: 0.0000
Similarity with Document 22: 0.0000
Similarity with Document 23: 0.0000
Similarity with Document 24: 0.0000
Similarity with Document 25: 0.0000
Similarity with Document 26: 0.0000
Similarity with Document 27: 0.0000
Similarity with Document 28: 0.0000
S