In [4]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
from collections import defaultdict
from nltk.tokenize import word_tokenize  # Assuming NLTK for tokenization

def tokenize(text):
    """Tokenizes text into lowercase words, removing punctuation."""
    return [word.lower() for word in word_tokenize(text) if word.isalnum()]

def create_vocabulary_and_tf(data_path):
    """
    Reads data from a CSV file, creates vocabulary, and calculates TF for each document.

    Args:
        data_path (str): Path to the CSV file containing text data.

    Returns:
        tuple: (vocabulary, tf_representations)
            - vocabulary (dict): Maps terms to unique IDs.
            - tf_representations (list): List of dictionaries representing TF for each document.
    """

    vocabulary = defaultdict(int)  # Track term counts for vocabulary creation
    tf_representations = []

    # Read data from CSV file
    with open(data_path, 'r') as f:
        next(f)  # Skip header row
        for line in f:
            parts = line.strip().split(',')  # Split each line based on commas
            # Check if the line has enough parts
            if len(parts) < 4:
                continue  # Skip this line if it doesn't have enough parts

            # Extract text content from the desired column (e.g., SECTION_TEXT)
            text = parts[3]

            # Tokenize text and create TF representation for the document
            document_tf = {}
            for term in tokenize(text):
                term_id = vocabulary.get(term)  # Get existing ID or create new one
                if not term_id:
                    term_id = len(vocabulary)
                    vocabulary[term] = term_id
                document_tf[term_id] = document_tf.get(term_id, 0) + 1  # Count term frequency

            tf_representations.append(document_tf)

    return vocabulary, tf_representations


# File path to the CSV file
sample_file_path = "sample.csv"

# Call the function with the file path
vocabulary, tf_representations = create_vocabulary_and_tf(sample_file_path)

# Display the results
print("Vocabulary:", vocabulary)
print("Sample TF representations:")
for i, tf_representation in enumerate(tf_representations, start=1):
    print(f"Document {i}:", tf_representation)



Vocabulary: defaultdict(<class 'int'>, {'the': 9, 'game': 1, 'takes': 2, 'place': 3, 'in': 4, 'a': 5, 'future': 6, 'where': 7, 'after': 8, 'great': 9, 'nuclear': 10, 'war': 11, 'of': 12, '2015': 13, 'peace': 14, 'treaties': 15, 'were': 16, 'signed': 17, 'weapons': 18, 'destroyed': 19, 'and': 20, 'civilization': 21, 'was': 22, 'slowly': 23, 'rebuilt': 24, 'however': 25, '8': 26, 'years': 27, 'fleet': 28, 'alien': 29, 'ships': 30, 'appeared': 31, 'outside': 32, 'earth': 33, 'atmosphere': 34, 'several': 35, 'super': 36, 'fired': 37, 'from': 38, 'largest': 39, 'spacecraft': 40, 'resulting': 41, 'explosions': 42, 'causing': 43, 'major': 44, 'alterations': 45, 'to': 46, 'geography': 47, 'fearing': 48, 'for': 49, 'lives': 50, 'every': 51, 'human': 52, 'on': 53, 'united': 54, 'states': 55, 'government': 56, 'unleashed': 57, 'its': 58, 'hidden': 59, 'arsenal': 60, 'rained': 61, 'down': 62, 'onto': 63, 'planet': 64, 'buildings': 65, 'be': 66, 'air': 67, 'poisoned': 68, 'by': 69, 'radiation': 70,

IDF


In [6]:
import math

def create_vocabulary_and_tf(data_path):
    """
    Reads data from a CSV file, creates vocabulary, calculates TF for each document,
    and computes IDF for each term.

    Args:
        data_path (str): Path to the CSV file containing text data.

    Returns:
        tuple: (vocabulary, tf_representations, idf_values)
            - vocabulary (dict): Maps terms to unique IDs.
            - tf_representations (list): List of dictionaries representing TF for each document.
            - idf_values (dict): IDF values for each term in the vocabulary.
    """

    vocabulary = defaultdict(int)  # Track term counts for vocabulary creation
    tf_representations = []
    document_count = 0
    idf_values = {}

    # Read data from CSV file
    with open(data_path, 'r') as f:
        next(f)  # Skip header row
        for line in f:
            parts = line.strip().split(',')  # Split each line based on commas
            # Check if the line has enough parts
            if len(parts) < 4:
                continue  # Skip this line if it doesn't have enough parts

            # Extract text content from the desired column (e.g., SECTION_TEXT)
            text = parts[3]

            # Tokenize text and create TF representation for the document
            document_tf = {}
            for term in tokenize(text):
                term_id = vocabulary.get(term)  # Get existing ID or create new one
                if not term_id:
                    term_id = len(vocabulary)
                    vocabulary[term] = term_id
                    idf_values[term_id] = 0  # Initialize IDF value for new term
                document_tf[term_id] = document_tf.get(term_id, 0) + 1  # Count term frequency

            tf_representations.append(document_tf)
            document_count += 1

    # Calculate IDF for each term
    for term_id in idf_values:
        df = sum(1 for doc_tf in tf_representations if term_id in doc_tf)
        idf_values[term_id] = math.log(document_count / (df + 1))

    return vocabulary, tf_representations, idf_values

# File path to the CSV file
sample_file_path = "sample.csv"

# Call the function with the file path
vocabulary, tf_representations, idf_values = create_vocabulary_and_tf(sample_file_path)

# Display the results
print("Vocabulary:", vocabulary)
print("IDF values for terms in the vocabulary:")
for term, idf in idf_values.items():
    print(f"{term}: {idf}")

print("Sample TF representations:")
for i, doc_tf in enumerate(tf_representations, start=1):
    print(f"Document {i}: {doc_tf}")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1442: 6.214608098422191
1443: 6.214608098422191
1444: 6.214608098422191
1445: 6.214608098422191
1446: 6.214608098422191
1447: 6.214608098422191
1448: 6.214608098422191
1449: 5.521460917862246
1450: 5.809142990314028
1451: 6.214608098422191
1452: 5.115995809754082
1453: 5.809142990314028
1454: 6.214608098422191
1455: 6.214608098422191
1456: 6.214608098422191
1457: 6.214608098422191
1458: 6.214608098422191
1459: 5.521460917862246
1460: 6.214608098422191
1461: 5.521460917862246
1462: 5.298317366548036
1463: 5.809142990314028
1464: 6.214608098422191
1465: 5.521460917862246
1466: 5.809142990314028
1467: 4.268697949366879
1468: 6.214608098422191
1469: 5.521460917862246
1470: 4.710530701645918
1471: 6.214608098422191
1472: 6.214608098422191
1473: 5.809142990314028
1474: 5.521460917862246
1475: 6.214608098422191
1476: 5.298317366548036
1477: 5.809142990314028
1478: 6.214608098422191
1479: 6.214608098422191
1480: 6.214608098422191

In [7]:
def calculate_tf_idf(tf_representations, idf_values):
    """
    Calculates TF-IDF weights for each term in each document.

    Args:
        tf_representations (list): List of dictionaries representing TF for each document.
        idf_values (dict): Dictionary containing IDF values for each term in the vocabulary.

    Returns:
        list: List of dictionaries representing TF-IDF weights for each term in each document.
    """
    tf_idf_weights = []

    for document_tf in tf_representations:
        document_tf_idf = {}
        for term_id, tf in document_tf.items():
            idf = idf_values.get(term_id, 0.0)  # Get IDF value for the term
            tf_idf = tf * idf  # Calculate TF-IDF weight
            document_tf_idf[term_id] = tf_idf
        tf_idf_weights.append(document_tf_idf)

    return tf_idf_weights

# Calculate TF-IDF weights
tf_idf_weights = calculate_tf_idf(tf_representations, idf_values)

# Display TF-IDF weights for each document
for i, document_weights in enumerate(tf_idf_weights, start=1):
    print(f"Document {i} TF-IDF weights:")
    for term_id, weight in document_weights.items():
        print(f"Term ID: {term_id}, TF-IDF Weight: {weight}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Term ID: 1101, TF-IDF Weight: 5.521460917862246
Term ID: 247, TF-IDF Weight: 5.729408022295174
Term ID: 2333, TF-IDF Weight: 4.961845129926823
Term ID: 1765, TF-IDF Weight: 5.809142990314028
Term ID: 4021, TF-IDF Weight: 12.429216196844383
Term ID: 67, TF-IDF Weight: 5.298317366548036
Term ID: 4022, TF-IDF Weight: 6.214608098422191
Term ID: 22, TF-IDF Weight: 1.9805015938249324
Term ID: 4023, TF-IDF Weight: 5.521460917862246
Term ID: 46, TF-IDF Weight: 1.8263509139976741
Term ID: 581, TF-IDF Weight: 4.509860006183766
Term ID: 961, TF-IDF Weight: 4.961845129926823
Term ID: 788, TF-IDF Weight: 5.809142990314028
Term ID: 4024, TF-IDF Weight: 6.214608098422191
Term ID: 4025, TF-IDF Weight: 6.214608098422191
Document 636 TF-IDF weights:
Document 637 TF-IDF weights:
Document 638 TF-IDF weights:
Document 639 TF-IDF weights:
Document 640 TF-IDF weights:
Document 641 TF-IDF weights:
Document 642 TF-IDF weights:
Document 643 TF-IDF

In [8]:
import numpy as np

def tf_idf_to_vector(tf_idf_weights, vocabulary_size):
    """
    Transform TF-IDF representations into vector form for both documents and queries.

    Args:
        tf_idf_weights (list): List of dictionaries representing TF-IDF weights for each term in each document.
        vocabulary_size (int): Size of the vocabulary.

    Returns:
        numpy.ndarray: 2D array where each row represents a document or query as a vector of TF-IDF weights.
    """
    vectors = []
    for document_weights in tf_idf_weights:
        vector = np.zeros(vocabulary_size)
        for term_id, weight in document_weights.items():
            vector[term_id] = weight
        vectors.append(vector)
    return np.array(vectors)

# Determine the size of the vocabulary
vocabulary_size = len(vocabulary)

# Convert TF-IDF representations to vectors
document_vectors = tf_idf_to_vector(tf_idf_weights, vocabulary_size)

# Display the vectors
for i, vector in enumerate(document_vectors, start=1):
    print(f"Document {i} vector representation:")
    print(vector)


Document 1 vector representation:
[6.2146081  4.7105307  5.52146092 ... 0.         0.         0.        ]
Document 2 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 3 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 4 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 5 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 6 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 7 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 8 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 9 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 10 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 11 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 12 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 13 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 14 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 15 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 16 vector representation:
[0. 0. 0. ... 0. 0. 0.]
Document 17 vecto