In [1]:
# Import necessary libraries
import re
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
import numpy as np
import nltk
import os

In [2]:
# Download the NLTK tokenizer resources (only needed once per runtime)
nltk.download('punkt')
nltk.download('punkt_tab')

# Mount Google Drive at /content/drive (Colab only) so the project files are reachable
# from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive


In [3]:
# The path to file on Google Drive
def get_file_path(file_name, base_path='/content/drive/MyDrive/summarization/'):
    """
    Build the path of a project file and report whether it exists.

    Args:
        file_name: Name of the file, relative to ``base_path``.
        base_path: Directory that holds the project files. Defaults to the
            Google Drive folder this notebook was written against.

    Returns:
        The joined path. It is returned even when the file is missing (only
        a message is printed), so the return value is not proof of existence.
    """
    file_path = os.path.join(base_path, file_name)

    if os.path.isfile(file_path):
        print(f"{file_name} located successfully!")
    else:
        print(f"{file_name} not found. Please check the path.")
    return file_path

In [4]:
# Step 1: Preprocess the CSV File
def preprocess_csv(file_path):
    """
    Load a CSV file and tokenize every non-null row of its 'content' column.

    Each row is lowercased, stripped of every character other than a-z, 0-9
    and whitespace, whitespace-normalized, and then tokenized with NLTK.

    Args:
        file_path: Path to a CSV file that has a 'content' column.

    Returns:
        A list with one token list per non-null 'content' row, in file order.

    Raises:
        KeyError: If the CSV has no 'content' column.
    """
    # Read 'content' as text: a purely numeric column would otherwise be parsed
    # as int/float, and those values have no .strip()/.lower().
    df = pd.read_csv(file_path, dtype={'content': str})
    sentences = []

    for content in tqdm(df['content'].dropna(), desc="Tokenizing CSV", unit="row"):
        content = content.strip().lower()  # Lowercase and strip whitespace
        # Delete everything except a-z, 0-9 and whitespace. Characters are removed,
        # not replaced by a space, so words separated only by punctuation are
        # merged (e.g. "use.last" becomes "uselast").
        content = re.sub(r'[^a-z0-9\s]', '', content)
        content = re.sub(r'\s+', ' ', content).strip()  # Collapse whitespace runs into single spaces
        tokens = nltk.word_tokenize(content)  # Tokenize the content using NLTK
        sentences.append(tokens)

    return sentences

In [5]:
# # Step 1: Preprocess the CSV File
# def preprocess_csv(file_path):
#     """
#     Load CSV file and preprocess the 'content' column.
#     """
#     # Define function for separating concatenated words
#     def separate_concatenated_words(text):
#         text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
#         text = re.sub(r'(?<=\w)(?=[A-Z])', ' ', text)
#         text = re.sub(r'(?<=\w)(?=\d)', ' ', text)
#         text = re.sub(r'(?<=\d)(?=\D)', ' ', text)
#         return text

#     df = pd.read_csv(file_path)
#     sentences = []

#     # Iterate through each row in the 'content' column, removing NaN values
#     for content in tqdm(df['content'].dropna(), desc="Tokenizing CSV", unit="row"):
#         content = content.lower().strip()  # Lowercase and strip whitespace
#         processed_lines = []

#         # Split the content by lines
#         for line in content.splitlines():
#             # Remove non-ASCII characters
#             line = re.sub(r'[^\x00-\x7F]+', '', line)
#             # Remove numbered list periods (e.g., 1.)
#             line = re.sub(r'(\d+)\.', r'\1', line)
#             # Split lines into sentences
#             sentences_in_line = re.split(r'(?<=[.!?]) +', line)

#             for sentence in sentences_in_line:
#                 # Remove special characters while keeping alphanumeric characters, periods, and commas
#                 sentence = re.sub(r'[^a-zA-Z0-9\s.,]', '', sentence)  # Allow alphanumeric, spaces, periods, and commas
#                 if sentence.strip():
#                     # Separate concatenated words
#                     sentence = separate_concatenated_words(sentence)
#                     # Ensure the word "I" is capitalized
#                     sentence = re.sub(r'\bi\b', 'I', sentence)
#                     # Append the cleaned sentence to the list
#                     processed_lines.append(sentence.strip())

#         # Join the processed lines into a single string
#         processed_text = "\n".join(processed_lines)

#         # Tokenization: split by whitespace, which keeps punctuation attached
#         tokens = re.findall(r'\S+[,\.]?', processed_text)  # This regex will keep periods and commas attached
#         sentences.append(tokens)

#     return sentences

In [6]:
# Step 2: Split the Sentences into Training and Validation Sets
def split_data(sentences, test_size=0.25, random_state=42):
    """
    Split the tokenized sentences into training and validation sets.

    Args:
        sentences: List of token lists to split.
        test_size: Share (or absolute count) of items assigned to the
            validation set; forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value that was previously hard-coded, so existing calls
            produce the same split as before.

    Returns:
        A ``(train_sentences, val_sentences)`` tuple.
    """
    train_sentences, val_sentences = train_test_split(
        sentences, test_size=test_size, random_state=random_state
    )
    return train_sentences, val_sentences

In [7]:
# Step 3: Build the Vocabulary from the Training Data
def build_vocab(sentences):
    """
    Build the vocabulary from a list of tokenized sentences.

    Words are numbered from 1 in order of first appearance; index 0 is
    reserved for the "<unk>" token used for unseen words.

    Args:
        sentences: Iterable of token lists.

    Returns:
        A dict mapping each word to a unique index. The indices are exactly
        ``0 .. len(vocab) - 1``, so the dict can address a matrix that has
        ``len(vocab)`` rows.
    """
    counter = Counter()
    for sentence in tqdm(sentences, desc="Building Vocabulary", unit="sentence"):
        counter.update(sentence)  # Count word frequencies

    # Skip a literal "<unk>" token found in the corpus: it would otherwise take a
    # positional index that is then overwritten with 0, leaving a gap and pushing
    # the largest index up to len(vocab) (one past the last valid matrix row).
    known_words = (word for word in counter if word != "<unk>")
    vocab = {word: i for i, word in enumerate(known_words, start=1)}
    vocab["<unk>"] = 0  # Add an unknown token for unseen words
    return vocab

In [8]:
# Step 4: Create GloVe Embeddings Using gensim and Align with the Vocabulary
def create_glove_embeddings(vocab, glove_path, dim=100, seed=None):
    """
    Create GloVe embeddings and align them with the custom vocabulary.

    Args:
        vocab: Dict mapping each word to its row index in the result.
        glove_path: Path to a headerless, text-format GloVe vector file.
        dim: Expected vector dimensionality; must match the loaded vectors.
        seed: Optional seed for the random vectors given to words that are
            missing from GloVe. ``None`` (the default) draws from NumPy's
            global random state.

    Returns:
        A ``(max index + 1, dim)`` float array whose row ``vocab[word]`` holds
        the GloVe vector of ``word`` when one exists, otherwise a random
        standard-normal vector. Rows that no word points at stay zero.

    Raises:
        ValueError: If ``dim`` differs from the dimensionality of the vectors
            stored in ``glove_path``.
    """
    # Load pre-trained GloVe vectors using gensim's KeyedVectors
    glove_vectors = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)

    # Fail early with a clear message instead of a broadcasting error in the loop
    # (or, if no word matches at all, a matrix of the wrong width).
    if glove_vectors.vector_size != dim:
        raise ValueError(
            f"dim={dim} does not match the {glove_vectors.vector_size}-dimensional "
            f"vectors in '{glove_path}'."
        )

    # Size by the largest index rather than len(vocab) so a vocabulary with
    # gaps in its numbering cannot index past the last row.
    num_rows = max(vocab.values()) + 1 if vocab else 0
    embedding_matrix = np.zeros((num_rows, dim))

    # Both objects expose randn(); the module-level one keeps the old default behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Fill the embedding matrix with pre-trained GloVe vectors
    for word, idx in tqdm(vocab.items(), desc="Aligning Embeddings", unit="word"):
        if word in glove_vectors:  # Check if the word exists in GloVe
            embedding_matrix[idx] = glove_vectors[word]  # Use the pre-trained vector
        else:
            embedding_matrix[idx] = rng.randn(dim)  # Random vector for out-of-vocab words

    return embedding_matrix

In [9]:
# Step 5: Save the Embeddings to vectors.txt
def save_embeddings(vocab, embedding_matrix, output_file):
    """
    Write the embeddings to a UTF-8 text file, one word per line.

    Every line is the word followed by the components of row ``vocab[word]``
    of ``embedding_matrix``, each formatted with six decimals and separated
    by single spaces. Lines follow the iteration order of ``vocab``.
    """
    def _render_line(word, row_index):
        # One "word v1 v2 ..." record, newline-terminated
        components = embedding_matrix[row_index].tolist()
        vector_str = ' '.join(f"{val:.6f}" for val in components)
        return f"{word} {vector_str}\n"

    with open(output_file, 'w', encoding='utf-8') as handle:
        for word, row_index in vocab.items():
            handle.write(_render_line(word, row_index))

In [10]:
# Main Function to Execute the Entire Workflow
def main(csv_file, glove_file, output_file, embedding_dim=100, test_size=0.25):
    """
    Run the whole pipeline: tokenize, split, build vocabulary, align, save.

    Args:
        csv_file: CSV passed to ``preprocess_csv``.
        glove_file: Pre-trained vector file passed to ``create_glove_embeddings``.
        output_file: Destination passed to ``save_embeddings``.
        embedding_dim: Forwarded to ``create_glove_embeddings`` as ``dim``.
        test_size: Forwarded to ``split_data``.

    Returns:
        The ``(train_sentences, val_sentences)`` pair produced by ``split_data``.
    """
    print("Step 1: Preprocessing CSV and Tokenizing Content...")
    tokenized_rows = preprocess_csv(csv_file)

    print("Step 2: Splitting into Training and Validation sets...")
    train_sentences, val_sentences = split_data(tokenized_rows, test_size=test_size)

    # The vocabulary is built from the training split only
    print("Step 3: Building Vocabulary from Training Data...")
    vocab = build_vocab(train_sentences)

    print(f"Step 4: Creating {embedding_dim}-dimensional GloVe Embeddings using gensim...")
    embedding_matrix = create_glove_embeddings(vocab, glove_file, dim=embedding_dim)

    print(f"Step 5: Saving Embeddings to {output_file}...")
    save_embeddings(vocab, embedding_matrix, output_file)

    # Sanity check: show the first five validation sentences next to their
    # vocabulary indices (words missing from the vocabulary map to "<unk>").
    print("\nValidation Sample Embedding Check:")
    for sample in val_sentences[:5]:
        print("Sentence:", sample)
        sample_indices = []
        for token in sample:
            sample_indices.append(vocab.get(token, vocab["<unk>"]))
        print("Indices:", sample_indices)

    print("GloVe Training, Splitting, and Saving Complete!")

    return train_sentences, val_sentences

# Input and output locations on the mounted Google Drive
csv_file = "/content/drive/MyDrive/summarization/800rows_training.csv"  # CSV with the 'content' column
glove_file = "/content/drive/MyDrive/summarization/glove.6B.100d.txt"  # Pre-trained GloVe vectors
output_file = "/content/drive/MyDrive/summarization/600rows100d_training.txt"  # Where the aligned embeddings are written

# Run the pipeline only when the GloVe file is actually present
if os.path.isfile(glove_file):
    # Keep both splits at notebook level; the diagnostic cells below read them
    train_sentences, val_sentences = main(csv_file, glove_file, output_file, embedding_dim=100, test_size=0.25)
else:
    print(f"GloVe file '{glove_file}' not found. Please download and place it in the same directory.")

Step 1: Preprocessing CSV and Tokenizing Content...


Tokenizing CSV: 100%|██████████| 800/800 [00:14<00:00, 55.40row/s]


Step 2: Splitting into Training and Validation sets...
Step 3: Building Vocabulary from Training Data...


Building Vocabulary: 100%|██████████| 600/600 [00:00<00:00, 1151.38sentence/s]


Step 4: Creating 100-dimensional GloVe Embeddings using gensim...


Aligning Embeddings: 100%|██████████| 29480/29480 [00:00<00:00, 180642.91word/s]


Step 5: Saving Embeddings to /content/drive/MyDrive/summarization/600rows100d_training.txt...

Validation Sample Embedding Check:
Sentence: ['binance', 'terms', 'of', 'uselast', 'revised', '13', 'january', '2021', 'these', 'binance', 'terms', 'of', 'use', 'is', 'entered', 'into', 'between', 'you', 'hereinafter', 'referred', 'to', 'as', 'you', 'or', 'your', 'and', 'binance', 'operators', 'as', 'defined', 'below', 'by', 'accessing', 'downloading', 'using', 'or', 'clicking', 'on', 'i', 'agree', 'to', 'accept', 'any', 'binance', 'services', 'as', 'defined', 'below', 'provided', 'by', 'binance', 'as', 'defined', 'below', 'you', 'agree', 'that', 'you', 'have', 'read', 'understood', 'and', 'accepted', 'all', 'of', 'the', 'terms', 'and', 'conditions', 'stipulated', 'in', 'these', 'terms', 'of', 'use', 'hereinafter', 'referred', 'to', 'as', 'these', 'terms', 'as', 'well', 'as', 'our', 'privacy', 'policy', 'at', 'nbsp', 'httpswwwbinancecomenprivacyin', 'addition', 'when', 'using', 'some', 'featu

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Example: loading GloVe embeddings and creating vocab
def load_glove_embeddings(file_path):
    """
    Load word vectors from a text file of ``word v1 v2 ...`` lines.

    Args:
        file_path: Path to a UTF-8 text file with one whitespace-separated
            record per line. Blank lines are skipped.

    Returns:
        A ``(vocab, embedding_matrix)`` tuple. ``vocab`` maps each word to its
        row in ``embedding_matrix`` (the last occurrence wins when a word is
        repeated). ``vocab["<unk>"]`` always points at an extra all-zero row
        appended after the file's rows, even if the file has its own
        "<unk>" line.

    Raises:
        ValueError: If the file contains no vectors.
    """
    vocab = {}
    vectors = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # Tolerate blank lines (e.g. a trailing newline at end of file)
            # Number rows by how many vectors were stored, so skipped lines leave no gaps
            vocab[values[0]] = len(vectors)
            vectors.append(np.array(values[1:], dtype='float32'))

    if not vectors:
        raise ValueError(f"No embeddings found in '{file_path}'.")

    # Convert the list to a NumPy array
    embedding_matrix = np.array(vectors)

    # Take the index from the row count, not len(vocab): with repeated words
    # len(vocab) is smaller than the row count and would point at a real row.
    vocab["<unk>"] = len(vectors)  # Add unknown token
    embedding_matrix = np.vstack([embedding_matrix, np.zeros((1, embedding_matrix.shape[1]))])  # Add zero vector for <unk>

    return vocab, embedding_matrix

# Load embeddings from a text file; this defines the notebook-level `vocab` and
# `embedding_matrix` that the cells below use.
# NOTE(review): this file name differs from the `output_file` written by main() above - confirm it is the intended file.
file_path = '/content/drive/MyDrive/summarization/simple-720rows_training.txt'  # Text file of "word v1 v2 ..." lines; update to your own path
vocab, embedding_matrix = load_glove_embeddings(file_path)

def check_cosine_similarity(word1, word2, vocab, embedding_matrix):
    """
    Return the cosine similarity of the embedding rows of two words.

    A word that is missing from ``vocab`` falls back to the "<unk>" row.
    ``vocab`` must contain "<unk>": the fallback index is looked up on every
    call, whether or not it ends up being used.
    """
    # Resolve both words to row indices first, then pull the rows as 1 x dim matrices
    row_indices = [vocab.get(token, vocab["<unk>"]) for token in (word1, word2)]
    first, second = (embedding_matrix[row].reshape(1, -1) for row in row_indices)

    # cosine_similarity returns a 1 x 1 matrix here; unwrap the single value
    return cosine_similarity(first, second)[0][0]

# Example usage: compare two related words using the embeddings loaded above
word1 = 'obligated'
word2 = 'obligation'
similarity = check_cosine_similarity(word1, word2, vocab, embedding_matrix)
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")


Cosine similarity between 'obligated' and 'obligation': 0.6690


In [12]:
def check_vocab_coverage(sentences, vocab):
    """
    Compute the percentage of tokens in ``sentences`` that are keys of ``vocab``.

    Args:
        sentences: Iterable of token lists.
        vocab: Container supporting ``in`` (e.g. the word-to-index dict).

    Returns:
        Coverage as a percentage between 0 and 100. Returns 0.0 when there
        are no tokens at all, instead of dividing by zero.
    """
    total_words = 0
    known_words = 0

    for sentence in sentences:
        total_words += len(sentence)
        known_words += sum(1 for word in sentence if word in vocab)

    if total_words == 0:
        return 0.0  # Nothing to measure: an empty split or only empty sentences

    return known_words / total_words * 100

# Example usage: coverage of the validation split by the notebook-level `vocab`.
# NOTE(review): `vocab` here is the one returned by load_glove_embeddings() in the
# cell above, not the vocabulary built inside main() - confirm that is intended.
coverage = check_vocab_coverage(val_sentences, vocab)
print(f"Vocabulary coverage for validation set: {coverage:.2f}%")


Vocabulary coverage for validation set: 98.92%


In [13]:
def compare_word_distributions(train_sentences, val_sentences):
    """
    Print the ten most frequent words of the training and validation sets.

    Both arguments are iterables of token lists. Nothing is returned; the two
    rankings are written to standard output as lists of ``(word, count)``.
    """
    # Tally both splits before printing anything
    train_counter = Counter(token for tokens in train_sentences for token in tokens)
    val_counter = Counter(token for tokens in val_sentences for token in tokens)

    top_train = train_counter.most_common(10)
    top_val = val_counter.most_common(10)

    print("Most common words in Training Set:")
    print(top_train)

    print("\nMost common words in Validation Set:")
    print(top_val)

# Example usage: print the ten most frequent tokens of each split returned by main()
compare_word_distributions(train_sentences, val_sentences)


Most common words in Training Set:
[('the', 93383), ('or', 70241), ('to', 63098), ('of', 60362), ('and', 54568), ('you', 39620), ('any', 39339), ('in', 25941), ('your', 22334), ('that', 20451)]

Most common words in Validation Set:
[('the', 31991), ('or', 23517), ('of', 20978), ('to', 20781), ('and', 19059), ('any', 13233), ('you', 12760), ('in', 8857), ('your', 7312), ('that', 6898)]
