AutoTokenizer

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

def get_embeddings(texts, model_name='bert-base-uncased'):
    """Embed each text with a Hugging Face encoder without truncating long inputs.

    Every text is tokenized in full, its token ids are cut into consecutive
    windows that fit the model, each window is run through the model and
    mean-pooled over its tokens, and the window vectors are averaged
    (unweighted) into a single vector for that text.

    Args:
        texts: Iterable of strings to embed.
        model_name: Checkpoint name handed to ``AutoTokenizer`` and ``AutoModel``.

    Returns:
        numpy.ndarray with one row per input text. An empty ``texts`` yields an
        array with zero rows instead of raising inside ``np.vstack``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model_config = getattr(model, 'config', None)

    # The tokenizer reports a very large sentinel when the checkpoint declares
    # no limit, so the window is capped at the model's positional limit
    # whenever that limit is the smaller of the two.
    max_length = tokenizer.model_max_length
    position_limit = getattr(model_config, 'max_position_embeddings', None)
    if isinstance(position_limit, int) and 0 < position_limit < max_length:
        max_length = position_limit
    print("Model Max length", max_length)

    all_embeddings = []

    for text in texts:
        # truncation=False keeps every token; the windows are cut manually below.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=False)
        input_ids = inputs['input_ids'][0]

        chunk_embeddings = []
        for start in range(0, len(input_ids), max_length):
            chunk = input_ids[start:start + max_length]
            with torch.no_grad():
                outputs = model(input_ids=chunk.unsqueeze(0))
            # Mean over the token axis gives one vector per window.
            chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())

        # Every window counts equally, regardless of how many tokens it holds.
        all_embeddings.append(np.mean(chunk_embeddings, axis=0))

    if not all_embeddings:
        hidden_size = getattr(model_config, 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.vstack(all_embeddings)

if __name__ == "__main__":
    # Example usage: embed a single short sentence and report the array shape.
    # `texts` and `embeddings` are module-level names, so they stay defined in
    # the kernel after this cell runs.
    texts = ["This is a long text that needs to be tokenized and embedded without truncation."]
    embeddings = get_embeddings(texts)
    print("Embeddings shape:", embeddings.shape)


  from .autonotebook import tqdm as notebook_tqdm


Model Max length 512
Embeddings shape: (1, 768)


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

def get_embeddings(texts, model_name='bert-base-uncased'):
    """Embed each text without truncation and also return its decoded token text.

    Every text is tokenized in full, its token ids are cut into consecutive
    windows that fit the model, each window is run through the model and
    mean-pooled over its tokens, and the window vectors are averaged
    (unweighted) into a single vector for that text. Each window is also
    decoded back to a string (special tokens skipped) and the window strings
    are joined with a single space.

    Args:
        texts: Iterable of strings to embed.
        model_name: Checkpoint name handed to ``AutoTokenizer`` and ``AutoModel``.

    Returns:
        Tuple ``(embeddings, decoded_texts)``: a numpy.ndarray with one row per
        input text, and a list with one decoded string per input text. An empty
        ``texts`` yields a zero-row array and an empty list instead of raising
        inside ``np.vstack``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model_config = getattr(model, 'config', None)

    # The tokenizer reports a very large sentinel when the checkpoint declares
    # no limit, so the window is capped at the model's positional limit
    # whenever that limit is the smaller of the two.
    max_length = tokenizer.model_max_length
    position_limit = getattr(model_config, 'max_position_embeddings', None)
    if isinstance(position_limit, int) and 0 < position_limit < max_length:
        max_length = position_limit

    all_embeddings = []
    all_decoded_texts = []
    print("Model Max length", max_length)

    for text in texts:
        # truncation=False keeps every token; the windows are cut manually below.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=False)
        input_ids = inputs['input_ids'][0]

        chunk_embeddings = []
        chunk_texts = []

        for start in range(0, len(input_ids), max_length):
            chunk = input_ids[start:start + max_length]
            with torch.no_grad():
                outputs = model(input_ids=chunk.unsqueeze(0))
            # Mean over the token axis gives one vector per window.
            chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())

            # Decode the window back to text, dropping special tokens.
            chunk_texts.append(tokenizer.decode(chunk, skip_special_tokens=True))

        # Every window counts equally, regardless of how many tokens it holds.
        all_embeddings.append(np.mean(chunk_embeddings, axis=0))
        # NOTE(review): joining with a space may split a word whose pieces fall
        # on both sides of a window boundary - confirm this is acceptable.
        all_decoded_texts.append(' '.join(chunk_texts))

    if not all_embeddings:
        hidden_size = getattr(model_config, 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32), all_decoded_texts
    return np.vstack(all_embeddings), all_decoded_texts

if __name__ == "__main__":
    # Example usage: embed one sentence with the two-value get_embeddings and
    # show both the array shape and the text recovered from the token ids.
    # `texts`, `embeddings` and `decoded_texts` are module-level names, so they
    # stay defined in the kernel after this cell runs.
    texts = ["This is a long text that needs to be tokenized and embedded without truncation."]
    embeddings, decoded_texts = get_embeddings(texts)
    print("Embeddings shape:", embeddings.shape)
    print("Decoded text:", decoded_texts)


Model Max length 512
Embeddings shape: (1, 768)
Decoded text: ['this is a long text that needs to be tokenized and embedded without truncation.']


Encoding Function

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

def encode_text(texts, model_name='bert-base-uncased'):
    """Embed each text without truncation and return the token ids that were used.

    Every text is tokenized in full, its token ids are cut into consecutive
    windows that fit the model, each window is run through the model and
    mean-pooled over its tokens, and the window vectors are averaged
    (unweighted) into a single vector for that text.

    Args:
        texts: Iterable of strings to embed.
        model_name: Checkpoint name handed to ``AutoTokenizer`` and ``AutoModel``.

    Returns:
        Tuple ``(embeddings, token_ids)``: a numpy.ndarray with one row per
        input text, and a flat list holding one numpy array of token ids per
        window. Windows of all texts share that one list, so text boundaries
        are not recorded in it. An empty ``texts`` yields a zero-row array and
        an empty list instead of raising inside ``np.vstack``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model_config = getattr(model, 'config', None)

    # The tokenizer reports a very large sentinel when the checkpoint declares
    # no limit, so the window is capped at the model's positional limit
    # whenever that limit is the smaller of the two.
    max_length = tokenizer.model_max_length
    position_limit = getattr(model_config, 'max_position_embeddings', None)
    if isinstance(position_limit, int) and 0 < position_limit < max_length:
        max_length = position_limit

    all_embeddings = []
    all_token_ids = []

    print("Model Max length:", max_length)

    for text in texts:
        # truncation=False keeps every token; the windows are cut manually below.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=False)
        input_ids = inputs['input_ids'][0]

        # Consecutive windows of at most max_length token ids.
        chunks = [input_ids[start:start + max_length]
                  for start in range(0, len(input_ids), max_length)]
        print(chunks)

        chunk_embeddings = []

        for chunk in chunks:
            with torch.no_grad():
                outputs = model(input_ids=chunk.unsqueeze(0))
            # Mean over the token axis gives one vector per window.
            chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())

            all_token_ids.append(chunk.numpy())

        # Every window counts equally, regardless of how many tokens it holds.
        all_embeddings.append(np.mean(chunk_embeddings, axis=0))

    if not all_embeddings:
        hidden_size = getattr(model_config, 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32), all_token_ids
    return np.vstack(all_embeddings), all_token_ids

if __name__ == "__main__":
    # Example usage: a single multi-line string (one sentence followed by
    # several titles separated by blank / whitespace-only lines) is encoded.
    texts = ["""This is a long text that needs to be tokenized and embedded without truncation.     EvalAI: Towards Better Evaluation Systems for AI Agents
      

       Fabrik: An Online Collaborative Neural Network Editor
      

       Do explanation modalities make VQA models more predictable to a human?
      

       Evaluating Visual Conversational Agents via Cooperative Human-AI Games
      

       It Takes Two to Tango: Towards Theory of AI's Mind"""]
    # `token_ids` stays defined at module level; the decoding cell further down
    # passes it to decode_text.
    embeddings, token_ids = encode_text(texts)
    print("Embeddings shape:", embeddings.shape)
    # Prints the complete embedding array, so this cell's output is long.
    print("Embeddings:", embeddings)
    print("Token IDs", token_ids)


Model Max length: 512
[tensor([  101,  2023,  2003,  1037,  2146,  3793,  2008,  3791,  2000,  2022,
        19204,  3550,  1998, 11157,  2302, 19817,  4609, 10719,  1012,  9345,
        19771,  1024,  2875,  2488,  9312,  3001,  2005,  9932,  6074,  6904,
        23736,  2243,  1024,  2019,  3784, 12317, 15756,  2897,  3559,  2079,
         7526, 16913, 11475,  7368,  2191,  1058, 19062,  4275,  2062, 21425,
         2000,  1037,  2529,  1029, 23208,  5107,  4512,  2389,  6074,  3081,
        10791,  2529,  1011,  9932,  2399,  2009,  3138,  2048,  2000, 17609,
         1024,  2875,  3399,  1997,  9932,  1005,  1055,  2568,   102])]
Embeddings shape: (1, 768)
Embeddings: [[-2.89555043e-01  5.51846884e-02  1.08932123e-01 -4.20793444e-02
   1.14392862e-01  5.98948859e-02  2.37213328e-01  3.37278917e-02
   6.56338260e-02 -2.67120004e-01  6.07445017e-02 -5.81723712e-02
  -2.59414941e-01 -3.88526917e-02  9.47184116e-02  4.02795434e-01
   5.88367647e-03 -1.34157985e-02 -2.71942079e-01  2.05

Decoding Function

In [10]:
from transformers import AutoTokenizer

def decode_text(token_ids, model_name='bert-base-uncased'):
    """Turn windows of token ids back into one space-joined string.

    Args:
        token_ids: Iterable of token-id sequences, one per window.
        model_name: Checkpoint name used to load the tokenizer.

    Returns:
        str: Each window decoded with special tokens skipped, then all windows
        joined with a single space (an empty string when there are no windows).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pieces = [tokenizer.decode(ids, skip_special_tokens=True) for ids in token_ids]
    return ' '.join(pieces)


# `token_ids` is not defined in this cell: it must already exist in the kernel
# from running the encoding example cell (the one that calls encode_text).
decoded_text = decode_text(token_ids)
print("decoded tezt: \n", decoded_text)

decoded tezt: 
 this is a long text that needs to be tokenized and embedded without truncation. evalai : towards better evaluation systems for ai agents fabrik : an online collaborative neural network editor do explanation modalities make vqa models more predictable to a human? evaluating visual conversational agents via cooperative human - ai games it takes two to tango : towards theory of ai's mind


# Sentence Transformer

In [11]:
from sentence_transformers import SentenceTransformer

def encode_texts(texts, model_name='all-MiniLM-L6-v2'):
    """
    Embed texts with a SentenceTransformer model loaded by name.

    A new model object is constructed on every call.

    Args:
        texts (list of str): The texts to encode.
        model_name (str): Model name handed to ``SentenceTransformer``.

    Returns:
        The result of ``model.encode(texts, convert_to_tensor=True)``; the
        embeddings are requested as a tensor, not as a NumPy array.
    """
    return SentenceTransformer(model_name).encode(texts, convert_to_tensor=True)


In [17]:
from sentence_transformers import SentenceTransformer
# Two inputs: a short sentence, and the same phrase repeated 100 times with no
# separator between repetitions.
# NOTE(review): presumably the second entry probes how the model copes with
# input longer than its sequence limit - confirm.
sentences = ["This is an example sentence", "Each sentence is converted"*100]

# `model` and `embeddings` are module-level names; the next cell reads `embeddings`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
# One embedding is expected per input sentence.
print(len(embeddings))


2


In [18]:
# Print the length of each embedding held in `embeddings`, which must already
# be defined in the kernel by the previous cell.
for vector in embeddings:
    print(len(vector))

384
384


In [None]:
# text_chunking.py
import os
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter

# Extract text (You can integrate your HTML/PDF extraction logic here)
def extract_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    # Placeholder reader: the file is taken as plain text and returned as-is.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Chunk text using LangChain
def chunk_text(text, chunk_size=1000, chunk_overlap=200, separator=None):
    """Split ``text`` into overlapping chunks with LangChain's CharacterTextSplitter.

    Args:
        text: The string to split.
        chunk_size: Target maximum chunk size, forwarded to the splitter.
        chunk_overlap: Overlap between neighbouring chunks, forwarded to the splitter.
        separator: Optional separator to split on. When left as ``None`` the
            argument is not passed, so the splitter's own default applies.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    if separator is not None:
        splitter_options["separator"] = separator
    text_splitter = CharacterTextSplitter(**splitter_options)
    return text_splitter.split_text(text)

# def chunk_text(text, chunk_size=1000, chunk_overlap=200):
#     chunks = []
#     for i in range(0, len(text), chunk_size - chunk_overlap):
#         chunks.append(text[i:i + chunk_size])
#     return chunks

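# Embeddings come from get_embeddings, which is not defined in this cell: the
# two-value version (embeddings, decoded texts) from an earlier cell must
# already have been run in the kernel.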

# Save chunks and embeddings to CSV
def save_to_csv(file_path, chunks, embeddings):
    """Write chunks and their embeddings to a CSV file, one row per chunk.

    Args:
        file_path: Destination path of the CSV file.
        chunks: Sequence of chunk strings; its length sets the number of rows.
        embeddings: Iterable of vectors exposing ``tolist()``, one per chunk.

    The file gets the columns ``Index`` (0-based position), ``Text`` and
    ``Embeddings`` (each vector converted to a Python list), without the
    DataFrame index.
    """
    serialised_vectors = []
    for vector in embeddings:
        serialised_vectors.append(vector.tolist())

    table = pd.DataFrame({
        "Index": [*range(len(chunks))],
        "Text": chunks,
        "Embeddings": serialised_vectors,
    })
    table.to_csv(file_path, index=False)

if __name__ == "__main__":
    # Path to files
    file_paths = ["path/to/your/file1.txt", "path/to/your/file2.txt"]

    # Every input file feeds the same CSV, so results are accumulated across
    # iterations; saving only the current file's chunks would overwrite the
    # rows written for the files processed before it.
    all_chunks = []
    all_embeddings = []

    for file_path in file_paths:
        text = extract_text(file_path)
        chunks = chunk_text(text)
        if not chunks:
            # Nothing to embed for this file (e.g. it is empty); move on.
            continue
        # get_embeddings must be the two-value version (embeddings, decoded
        # texts) from an earlier cell; this cell does not define it itself.
        embeddings, decoded_texts = get_embeddings(chunks)
        all_chunks.extend(chunks)
        all_embeddings.extend(embeddings)  # iterating the array yields one row per chunk
        save_to_csv("chunks_and_embeddings.csv", all_chunks, all_embeddings)
        print(f"Chunks and embeddings from {file_path} saved to CSV")


In [1]:
# Summary profile text processing


In [16]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter

class TextProcessor:
    """Chunk summary text files, embed the chunks, and store both in a CSV file.

    Args:
        model_name: Name handed to ``SentenceTransformer``.
        chunk_size: Chunk size forwarded to ``CharacterTextSplitter``.
        chunk_overlap: Chunk overlap forwarded to ``CharacterTextSplitter``.
        separator: Separator forwarded to ``CharacterTextSplitter``.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', chunk_size=300,
                 chunk_overlap=100, separator="\n"):
        self.model = SentenceTransformer(model_name)
        # Built here rather than inside process_directory so that
        # process_and_save_text also works on a freshly created instance.
        self.text_splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   separator=separator)

    def extract_text(self, file_path):
        """Return the full contents of the UTF-8 text file at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        return text

    def encode_texts(self, texts):
        """Embed ``texts`` with the loaded model, requesting a tensor result."""
        embeddings = self.model.encode(texts, convert_to_tensor=True)
        return embeddings

    def save_to_csv(self, file_path, texts, embeddings):
        """Write one CSV row per text with columns Index, Text and Embeddings.

        Each item of ``embeddings`` must expose ``tolist()``.
        """
        data = {
            "Index": list(range(len(texts))),
            "Text": texts,
            "Embeddings": [emb.tolist() for emb in embeddings]
        }
        df = pd.DataFrame(data)
        df.to_csv(file_path, index=False)

    def process_and_save_text(self, master_text, output_csv):
        """Split ``master_text`` into chunks, embed them, and save them to ``output_csv``."""
        chunks = self.text_splitter.split_text(master_text)
        # With no chunks there is nothing to embed; the model is skipped and a
        # header-only CSV is written.
        embeddings = self.encode_texts(chunks) if chunks else []
        self.save_to_csv(output_csv, chunks, embeddings)
        print(f"Chunks and embeddings saved to {output_csv}")

    def process_directory(self, directory, output_csv):
        """Collect every ``*summary.txt`` under ``directory`` and process the combined text.

        Files whose name ends with ``summary.txt`` are read, each wrapped as
        ``'\\n <text> \\n'`` and concatenated; the combined text is then chunked,
        embedded and written to ``output_csv``.

        Returns:
            str: The combined text of all matching files.
        """
        collected = []

        for root, dirs, files in os.walk(directory):
            # Sorting directories (in place, which steers the walk) and file
            # names makes the concatenation order reproducible across runs.
            dirs.sort()
            for file in sorted(files):
                if file.endswith("summary.txt"):
                    file_path = os.path.join(root, file)
                    text = self.extract_text(file_path)
                    print(f"Processing {file_path}...")

                    collected.append(f'\n {text} \n')

        master_text = ''.join(collected)
        self.process_and_save_text(master_text, output_csv)
        print(f"All chunks and embeddings saved to {output_csv}")
        return master_text


In [17]:
# Build the processor, then chunk and embed every *summary.txt found under
# user_dir. The CSV is written inside that same directory.
# `os` is not imported in this cell; it comes from the imports of the cell that
# defines TextProcessor.
textprocessor = TextProcessor()
user_dir = "rohithprofile/"
output_csv = os.path.join(user_dir, "summary_profile.csv")
print("Processing all text files to create chunks and embeddings...")
# The combined text returned by process_directory is not kept.
textprocessor.process_directory(user_dir, output_csv)
print(f"Database is created at {output_csv}")

Processing all text files to create chunks and embeddings...
Processing rohithprofile/user_profile_summary.txt...
Chunks and embeddings saved to rohithprofile/summary_profile.csv
All chunks and embeddings saved to rohithprofile/summary_profile.csv
Database is created at rohithprofile/summary_profile.csv


Environment variable set.
Environment variable value: some_value
