In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from gensim.models import FastText

# Load your dataset
# Replace 'IJJD dataset.xlsx' with the path to your own Excel workbook
# (the file is read with pd.read_excel, so it must be an Excel file, not a CSV).
dataset = pd.read_excel('IJJD dataset.xlsx')  # Ensure your dataset has a column named 'texts'
texts = dataset['texts'].tolist()  # Extract texts as a list

# Initialize an empty DataFrame to store embeddings
# (this name is rebound to the combined embeddings once they are generated).
embeddings_df = pd.DataFrame()

# 1. FastText Embeddings
def get_fasttext_embeddings(texts):
    """Return one mean-pooled FastText sentence vector per input text.

    Each text is split on whitespace, every token is looked up in the
    FastText model, and the token vectors are averaged. Texts that yield no
    usable token (empty strings, or non-string cells such as the NaN/None
    values pandas produces for blank cells) get an all-zero vector so the
    output stays row-aligned with ``texts``.

    Args:
        texts: Iterable of texts to embed.

    Returns:
        A 2-D ``np.ndarray`` with one row per text and
        ``fasttext_model.vector_size`` columns. An empty input yields an
        array of shape ``(0, vector_size)`` so callers can still read
        ``.shape[1]``.
    """
    print("Generating FastText embeddings...")
    # Load pre-trained FastText model for Indic languages
    # NOTE(review): FastText.load reads models saved by gensim itself; if this
    # file is a native Facebook fastText .bin it presumably needs
    # gensim.models.fasttext.load_facebook_model instead -- confirm.
    fasttext_model = FastText.load('path_to_fasttext_model.bin')  # Update path with your FastText model file

    # Hoist loop-invariant attribute lookups.
    word_vectors_index = fasttext_model.wv
    vector_size = fasttext_model.vector_size

    embeddings = []
    for text in texts:
        # Blank spreadsheet cells are not strings and have no .split();
        # treat them as texts without tokens instead of crashing.
        tokens = text.split() if isinstance(text, str) else []
        # NOTE(review): depending on the gensim version, this membership test
        # may be true for every word because FastText can build vectors from
        # character n-grams -- verify if true OOV filtering is required.
        word_vectors = [word_vectors_index[word] for word in tokens if word in word_vectors_index]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))  # Mean pooling for sentence embedding
        else:
            embeddings.append(np.zeros(vector_size))  # No usable tokens
    if not embeddings:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, vector_size), dtype=np.float32)
    return np.array(embeddings)

# 2. IndicBERT or MuRIL Embeddings
def get_transformer_embeddings(texts, model_name):
    """Return one mean-pooled transformer sentence vector per input text.

    The tokenizer and model named by ``model_name`` are loaded with
    ``AutoTokenizer`` / ``AutoModel``. Each text is encoded on its own
    (truncated to 128 tokens) and the final hidden states are averaged over
    the token dimension.

    Args:
        texts: Iterable of texts to embed. Non-string entries (for example
            the NaN/None values pandas produces for blank cells) are embedded
            as an empty string so the output stays row-aligned with
            ``texts``.
        model_name: Model identifier or path passed to ``from_pretrained``.

    Returns:
        A 2-D ``np.ndarray`` with one row per text. An empty input yields an
        array of shape ``(0, hidden_size)`` so callers can still read
        ``.shape[1]``.
    """
    print(f"Generating embeddings using {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()  # Explicit inference mode: no dropout while embedding

    embeddings = []
    # Gradients are never needed here, so disable them for the whole loop.
    with torch.no_grad():
        for text in texts:
            if not isinstance(text, str):
                # The tokenizer rejects non-string input; fall back to an
                # empty text rather than aborting the whole run.
                text = ""
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=128)
            outputs = model(**inputs)
            # One sequence per call, so dim 0 has size 1 and is dropped after
            # averaging over the token dimension (dim 1).
            sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()  # Mean pooling
            embeddings.append(sentence_embedding)
    if not embeddings:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

# Build one embedding table per model. Each table has one row per text and
# one column per embedding dimension, named "<model>_dim_<i>".

# FastText
fasttext_embeddings = get_fasttext_embeddings(texts)
fasttext_df = pd.DataFrame(
    data=fasttext_embeddings,
    columns=[f'fasttext_dim_{i}' for i in range(fasttext_embeddings.shape[1])],
)

# IndicBERT
indicbert_embeddings = get_transformer_embeddings(texts, 'ai4bharat/indic-bert')
indicbert_df = pd.DataFrame(
    data=indicbert_embeddings,
    columns=[f'indicbert_dim_{i}' for i in range(indicbert_embeddings.shape[1])],
)

# MuRIL
muril_embeddings = get_transformer_embeddings(texts, 'google/muril-base-cased')
muril_df = pd.DataFrame(
    data=muril_embeddings,
    columns=[f'muril_dim_{i}' for i in range(muril_embeddings.shape[1])],
)

# Place the three tables side by side so every row holds all embeddings of one text.
embeddings_df = pd.concat(
    [fasttext_df, indicbert_df, muril_df],
    axis=1,
)

# Write the combined table to disk without the row index.
output_file = 'text_embeddings.csv'
embeddings_df.to_csv(output_file, index=False)
print(f"Embeddings saved to {output_file}")


ImportError: huggingface-hub>=0.19.3,<1.0 is required for a normal functioning of this module, but found huggingface-hub==0.17.0.
Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main