In [None]:
!pip install transformers
!pip install torch
!pip install pandas

In [None]:
pip install together



In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
# Read the cleaned ML-papers table; later cells read its 'summary' column.
data = pd.read_csv(filepath_or_buffer='/content/cleaned_ml_papers.csv')

In [None]:
# Fetch the pre-trained 'bert-base-uncased' checkpoint: the tokenizer first,
# then the encoder weights (same load order as two separate assignments).
tokenizer, model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)

# generate BERT embeddings for each summary
def generate_bert_embeddings(text):
    """Encode ``text`` with BERT and mean-pool the token states into one vector.

    Args:
        text: A string, or a list of strings, accepted by the module-level
            ``tokenizer``. Inputs longer than 512 tokens are truncated.

    Returns:
        A tensor holding one pooled embedding per input text (token axis,
        dim 1 of ``last_hidden_state``, averaged away).

    The average is weighted by the attention mask so that padding positions,
    which appear when a list of texts of different lengths is passed, do not
    dilute the embedding. For a single string there is no padding, so the
    result matches a plain mean over the token axis (up to float rounding).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state
    # 1.0 for real tokens, 0.0 for padding; trailing axis added for broadcasting.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    # clamp guards against division by zero should a row contain no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = summed / token_counts
    return embeddings

# extract top N keywords using TF-IDF ranking with n-grams
def extract_keywords_using_tfidf(text, num_keywords=6, ngram_range=(1, 3)):
    """Return the top-scoring n-grams of ``text`` as a ' | '-separated string.

    Args:
        text: The document to extract keywords from.
        num_keywords: Maximum number of keywords to return.
        ngram_range: ``(min_n, max_n)`` n-gram sizes passed to ``TfidfVectorizer``.

    Returns:
        Up to ``num_keywords`` n-grams joined by ' | ', highest score first.
        An empty string when ``text`` is not a string (e.g. the NaN pandas
        produces for a missing cell), is blank, or contains nothing but
        stop words / non-word characters.

    Note:
        The vectorizer is fitted on this single document, so every term gets
        the same IDF weight and the ranking effectively reflects in-document
        term frequency rather than corpus-level distinctiveness.
    """
    # Missing or blank summaries would otherwise crash inside the vectorizer.
    if not isinstance(text, str) or not text.strip():
        return ''

    tfidf = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    try:
        tfidf_matrix = tfidf.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no term
        # survives tokenisation and stop-word removal: nothing to extract.
        return ''

    feature_names = np.array(tfidf.get_feature_names_out())
    scores = tfidf_matrix.toarray().ravel()  # single document -> one score per term
    ranked = np.argsort(scores)[::-1]  # highest score first
    top_keywords = feature_names[ranked][:num_keywords]
    return ' | '.join(top_keywords)

# Build one pipe-separated keyword string per paper summary.
# The keywords come from TF-IDF ranking only. A BERT forward pass used to run
# on every row here, but its embeddings were never used, so it only cost time
# without affecting the saved keywords.
keywords_list = [extract_keywords_using_tfidf(summary) for summary in data['summary']]

# Column name kept unchanged so existing consumers of the CSV keep working,
# even though the values are TF-IDF derived.
data['keywords_using_bert'] = keywords_list

data.to_csv('/content/cleaned_ml_papers_with_bert.csv', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import os
from getpass import getpass

import pandas as pd
from together import Together

data = pd.read_csv('/content/cleaned_ml_papers_with_bert.csv')

# initialize Together API client
# The key is taken from the TOGETHER_API_KEY environment variable, falling back
# to a no-echo prompt, so a real key is never typed into (and saved with) the
# notebook source.
client = Together(api_key=os.environ.get("TOGETHER_API_KEY") or getpass("Together API key: "))

# get keywords using Together AI API
def get_keywords_from_together(summary):
    """Ask the Together-hosted Llama model for up to 6 keywords of ``summary``.

    Args:
        summary: The text to extract keywords from.

    Returns:
        The model's reply with surrounding whitespace stripped (the prompt
        requests a pipe-separated keyword list). An empty string, without
        calling the API, when ``summary`` is not a string or is blank.
    """
    # Missing CSV cells reach us as NaN floats; formatting them into the prompt
    # would spend an API call asking for keywords of the text "nan".
    if not isinstance(summary, str) or not summary.strip():
        return ""

    # prompt for keyword extraction
    prompt = f"Extract only up to 6 keywords from the following summary in a pipe-separated format, with no additional text: '{summary}'"

    # send the request to the model for keyword extraction
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    # process the streamed response to get the keywords
    pieces = []
    for chunk in stream:
        # Defensive: a stream event may arrive with an empty `choices` list
        # (presumably e.g. a trailing usage-only event); skip it rather than
        # fail with IndexError.
        if not chunk.choices:
            continue
        pieces.append(chunk.choices[0].delta.content or "")

    return "".join(pieces).strip()

# One API request per summary, in row order; results become a new column.
data['keywords_using_togetherai'] = [
    get_keywords_from_together(summary_text) for summary_text in data['summary']
]

# Persist the enriched table, then confirm on stdout.
data.to_csv('/content/cleaned_ml_papers_with_bert_and_togetherai.csv', index=False)

print("Keywords successfully added and file saved as 'cleaned_ml_papers_with_bert_and_togetherai.csv'")

Keywords successfully added and file saved as 'cleaned_ml_papers_with_bert_and_togetherai.csv'
