In [1]:
!pip install transformers
!pip install sentence-transformers
!pip install torch
!pip install pandas
!pip install nltk



In [2]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Download the NLTK 'punkt' tokenizer package.
nltk.download('punkt')

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cuda


In [3]:
import pandas as pd
# Load the paper collection, keep one row per PMID, drop rows missing a title
# or an abstract, and build the combined 'Text' column used for embedding.
df = (
    pd.read_csv('collection_with_abstracts.csv')
    .drop_duplicates(subset=['PMID'])
    .dropna(subset=['Title', 'Abstract'])
    .assign(Text=lambda papers: papers['Title'] + ' ' + papers['Abstract'])
)
print(f'Total number of papers after combining and preprocessing: {len(df)}')


Total number of papers after combining and preprocessing: 11237


In [4]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the reference sentences and the
# papers; it is placed on the device selected earlier in the notebook.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device=device,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Natural-language descriptions of the kinds of papers we want to keep.
# Each sentence names one AI approach applied to virology or epidemiology;
# a paper is later compared against all of them via embedding similarity.
reference_sentences = [
    "This paper uses neural network techniques in virology or epidemiology.",
    "This study applies deep learning methods to virology or epidemiology.",
    "This research involves computer vision applications in virology or epidemiology.",
    "This paper focuses on natural language processing in virology or epidemiology.",
    "This study explores generative AI models in virology or epidemiology.",
    "This research uses transformer models in virology or epidemiology.",
    "This paper discusses large language models in virology or epidemiology.",
    "This study examines multimodal models in virology or epidemiology."
]

# Embed the reference sentences once, returned as a tensor on `device`.
ref_embeddings = model.encode(reference_sentences, convert_to_tensor=True, device=device)



In [6]:
# Embed every paper's combined title + abstract text as a tensor on `device`.
paper_texts = df['Text'].tolist()
paper_embeddings = model.encode(paper_texts, convert_to_tensor=True, device=device)

In [7]:
# scikit-learn works on NumPy arrays, so move both embedding tensors to the CPU.
paper_matrix = paper_embeddings.cpu().numpy()
reference_matrix = ref_embeddings.cpu().numpy()

# One row per paper, one column per reference sentence.
similarity_scores = cosine_similarity(paper_matrix, reference_matrix)

# A paper's score is its similarity to the closest reference sentence.
max_similarity_scores = np.max(similarity_scores, axis=1)



In [8]:
# Minimum best-match similarity a paper needs to be kept
# (tune this value based on experimentation).
threshold = 0.55

# Positional indices of the papers that meet the threshold.
is_relevant = max_similarity_scores >= threshold
relevant_indices = np.flatnonzero(is_relevant)

# Select those rows by position and renumber them from zero.
filtered_df = df.iloc[relevant_indices].reset_index(drop=True)
print(f'Number of relevant papers after semantic filtering: {len(filtered_df)}')


Number of relevant papers after semantic filtering: 211


In [9]:
from transformers import pipeline
# Zero-shot classifier used to label each paper's method type; it runs on the
# device selected earlier in the notebook.
classifier = pipeline(
    task="zero-shot-classification",
    model="valhalla/distilbart-mnli-12-3",
    device=device,
)


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [10]:
# Labels scored independently by the zero-shot classifier.
candidate_labels = ["text mining", "computer vision"]


def classify_methods(texts, threshold=0.5):
    """Label each text as 'text mining', 'computer vision', 'both' or 'other'.

    Each text is scored against `candidate_labels` with the module-level
    zero-shot `classifier` in multi-label mode, so the two label scores are
    independent of each other. A label applies when its score is at least
    `threshold`.

    Args:
        texts: A single string or an iterable of strings to classify.
        threshold: Minimum score for a label to count as applicable.
            Defaults to 0.5.

    Returns:
        A list with one classification string per input text, in input order.
        An empty input yields an empty list.
    """
    # Normalize the input so a lone string is treated as a one-item batch.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify; avoid calling the pipeline with an empty batch.
    if not texts:
        return []

    results = classifier(texts, candidate_labels, multi_label=True)

    # Guard against a bare dict being returned for a single input.
    if isinstance(results, dict):
        results = [results]

    classifications = []
    for result in results:
        # Labels come back sorted by score, so look scores up by label name.
        scores = dict(zip(result['labels'], result['scores']))
        is_text_mining = scores.get('text mining', 0) >= threshold
        is_computer_vision = scores.get('computer vision', 0) >= threshold

        if is_text_mining and is_computer_vision:
            classifications.append('both')
        elif is_text_mining:
            classifications.append('text mining')
        elif is_computer_vision:
            classifications.append('computer vision')
        else:
            classifications.append('other')
    return classifications


In [11]:
# Classify the filtered papers in fixed-size chunks, collecting one label per
# paper in the same order as the rows of `filtered_df`.
batch_size = 32
method_classifications = []

for start in range(0, len(filtered_df), batch_size):
    chunk = filtered_df['Text'].iloc[start:start + batch_size]
    method_classifications += classify_methods(chunk.tolist())

filtered_df['Method Classification'] = method_classifications

# Show how many papers fall into each category.
label_counts = filtered_df['Method Classification'].value_counts()
print(label_counts)


Method Classification
both               128
other               53
text mining         20
computer vision     10
Name: count, dtype: int64


In [12]:
# Lower-case keywords searched for in each paper's text to name the methods it
# mentions. Every entry is unique; the comments only group related terms.
deep_learning_methods = [
    # General neural-network terms
    'neural network', 'artificial neural network', 'machine learning model',
    'feedforward neural network', 'multilayer perceptron', 'convolutional neural network',
    'recurrent neural network', 'long short-term memory network', 'cnn', 'grnn', 'rnn', 'lstm',
    'deep learning', 'deep neural network',
    # Transformers, attention and named language models
    'transformer', 'transformer model', 'transformer architecture', 'attention mechanism',
    'self-attention', 'sequence-to-sequence model', 'encoder-decoder',
    'bert', 'gpt', 'gpt-2', 'gpt-3', 'bert model', 'gpt model',
    # Generative and autoencoding architectures
    'generative adversarial network', 'gan', 'variational autoencoder', 'vae', 'autoencoder',
    'deep belief network', 'dbn',
    # Named vision architectures
    'resnet', 'inception', 'mobilenet', 'efficientnet', 'unet', 'yolo',
    'faster r-cnn', 'mask r-cnn', 'ssd', 'capsule network',
    # Reinforcement learning
    'deep reinforcement learning', 'policy gradient', 'q-learning', 'deep q-network', 'dqn',
    # Multimodal, diffusion and language-model terms
    'multimodal model', 'diffusion model', 'transformer-based model', 'pretrained language model',
    'large language model', 'llm', 'state-of-the-art language model',
    # Deep convolutional / residual and graph networks
    'deep convolutional network', 'deep residual network',
    'graph neural network', 'gnn', 'graph convolutional network', 'gcn',
    # Application areas
    'natural language processing', 'nlp', 'computer vision', 'image recognition',
    'speech recognition', 'attention-based neural network',
    # Learning paradigms
    'self-supervised learning', 'unsupervised learning', 'semi-supervised learning',
    'supervised learning', 'reinforcement learning', 'meta-learning', 'transfer learning',
    'few-shot learning', 'zero-shot learning', 'generative model', 'deep generative model',
]

In [13]:
import re


# Regex-escaped, lower-cased form of every keyword in `deep_learning_methods`.
method_patterns = [re.escape(method.lower()) for method in deep_learning_methods]

# One compiled whole-word regex per distinct keyword, built once and kept in
# list order. The optional trailing "s" lets a keyword also match its plain
# plural (e.g. "neural networks", "cnns"), which `\b...\b` alone would miss.
_method_regexes = {}
for _method, _pattern in zip(deep_learning_methods, method_patterns):
    _method_regexes.setdefault(_method, re.compile(r'\b' + _pattern + r's?\b'))


def extract_deep_learning_methods(text):
    """Return the method keywords mentioned in `text` as a comma-separated string.

    Matching is case-insensitive and on whole words, also accepting a plain
    plural "s". Keywords are reported in the order they appear in
    `deep_learning_methods`, so the result is the same on every run.

    Args:
        text: The text to scan. Non-string values (e.g. NaN) are treated as
            containing no keywords.

    Returns:
        The matched keywords joined by ', ', or 'Not specified' when none match.
    """
    if not isinstance(text, str):
        return 'Not specified'

    text_lower = text.lower()
    methods_found = [
        method for method, regex in _method_regexes.items()
        if regex.search(text_lower)
    ]
    return ', '.join(methods_found) if methods_found else 'Not specified'



In [14]:
# Tag each paper with the method keywords found in its combined text.
filtered_df['Method Name'] = filtered_df['Text'].apply(extract_deep_learning_methods)

# Columns to export, in output order.
output_columns = [
    'PMID',
    'Title',
    'Authors',
    'Journal/Book',
    'Publication Year',
    'Method Classification',
    'Method Name',
    'Abstract',
]
result_df = filtered_df[output_columns]

In [15]:
# Preview the first five rows of the export table.
print(result_df.head(5))


       PMID                                              Title  \
0  39013794  Deep Learning - Methods to Amplify Epidemiolog...   
1  38454859  The scope of artificial intelligence in retino...   
2  33880950  Global evolution of research on pulmonary nodu...   
3  33457181            Deep Learning applications for COVID-19   
4  33328047  Prediction of systemic biomarkers from retinal...   

                                             Authors         Journal/Book  \
0  Alex Quistberg D, Mooney SJ, Tasdizen T, Arbel...       Am J Epidemiol   
1           Maitra P, Shah PK, Campbell PJ, Rishi P.  Indian J Ophthalmol   
2  Li N, Wang L, Hu Y, Han W, Zheng F, Song W, Ji...         Future Oncol   
3               Shorten C, Khoshgoftaar TM, Furht B.           J Big Data   
4  Rim TH, Lee G, Kim Y, Tham YC, Lee CJ, Baik SJ...  Lancet Digit Health   

   Publication Year Method Classification  \
0              2024                  both   
1              2024                  both   
2    

In [16]:
# Write the final table to disk without the DataFrame index column.
result_df.to_csv(
    'filtered_papers.csv',
    index=False,
)

