In [43]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import string

# Fetch the NLTK resources used by the preprocessing code in this notebook.
nltk.download('punkt')
# Recent NLTK releases (3.8.2+/3.9) load the Punkt sentence models from the
# 'punkt_tab' package instead of the pickled 'punkt' one; without it
# word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [139]:
# Cell-level NLP resources read by preprocess_text: a WordNet lemmatizer and
# the English stop-word list, held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalise a raw sentence into a space-separated string of lemmas.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    ``stop_words`` are dropped and every remaining token is passed through
    ``lemmatizer.lemmatize``. No punctuation stripping is applied, so
    punctuation tokens produced by the tokenizer are kept.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmatised tokens joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


In [140]:
def parse_pico_dataset(filepath, encoding='utf-8'):
    """Read a pipe-delimited PICO file into a DataFrame of cleaned sentences.

    Each useful line has the form ``<name>|<label>|<text>``. Only the first two
    pipes are treated as separators, so the text itself may contain ``|``.
    Lines with fewer than three fields are skipped, and only rows whose label
    is one of ``P``, ``I``, ``C`` or ``O`` are kept. The text of every kept row
    is run through ``preprocess_text``.

    Args:
        filepath: Path of the dataset file.
        encoding: Text encoding used to read the file. Set explicitly so the
            result does not depend on the platform default (assumes the
            dataset is UTF-8/ASCII -- TODO confirm).

    Returns:
        A DataFrame with columns ``label`` and ``text``, one row per kept
        line in file order. The columns exist even when no row matched.
    """
    pico_labels = {'P', 'I', 'C', 'O'}
    data = []
    with open(filepath, 'r', encoding=encoding) as file:
        for line in file:
            parts = line.strip().split('|', 2)
            # Blank lines, lines without pipes and lines with a single pipe
            # cannot be unpacked into name/label/text; skip them.
            if len(parts) != 3:
                continue
            _, label, text = parts
            if label in pico_labels:  # Filter only PICO elements
                data.append({'label': label, 'text': preprocess_text(text)})
    # Explicit columns keep df['label'] / df['text'] valid for an empty result.
    return pd.DataFrame(data, columns=['label', 'text'])

# Load the training split. The path is relative to the notebook's working
# directory; forward slashes are accepted by open() on Windows as well as on
# POSIX systems, so the cell is not tied to one operating system.
df = parse_pico_dataset('./PICO/data/splitted/PICO_train_relabelled.txt')

In [141]:
# Preview the cleaned text of the first row.
df['text'].iloc[0]

'people crc considered inclusion trial @ year old , diagnosed primary crc recovery period postsurgery ( could still receiving adjuvant therapy ) .'

In [142]:
# Show the first five rows of the parsed frame.
df.head(n=5)

Unnamed: 0,label,text
0,P,people crc considered inclusion trial @ year o...
1,P,@ % ( n = @ ) eligible crc survivor consented ...
2,P,"@ crc survivor , @ people cardiovascular disea..."
3,I,referral postsurgical crc survivor weekly cr e...
4,I,class included crc survivor people cvd .


In [127]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load Bio_ClinicalBERT: the tokenizer and the encoder are created from the
# same checkpoint identifier, kept in one place so they cannot drift apart.
BIO_CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(BIO_CLINICAL_BERT)
model = AutoModel.from_pretrained(BIO_CLINICAL_BERT)




In [143]:
# Helper function to chunk text
def chunk_text(text, max_length=512, overlap=50):
    """Split ``text`` into overlapping windows of at most ``max_length`` tokens.

    The text is encoded with the notebook-level ``tokenizer`` (no special
    tokens), cut into windows that start every ``max_length - overlap``
    tokens, and each window is decoded back to a string. Windowing stops as
    soon as a window reaches the end of the token sequence, so no trailing
    chunk made only of already-covered overlap tokens is produced.

    NOTE(review): each chunk may hold ``max_length`` tokens *before* special
    tokens are added. ``get_embedding`` re-tokenises with ``max_length=512``
    and truncation, so with the default settings the tail of a full chunk is
    presumably cut there; the overlap should still cover those tokens, but
    consider passing a smaller ``max_length`` -- TODO confirm.

    Args:
        text: String to split.
        max_length: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_length``.

    Returns:
        List of decoded chunk strings in document order; empty when the text
        encodes to no tokens.

    Raises:
        ValueError: If ``max_length`` or ``overlap`` is out of range. A
            non-positive stride would otherwise yield no chunks at all or an
            opaque ``range()`` error, and a negative overlap would leave gaps
            between chunks.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    stride = max_length - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + max_length]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        # This window already reaches the last token; a further one would
        # only repeat tokens that are covered here.
        if start + max_length >= len(tokens):
            break
    return chunks

# Helper function to get embeddings for a single text
def get_embedding(text, tokenizer, model):
    """Embed ``text`` as the model's hidden state at the first token position.

    The text is tokenised to PyTorch tensors (truncated/padded with a
    ``max_length`` of 512) and run through ``model`` with gradient tracking
    disabled. The vector returned is ``last_hidden_state[:, 0, :]`` -- the raw
    hidden state at position 0 (the [CLS] slot for BERT-style tokenizers), not
    the model's separate pooled output -- with the leading batch axis squeezed
    away when it has size 1.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable returning a mapping of model inputs.
        model: Callable accepting those inputs as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        The position-0 hidden-state tensor.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        model_output = model(**encoded)
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.squeeze(0)

In [144]:
# Reference passage that the candidate sentences are scored against.
description = """
Types of studies: We will include individually- and cluster-randomized controlled trials (RCTs). We will include both parallel and cross-over trials, where only data from the first intervention phase, prior to cross-over, will be used.
"""

# Split the passage into token windows and embed every window once up front.
chunks = chunk_text(description)
chunk_embeddings = [get_embedding(piece, tokenizer, model) for piece in chunks]

# Candidate sentences: the first ten cleaned texts whose label is 'P'.
texts = df.loc[df['label'] == 'P', 'text'].values[:10]

In [145]:
# Compare each text against all description chunks.
# `results` maps a text to its list of cosine similarities, one per chunk in
# chunk order. Because it is keyed by the text itself, a repeated text
# overwrites its earlier entry.
results = {}
for text in texts:
    text_embedding = get_embedding(text, tokenizer, model)
    scores = []
    for chunk_embedding in chunk_embeddings:
        # unsqueeze(0) adds a leading axis so each embedding is passed to
        # cosine_similarity as a single-row 2-D input; .item() unwraps the
        # resulting single value to a plain Python number.
        similarity = cosine_similarity(
            text_embedding.unsqueeze(0), chunk_embedding.unsqueeze(0)
        ).item()
        scores.append(similarity)
    results[text] = scores

In [146]:
# Print every text with its per-chunk similarity, then a verdict: a text counts
# as a match when at least one chunk scores at or above the threshold.
threshold = 0.7  # Adjust threshold for sensitivity
for text, scores in results.items():
    print(f"Text: {text}")
    for i, score in enumerate(scores):
        print(f"  Chunk {i + 1} Similarity Score: {score:.2f}")
    print(
        "Overall Match: Yes (matches at least one chunk)"
        if any(score >= threshold for score in scores)
        else "Overall Match: No (no matches found)"
    )
    print("-" * 50)

Text: people crc considered inclusion trial @ year old , diagnosed primary crc recovery period postsurgery ( could still receiving adjuvant therapy ) .
  Chunk 1 Similarity Score: 0.76
Overall Match: Yes (matches at least one chunk)
--------------------------------------------------
Text: @ % ( n = @ ) eligible crc survivor consented participate trial .
  Chunk 1 Similarity Score: 0.82
Overall Match: Yes (matches at least one chunk)
--------------------------------------------------
Text: @ crc survivor , @ people cardiovascular disease ( cvd ) , @ crc nurse @ cr clinician participated qualitative study .
  Chunk 1 Similarity Score: 0.81
Overall Match: Yes (matches at least one chunk)
--------------------------------------------------
Text: seventy-four subject ( mean age = @ year , sd = @ , range = @ ) spinal cord injury resulting motor loss ( @ tetraplegia @ paraplegia ) studied .
  Chunk 1 Similarity Score: 0.81
Overall Match: Yes (matches at least one chunk)
-----------------------