<a href="https://colab.research.google.com/github/shashithenuwara/IRWA_Project/blob/project/IRWA_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing required libraries for PDF text extraction, tokenization, normalization, and other NLP tasks
!pip install PyPDF2 pdfplumber spacy nltk whoosh transformers torch beautifulsoup4
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


**Data Collection**

In [None]:
# @title
import os
import re
import pdfplumber
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT
from transformers import pipeline, BertTokenizer, BertModel

from google.colab import drive
# Mount Google Drive at /content/drive so the dataset folder on Drive can be read.
drive.mount('/content/drive')

# Download necessary NLTK resources
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' data in word_tokenize instead of 'punkt';
# without it tokenization fails with a LookupError. On older releases that do not
# know this resource, nltk.download() is expected to just report it and return False
# (NOTE(review): confirm against the NLTK version actually installed).
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy's English model for NER and lemmatization
nlp = spacy.load('en_core_web_sm')
# Kept as a set because preprocess_text tests membership once per token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Folder on the mounted Google Drive that holds the PDF dataset
pdf_directory = "/content/drive/My Drive/Colab_Notebooks/DataSets"

**Data Preprocessing**

In [None]:


# Extracting text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file; passed to ``pdfplumber.open``.

    Returns:
        str: The per-page texts joined with a newline character. A page
        whose ``extract_text()`` yields nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (e.g. a scanned image); `or ""` keeps that from raising TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not glued to the
    # first word of the next page.
    return "\n".join(page_texts)

# Function to clean the text
def clean_text(text):
    """Strip markup and unwanted characters, then normalise whitespace.

    The input is parsed with BeautifulSoup's "html.parser" and reduced to its
    text. Every character other than ASCII letters, digits, whitespace and
    the punctuation , . ! ? ' - is then removed, runs of whitespace are
    collapsed to a single space, and the ends are trimmed.

    Args:
        text: Raw text (possibly containing markup).

    Returns:
        str: The cleaned, single-spaced text.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    filtered = re.sub(r"[^a-zA-Z0-9\s,.!?'-]", '', plain)
    return re.sub(r'\s+', ' ', filtered).strip()

# Function to preprocess text (tokenization, stopword removal, etc.)
def preprocess_text(text):
    """Tokenize lower-cased ``text`` and keep only the useful word tokens.

    Args:
        text: Text to tokenize.

    Returns:
        list[str]: Tokens from ``word_tokenize`` that are purely alphabetic
        and not present in ``stop_words``, in their original order.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        if not candidate.isalpha():
            continue  # drops numbers, punctuation and mixed tokens
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Function to perform stemming
def stem_tokens(tokens):
    """Return the stem of every token, in order, using the shared ``stemmer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[str]: One stem per input token.
    """
    stems = []
    for word in tokens:
        stems.append(stemmer.stem(word))
    return stems

# Function to perform lemmatization
def lemmatize_tokens(tokens):
    """Lemmatize tokens by running them through the ``nlp`` pipeline.

    The tokens are joined with single spaces into one string, processed by
    ``nlp``, and the lemma of every resulting token flagged ``is_alpha`` is
    collected.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[str]: Lemmas of the alphabetic tokens, in document order.
    """
    doc = nlp(" ".join(tokens))
    lemmas = []
    for tok in doc:
        if tok.is_alpha:
            lemmas.append(tok.lemma_)
    return lemmas

# Function to perform Named Entity Recognition (NER)
def perform_ner(text):
    """Run ``nlp`` over ``text`` and list the entities it reports.

    Args:
        text: Text to analyse.

    Returns:
        list[tuple[str, str]]: One ``(entity text, entity label)`` pair per
        item in the processed document's ``ents``.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Containers for the per-document results of the processing loop:
# the dicts are keyed by PDF filename, the lists get one entry per processed PDF.
pdf_texts, cleaned_texts = {}, {}
preprocessed_texts, stemmed_texts = [], []
lemmatized_texts, entities_per_doc = [], []



In [None]:
# Process each PDF and apply preprocessing.
# The result containers are re-created here so that re-running this cell starts
# from scratch instead of appending every document a second time.
pdf_texts = {}
cleaned_texts = {}
preprocessed_texts = []
stemmed_texts = []
lemmatized_texts = []
entities_per_doc = []

# sorted() gives a reproducible document order (os.listdir returns entries in
# arbitrary order); lower() makes the extension check also accept '.PDF'.
for filename in sorted(os.listdir(pdf_directory)):
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_directory, filename)

    # Extract and clean text
    text = extract_text_from_pdf(pdf_path)
    cleaned = clean_text(text)
    tokens = preprocess_text(cleaned)

    # Apply stemming, lemmatization, and NER
    stemmed = stem_tokens(tokens)
    lemmatized = lemmatize_tokens(tokens)
    entities = perform_ner(cleaned)

    # Store results: dicts keyed by filename, lists appended in the same order
    pdf_texts[filename] = text
    cleaned_texts[filename] = cleaned
    preprocessed_texts.append(tokens)
    stemmed_texts.append(stemmed)
    lemmatized_texts.append(lemmatized)
    entities_per_doc.append(entities)

# Print sample outputs for validation (first processed document only).
# Guarded so an empty or PDF-less folder gives a clear message instead of IndexError.
if preprocessed_texts:
    print("Sample cleaned text:", list(cleaned_texts.values())[0])
    print("Sample tokenized text:", preprocessed_texts[0])
    print("Sample stemmed text:", stemmed_texts[0])
    print("Sample lemmatized text:", lemmatized_texts[0])
    print("Sample NER:", entities_per_doc[0])
else:
    print(f"No PDF files found in {pdf_directory}")


Sample cleaned text: Original Article Comprehensive Analysis of the Expression and Prognosis for TDO2 in Breast Cancer Qiang Liu,1,3 Jie Zhai,1,2,3 Xiangyi Kong,1,3 Xiangyu Wang,1 Zhongzhao Wang,1,4 Yi Fang,1,4 and Jing Wang1,4 1DepartmentofBreastSurgicalOncology,NationalCancerCenterNationalClinicalResearchCenterforCancerCancerHospital,ChineseAcademyofMedicalSciences andPekingUnionMedical College,Beijing 100021,PeoplesRepublic ofChina2Department ofTranslationalMolecularPathology,MDAndersonCancer Center, UniversityofTexas,Houston,TX77030,USA Aplethoraofpreviousstudieshavebeenfocusedontheroleof Programmedcelldeath1PD1isaprominentimmunecheckpoint indoleamine 2,3-dioxygenase 1 IDO1 in cancer immunity reportedtoinduceanimmunosuppressiveeffectintumorsbyinter- however,thealternativewayoftargetingtryptophan2,3-diox- acting with programmed death ligand 1 PD-L1.5,6 Indeed, PD-L1 ygenase TDO2 in cancer immunotherapy has been largely was overexpressed in TNBC compared with other BC subtypes,5,6 ig

In [None]:
# Show every entry of the dataset folder (not only PDFs) to verify the Drive path.
dataset_entries = os.listdir(pdf_directory)
print("Files in the directory:", dataset_entries)

Files in the directory: ['-em-Comprehensive-Analysis-of-the-Expressionand-Pr.pdf', '-em-ATM--em--Heterozygous-Germline-Mutations-Contr.pdf', '-em-AKTIP--em--loss-is-enriched-in-ER&#x3b1;-posit.pdf', 'A-new-key-in-breast-cancer-metastasis_ccell.pdf', 'A-prognostic-nomogram-for-predicting-breast-cancer.pdf', '-em-JARID1B--em--Is-a-Luminal-Lineage-Driving-Onco.pdf', '27-Hydroxycholesterol-Promotes-Cell-Autonomous,-ER.pdf', 'A-joint-transcriptome-wide-association-study-acros.pdf', '-em-N--em--acetylcysteine-overcomes--em-NF1--em--l.pdf', 'A-Comprehensive-Nuclear-Receptor-Network-for-Breas.pdf', 'A-bioinformatic-analysis-found-low-expression-and-.pdf', 'A-bibliometric-study-of-the-intellectual-base-and-.pdf', 'A-Pin1-Mutant-p53-Axis-Promotes-Aggressiveness-in&.pdf', 'A-collection-of-breast-cancer-cell-lines-for-the-s.pdf', 'A-comparison-between-clinical-decision-support-sys.pdf', '-em-NEAT1--em--is-essential-for-metabolic-changes-.pdf', 'A-novel-oncogenic-enhancer-of-estrogen-receptor-po.pd

Indexing with Search features (using Whoosh)

In [None]:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
import os

# Define schema for Whoosh indexing; every field is stored so it can be read back from a hit.
schema = Schema(
    title=TEXT(stored=True),  # PDF filename
    content=TEXT(stored=True),  # Raw text as extracted from the PDF
    stemmed_text=TEXT(stored=True),  # Space-joined stemmed tokens
    lemmatized_text=TEXT(stored=True),  # Space-joined lemmas
    entities=TEXT(stored=True)  # Space-joined named-entity strings (labels are not indexed)
)

# Create the index directory if it doesn't exist (no error when it already does)
os.makedirs("whoosh_index", exist_ok=True)

# Create the index. NOTE(review): create_in() is documented to start a new index
# in the directory, so re-running this cell rebuilds it from scratch.
index = create_in("whoosh_index", schema)

# Index the documents.
# Using the writer as a context manager commits when the block finishes and
# cancels the writer if add_document raises, so a failure does not leave the
# index write lock held for the next index.writer() call.
with index.writer() as writer:
    # pdf_texts is keyed by filename in processing order, which is also the
    # order of the per-document lists, so zipping them lines each document up.
    for (filename, original_text), stemmed, lemmatized, entities in zip(
            pdf_texts.items(), stemmed_texts, lemmatized_texts, entities_per_doc):

        # Add each document to the index
        writer.add_document(
            title=filename,
            content=original_text,
            stemmed_text=" ".join(stemmed),
            lemmatized_text=" ".join(lemmatized),
            # entities holds (text, label) pairs; only the text is indexed
            entities=" ".join(ent_text for ent_text, _label in entities)
        )

print("Indexing completed!")


Indexing completed!


In [None]:
from whoosh.qparser import QueryParser

# Re-open the on-disk index created by the indexing cell
index = open_dir("whoosh_index")

# Parse the keyword query against the raw-text field
content_parser = QueryParser("content", index.schema)
query = content_parser.parse("cancer treatment")

# Basic content search: print the stored title of every returned hit
with index.searcher() as searcher:
    results = searcher.search(query)
    for hit in results:
        print(f"Title: {hit['title']}")


Title: Asiaticoside-inhibits-breast-cancer-progression-an.pdf
Title: Cordycepin-enhances-anti-tumor-immunity-in-breast-.pdf
Title: GABA-baclofen-stabilizes-PD-L1-and-enhances-immuno.pdf
Title: Datopotamab-deruxtecan--A-novel-antibody-drug-conj.pdf
Title: A-comparison-between-clinical-decision-support-sys.pdf
Title: BikDD-Eliminates-Breast-Cancer-Initiating-Cells-an.pdf
Title: Impacts-of-designed-vanillic-acid-polymer-magnetic.pdf
Title: A-bibliometric-study-of-the-intellectual-base-and-.pdf
Title: The-Expression-of-ZNF268-and-Its-Role-in-The-Cispl.pdf
Title: Endophytic-fungi--A-future-prospect-for-breast-can.pdf


**NLP Techniques**

Fuzzy Search

In [None]:
# Fuzzy search with edit distance
from whoosh.qparser import FuzzyTermPlugin, QueryParser

# The "term~N" syntax is only recognised once FuzzyTermPlugin has been added to
# the parser; with the default plugins the "~1" below is not treated as a fuzzy
# operator and the search comes back empty.
fuzzy_parser = QueryParser("content", index.schema)
fuzzy_parser.add_plugin(FuzzyTermPlugin())

with index.searcher() as searcher:
    # Matches terms within edit distance 1 of 'cancr', e.g. 'cancer'
    query = fuzzy_parser.parse("cancr~1")
    results = searcher.search(query)

    for result in results:
        print(f"Title: {result['title']}")


Search on Lemmatized Text and Named Entities

In [None]:
# Run a keyword query against two derived fields, one after the other:
#   1. the lemmatized text
#   2. the entities field
# and print the stored title of every hit for each.
for field_name, query_text in (
    ("lemmatized_text", "cancer treatment"),
    ("entities", "cancer"),
):
    query = QueryParser(field_name, index.schema).parse(query_text)
    with index.searcher() as searcher:
        results = searcher.search(query)
        for hit in results:
            print(f"Title: {hit['title']}")


Title: Asiaticoside-inhibits-breast-cancer-progression-an.pdf
Title: Cordycepin-enhances-anti-tumor-immunity-in-breast-.pdf
Title: BikDD-Eliminates-Breast-Cancer-Initiating-Cells-an.pdf
Title: GABA-baclofen-stabilizes-PD-L1-and-enhances-immuno.pdf
Title: Datopotamab-deruxtecan--A-novel-antibody-drug-conj.pdf
Title: The-Expression-of-ZNF268-and-Its-Role-in-The-Cispl.pdf
Title: A-comparison-between-clinical-decision-support-sys.pdf
Title: Emerging-treatments-in-HER2-positive-advanced-brea.pdf
Title: Metabolic-Imaging-Detects-Resistance-to-PI3K&#x3b1.pdf
Title: A-bibliometric-study-of-the-intellectual-base-and-.pdf
Title: Exploring-the-impact-of-breast-cancer-on-colonizat.pdf
Title: Estrogens-and-the-risk-of-breast-cancer--A-narrati.pdf
Title: Oncolytic-Adenoviruses-Kill-Breast-Cancer-Initiati.pdf
Title: Biological-differences-between-normal-and-cancer-a.pdf
Title: Breast-cancer-susceptibility&#x2014;A-new-look-at-.pdf
Title: GLS-and-GOT2-as-prognostic-biomarkers-associated-w.pdf
Title: R

Query Expansion

In [None]:
from nltk.corpus import wordnet

# Query expansion using WordNet
def expand_query(query):
    """Build an OR-query from ``query`` and its WordNet synonyms.

    Args:
        query: The term to expand.

    Returns:
        str: The distinct terms (the original term plus every lemma name of
        every synset WordNet returns for it) in sorted order, joined with
        " OR ". Multi-word terms are wrapped in double quotes so they are
        parsed as phrases. If WordNet knows no synonyms the result is just
        the original term; an empty query yields an empty string.
    """
    # Always keep the original term so an unknown word still searches for itself.
    terms = {query.strip()}
    for syn in wordnet.synsets(query):
        for lemma in syn.lemmas():
            # WordNet writes multi-word lemmas with underscores
            # (e.g. "malignant_neoplastic_disease"); left as-is they would be
            # looked up as one token that ordinary text does not contain.
            terms.add(lemma.name().replace("_", " "))
    terms.discard("")
    # sorted() makes the generated query string reproducible between runs.
    return " OR ".join(
        f'"{term}"' if " " in term else term for term in sorted(terms)
    )

# Turn 'cancer' into an OR-query of its WordNet synonyms and parse it for the raw-text field
expanded_query = expand_query("cancer")
query = QueryParser("content", index.schema).parse(expanded_query)

# Search using the expanded query and print the stored title of every hit
with index.searcher() as searcher:
    results = searcher.search(query)
    for hit in results:
        print(f"Title: {hit['title']}")


Title: Estrogens-and-the-risk-of-breast-cancer--A-narrati.pdf
Title: Causal-relationship-between-dietary-factors-and-br.pdf
Title: Characterization-of-the-tumor-microenvironment-in-.pdf
Title: Review--Predictive-approaches-to-breast-cancer-ris.pdf
Title: Exploring-the-impact-of-breast-cancer-on-colonizat.pdf
Title: MiR-338&#x2013;5p,-a-novel-metastasis-related-miRN.pdf
Title: A-bioinformatic-analysis-found-low-expression-and-.pdf
Title: Cytokines-and-cell-adhesion-molecules-exhibit-dist.pdf
Title: Molecular-subtype-identification-and-prognosis-str.pdf
Title: ncRNAs-mediated-overexpression-of--em-TET3--em--pr.pdf


In [None]:
# Install the necessary libraries
!pip install transformers torch scikit-learn



Semantic Search

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BioBERT tokenizer and model from the same checkpoint name
biobert_checkpoint = "dmis-lab/biobert-v1.1"
tokenizer = BertTokenizer.from_pretrained(biobert_checkpoint)
model = BertModel.from_pretrained(biobert_checkpoint)

# Function to get the BERT embedding for a given text
def get_embedding(text):
    """Embed ``text`` as the mean of its token vectors from the loaded model.

    The text is tokenized with truncation at 512 tokens, so only the start of
    a longer document contributes to its embedding.

    Args:
        text: A string to embed. A list of strings is also handled: padded
            positions are excluded from the average through the attention mask.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 dimensions squeezed
        out (a 1-D vector for a single string).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # Disable gradient computation for speed
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Average over the token axis (dim 1) using only real tokens: the mask is 1
    # for tokens and 0 for padding, so padded positions add nothing to the sum
    # and are not counted in the divisor. For a single string nothing is
    # padded and this is the plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()  # Squeeze to remove extra dimension

# Example query for semantic search
query = "cancer treatment"
query_embedding = get_embedding(query)

# pdf_texts maps each document filename to its extracted text. Fix the filename
# order once so names, embeddings and similarity scores share the same indices.
document_names = list(pdf_texts)
document_embeddings = [get_embedding(pdf_texts[name]) for name in document_names]

# Cosine similarity of the query against every document (single query row -> [0])
similarities = cosine_similarity([query_embedding], document_embeddings)[0]

# Indices of the 5 highest scores, best match first
top_indices = similarities.argsort()[-5:][::-1]

# Print the filenames of the top documents based on similarity
print("Top 5 relevant documents based on semantic search:")
for idx in top_indices:
    print(f"Document: {document_names[idx]} - Similarity Score: {similarities[idx]}")



Top 5 relevant documents based on semantic search:
Document: Pregnancy-and-breast-cancer--The-other-side-of-the.pdf - Similarity Score: 0.6887888312339783
Document: Focus-on-breast-cancer_ccell.pdf - Similarity Score: 0.6822837591171265
Document: A-new-key-in-breast-cancer-metastasis_ccell.pdf - Similarity Score: 0.6794993877410889
Document: Periostin-binding-DNA-Aptamer-Inhibits-Breast-Canc.pdf - Similarity Score: 0.6724398136138916
Document: Photo-sono-activated-BNT@MoS-sub-2--sub--composite.pdf - Similarity Score: 0.6704385280609131
