<a href="https://colab.research.google.com/github/shashithenuwara/IRWA_Project/blob/project/IRWA_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing required libraries for PDF text extraction, tokenization, normalization, and other NLP tasks
!pip install PyPDF2 pdfplumber spacy nltk whoosh transformers torch beautifulsoup4
!python -m spacy download en_core_web_sm

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m811.2 kB/s[0m eta [36m0:00:00[0m
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)


**Data Collection**

In [3]:
# @title
import os
import re
import pdfplumber
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT
from transformers import pipeline, BertTokenizer, BertModel

from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy's English model for NER and lemmatization
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
# Path to the dataset
pdf_directory = '/content/drive/My Drive/Colab Notebooks/DataSets'

**Data Preprocessing**

In [8]:


# Extracting text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function to clean the text
def clean_text(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r"[^a-zA-Z0-9\s,.!?'-]", '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to preprocess text (tokenization, stopword removal, etc.)
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Function to perform stemming
def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

# Function to perform lemmatization
def lemmatize_tokens(tokens):
    return [token.lemma_ for token in nlp(" ".join(tokens)) if token.is_alpha]

# Function to perform Named Entity Recognition (NER)
def perform_ner(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Extract, clean, and preprocess text for each PDF
pdf_texts = {}
cleaned_texts = {}
preprocessed_texts = []
stemmed_texts = []
lemmatized_texts = []
entities_per_doc = []



In [9]:
# Process each PDF and apply preprocessing
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_directory, filename)

        # Extract and clean text
        text = extract_text_from_pdf(pdf_path)
        cleaned = clean_text(text)
        tokens = preprocess_text(cleaned)

        # Apply stemming, lemmatization, and NER
        stemmed = stem_tokens(tokens)
        lemmatized = lemmatize_tokens(tokens)
        entities = perform_ner(cleaned)

        # Store results
        pdf_texts[filename] = text
        cleaned_texts[filename] = cleaned
        preprocessed_texts.append(tokens)
        stemmed_texts.append(stemmed)
        lemmatized_texts.append(lemmatized)
        entities_per_doc.append(entities)

# Print sample outputs for validation
print("Sample cleaned text:", list(cleaned_texts.values())[0])
print("Sample tokenized text:", preprocessed_texts[0])
print("Sample stemmed text:", stemmed_texts[0])
print("Sample lemmatized text:", lemmatized_texts[0])
print("Sample NER:", entities_per_doc[0])


Sample cleaned text: Cell Reports Resource A Comprehensive Nuclear Receptor Network for Breast Cancer Cells RalfKittler,1,9,8JieZhou,1,8SujunHua,1,8,10LijiaMa,1YuwenLiu,1ElishaPendleton,1ChaoCheng,2,3MarkGerstein,2,3,4 andKevinP.White1,5,6,7, 1InstituteofGenomicsandSystemsBiology,ArgonneNationalLaboratoryandTheUniversityofChicago,Chicago,IL60637,USA 2DepartmentofMolecularBiophysicsandBiochemistry 3PrograminComputationalBiologyandBioinformatics 4DepartmentofComputerScience YaleUniversity,NewHaven,CT06520,USA 5DepartmentofHumanGenetics 6DepartmentofEcologyandEvolution 7DepartmentofMedicine TheUniversityofChicago,Chicago,IL60637,USA 8Theseauthorscontributedequallytothiswork 9PresentaddressEugeneMcDermottCenterofHumanGrowthandDevelopment,TheUniversityofTexasSouthwesternMedicalCenter, Dallas,TX75235,USA 10PresentaddressTheUniversityofTexasMDAndersonCancerCenter,Houston,TX77030,USA Correspondencekpwhiteigsb.org httpdx.doi.org10.1016j.celrep.2013.01.004 SUMMARY poordrugtargets,withthenotablee

In [10]:
print("Files in the directory:", os.listdir(pdf_directory))

Files in the directory: ['A-Comprehensive-Nuclear-Receptor-Network-for-Breas.pdf', 'Artificial-intelligence-in-breast-cancer-diagnosti.pdf', 'Aurora-kinase-A-regulates-cancer-associated-RNA-ab.pdf', 'Autophagy-Suppresses-Breast-Cancer-Metastasis_devc.pdf', 'BikDD-Eliminates-Breast-Cancer-Initiating-Cells-an.pdf', 'Biological-differences-between-normal-and-cancer-a.pdf', 'Breast-Cancer-Polygenic-Risk-Score-and-Contralater.pdf', 'Breast-Cancer-Src-Activity--Bad-to-the-Bone_ccell.pdf', 'Breast-Cancer-Stem-Cells--Eradication-by-Different.pdf', 'Causal-relationship-between-dietary-factors-and-br.pdf', 'CDK7-Dependent-Transcriptional-Addiction-in-Triple.pdf', 'CHEK2-1100delC-and-Susceptibility-to-Breast-Cancer.pdf', 'Common-Breast-Cancer-Predisposition-Alleles-Are-As.pdf', 'Cordycepin-enhances-anti-tumor-immunity-in-breast-.pdf', 'Deciphering-breast-cancer--from-biology-to-the-cli.pdf', 'Direct-Transcriptional-Consequences-of-Somatic-Mut.pdf', '-em-Comprehensive-Analysis-of-the-Expressionand

Indexing with Search features (using Whoosh)

In [11]:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
import os

# Define schema for Whoosh indexing
schema = Schema(
    title=TEXT(stored=True),  # Store document title
    content=TEXT(stored=True),  # Store the original content
    stemmed_text=TEXT(stored=True),  # Store stemmed text
    lemmatized_text=TEXT(stored=True),  # Store lemmatized text
    entities=TEXT(stored=True)  # Store named entities
)

# Create the index directory if it doesn't exist
if not os.path.exists("whoosh_index"):
    os.mkdir("whoosh_index")

# Create the index
index = create_in("whoosh_index", schema)

# Index the documents
writer = index.writer()

for filename, original_text, stemmed, lemmatized, entities in zip(
    pdf_texts.keys(), pdf_texts.values(), stemmed_texts, lemmatized_texts, entities_per_doc):

    # Add each document to the index
    writer.add_document(
        title=filename,
        content=original_text,
        stemmed_text=" ".join(stemmed),
        lemmatized_text=" ".join(lemmatized),
        entities=" ".join([ent[0] for ent in entities])
    )

# Commit the changes
writer.commit()
print("Indexing completed!")


Indexing completed!


In [12]:
from whoosh.qparser import QueryParser

# Open the index for searching
index = open_dir("whoosh_index")

# Basic content search
with index.searcher() as searcher:
    query = QueryParser("content", index.schema).parse("cancer treatment")
    results = searcher.search(query)

    for result in results:
        print(f"Title: {result['title']}")


Title: Asiaticoside-inhibits-breast-cancer-progression-an.pdf
Title: Cordycepin-enhances-anti-tumor-immunity-in-breast-.pdf
Title: GABA-baclofen-stabilizes-PD-L1-and-enhances-immuno.pdf
Title: Datopotamab-deruxtecan--A-novel-antibody-drug-conj.pdf
Title: A-comparison-between-clinical-decision-support-sys.pdf
Title: BikDD-Eliminates-Breast-Cancer-Initiating-Cells-an.pdf
Title: Impacts-of-designed-vanillic-acid-polymer-magnetic.pdf
Title: A-bibliometric-study-of-the-intellectual-base-and-.pdf
Title: The-Expression-of-ZNF268-and-Its-Role-in-The-Cispl.pdf
Title: Endophytic-fungi--A-future-prospect-for-breast-can.pdf


**NLP Techniques**

Fuzzy Search

In [13]:
# Fuzzy search with edit distance
with index.searcher() as searcher:
    query = QueryParser("content", index.schema).parse("cancr~1")  # Fuzzy search for 'cancer'
    results = searcher.search(query)

    for result in results:
        print(f"Title: {result['title']}")


Entity Recognition

In [14]:
# Search in lemmatized text
with index.searcher() as searcher:
    query = QueryParser("lemmatized_text", index.schema).parse("cancer treatment")
    results = searcher.search(query)

    for result in results:
        print(f"Title: {result['title']}")

# Search in entities field
with index.searcher() as searcher:
    query = QueryParser("entities", index.schema).parse("cancer")
    results = searcher.search(query)

    for result in results:
        print(f"Title: {result['title']}")


Title: Asiaticoside-inhibits-breast-cancer-progression-an.pdf
Title: Cordycepin-enhances-anti-tumor-immunity-in-breast-.pdf
Title: BikDD-Eliminates-Breast-Cancer-Initiating-Cells-an.pdf
Title: GABA-baclofen-stabilizes-PD-L1-and-enhances-immuno.pdf
Title: Datopotamab-deruxtecan--A-novel-antibody-drug-conj.pdf
Title: The-Expression-of-ZNF268-and-Its-Role-in-The-Cispl.pdf
Title: A-comparison-between-clinical-decision-support-sys.pdf
Title: Emerging-treatments-in-HER2-positive-advanced-brea.pdf
Title: Metabolic-Imaging-Detects-Resistance-to-PI3K&#x3b1.pdf
Title: A-bibliometric-study-of-the-intellectual-base-and-.pdf
Title: Exploring-the-impact-of-breast-cancer-on-colonizat.pdf
Title: Estrogens-and-the-risk-of-breast-cancer--A-narrati.pdf
Title: Oncolytic-Adenoviruses-Kill-Breast-Cancer-Initiati.pdf
Title: Biological-differences-between-normal-and-cancer-a.pdf
Title: Breast-cancer-susceptibility&#x2014;A-new-look-at-.pdf
Title: GLS-and-GOT2-as-prognostic-biomarkers-associated-w.pdf
Title: R

Query Expansion

In [15]:
from nltk.corpus import wordnet

# Query expansion using WordNet
def expand_query(query):
    synonyms = set()
    for syn in wordnet.synsets(query):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return " OR ".join(synonyms)

# Expanded query for 'cancer'
expanded_query = expand_query("cancer")

# Search using the expanded query
with index.searcher() as searcher:
    query = QueryParser("content", index.schema).parse(expanded_query)
    results = searcher.search(query)

    for result in results:
        print(f"Title: {result['title']}")


Title: Estrogens-and-the-risk-of-breast-cancer--A-narrati.pdf
Title: Causal-relationship-between-dietary-factors-and-br.pdf
Title: Characterization-of-the-tumor-microenvironment-in-.pdf
Title: Review--Predictive-approaches-to-breast-cancer-ris.pdf
Title: Exploring-the-impact-of-breast-cancer-on-colonizat.pdf
Title: MiR-338&#x2013;5p,-a-novel-metastasis-related-miRN.pdf
Title: A-bioinformatic-analysis-found-low-expression-and-.pdf
Title: Cytokines-and-cell-adhesion-molecules-exhibit-dist.pdf
Title: Molecular-subtype-identification-and-prognosis-str.pdf
Title: ncRNAs-mediated-overexpression-of--em-TET3--em--pr.pdf


In [16]:
# Install the necessary libraries
!pip install transformers torch scikit-learn



Semantic Search

In [17]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BioBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = BertModel.from_pretrained("dmis-lab/biobert-v1.1")

# Function to get the BERT embedding for a given text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # Disable gradient computation for speed
        outputs = model(**inputs)
    # Return the mean of the token embeddings (average pooling)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy() # Squeeze to remove extra dimension

# Example query for semantic search
query = "cancer treatment"
query_embedding = get_embedding(query)

# Assuming 'pdf_texts' is a dictionary where keys are document filenames and values are their extracted content.
# Compute embeddings for each document in the corpus
document_embeddings = [get_embedding(text) for text in pdf_texts.values()]

# Calculate cosine similarity between the query embedding and each document embedding
similarities = cosine_similarity([query_embedding], document_embeddings)[0]

# Rank the documents by similarity (highest similarity first)
top_indices = similarities.argsort()[-5:][::-1]  # Top 5 most relevant documents

# Print the filenames of the top documents based on similarity
print("Top 5 relevant documents based on semantic search:")
for idx in top_indices:
    print(f"Document: {list(pdf_texts.keys())[idx]} - Similarity Score: {similarities[idx]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Top 5 relevant documents based on semantic search:
Document: Pregnancy-and-breast-cancer--The-other-side-of-the.pdf - Similarity Score: 0.6887888312339783
Document: Focus-on-breast-cancer_ccell.pdf - Similarity Score: 0.6822837591171265
Document: A-new-key-in-breast-cancer-metastasis_ccell.pdf - Similarity Score: 0.6794993877410889
Document: Periostin-binding-DNA-Aptamer-Inhibits-Breast-Canc.pdf - Similarity Score: 0.6724396347999573
Document: Photo-sono-activated-BNT@MoS-sub-2--sub--composite.pdf - Similarity Score: 0.6704385280609131


In [18]:
!pip install flask flask_sqlalchemy


Collecting flask_sqlalchemy
  Downloading flask_sqlalchemy-3.1.1-py3-none-any.whl.metadata (3.4 kB)
Downloading flask_sqlalchemy-3.1.1-py3-none-any.whl (25 kB)
Installing collected packages: flask_sqlalchemy
Successfully installed flask_sqlalchemy-3.1.1


In [19]:
import flask
import flask_sqlalchemy


In [20]:
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/')
def home():
    return jsonify({'message': 'Hello, World!'}), 200


In [22]:
!pip install pyngrok # use to host flask as colab doesn't allow direct hosting




In [None]:
!ngrok config add-authtoken 2mbYowCbzDDA47omhyfn7mfDSnU_7RKJtcLWCysu7WCAV5Whj # removed the <> around the token as this is not required and may be causing the issue
from pyngrok import ngrok

# Create a tunnel for the Flask app
public_url = ngrok.connect(5000)
print(f"Public URL: {public_url}")

# Run the Flask app (make sure you have defined the Flask app 'app' in a previous cell)
app.run(port=5000)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Public URL: NgrokTunnel: "https://27dd-34-23-225-66.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
