### Identifying and classifying biological entities (genes, proteins, diseases, and drugs) in text

In [2]:
# Example keyword dictionaries, one list per entity category (simplified for demonstration).
# Note that "insulin" is listed as both a protein and a drug.
genes = [
    "BRCA1",
    "TP53",
    "EGFR",
    "APOE",
]
proteins = [
    "hemoglobin",
    "collagen",
    "myosin",
    "insulin",
]
diseases = [
    "cancer",
    "diabetes",
    "alzheimer",
    "influenza",
]
drugs = [
    "aspirin",
    "metformin",
    "insulin",
    "ibuprofen",
]

import re

def classify_entities(text, categories=None):
    """Report which known keywords occur in ``text``, grouped by category.

    Each keyword is matched as a whole word, case-insensitively. Keywords are
    treated as literal strings: they are escaped before being placed in the
    regular expression, so names containing regex metacharacters (for example
    ``"HLA-B*27"`` or ``"E. coli"``) match themselves and nothing else.

    Parameters
    ----------
    text : str
        The text to scan.
    categories : dict[str, list[str]], optional
        Mapping of category label to the keywords belonging to it. When
        omitted, the module-level ``genes``, ``proteins``, ``diseases`` and
        ``drugs`` lists are used under the labels "Genes", "Proteins",
        "Diseases" and "Drugs".

    Returns
    -------
    dict[str, list[str]]
        One entry per category, holding the keywords found in ``text`` in the
        order they are listed in that category (spelled as listed, not as they
        appear in the text). Categories with no hits map to an empty list.
    """
    if categories is None:
        # Looked up at call time so the current module-level lists are used.
        categories = {
            "Genes": genes,
            "Proteins": proteins,
            "Diseases": diseases,
            "Drugs": drugs,
        }

    return {
        label: [
            keyword
            for keyword in keywords
            # re.escape keeps metacharacters in a keyword from being read as regex syntax.
            if re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE)
        ]
        for label, keywords in categories.items()
    }

# Sample passage that mentions genes, a protein, a drug and diseases
text = """
BRCA1 and TP53 are important genes in cancer research.
Hemoglobin levels are crucial in diagnosing anemia.
Metformin is commonly used to treat diabetes.
"""

# Run the keyword classifier over the passage
entities = classify_entities(text)

# Report one line per category, falling back to 'None' when nothing was found
print("Identified Entities:")
for category, items in entities.items():
    listing = ', '.join(items) if items else 'None'
    print(f"{category}: {listing}")


Identified Entities:
Genes: BRCA1, TP53
Proteins: hemoglobin
Diseases: cancer, diabetes
Drugs: metformin


### Extracting gene-disease relationships from text 

In [3]:
import re

# Keyword lists used by the relationship extractor (simplified for demonstration)
genes = [
    "BRCA1",
    "TP53",
    "EGFR",
    "APOE",
]
diseases = [
    "cancer",
    "diabetes",
    "alzheimer",
    "influenza",
]

# Connective phrases that signal a gene-disease relationship; each one is wrapped
# in word boundaries so it only matches as a whole phrase.
relationship_patterns = [
    rf'\b{phrase}\b'
    for phrase in ("causes", "linked to", "associated with", "increases the risk of")
]



def extract_gene_disease_relationships(text, gene_names=None, disease_names=None, patterns=None):
    """Find gene-disease pairs joined by a relationship phrase on the same line.

    For every (gene, disease, pattern) combination the text is searched,
    case-insensitively, for either "gene ... pattern ... disease" or
    "disease ... pattern ... gene". Because ``.`` does not match a newline,
    a match never spans more than one line of ``text``; because ``.*`` is
    greedy, the reported context extends to the last qualifying occurrence
    on that line.

    Gene names are escaped and matched as whole words, so "EGFR" is not found
    inside "VEGFR". Disease names are escaped and anchored at a word start
    only, so suffixed forms such as "cancers" or "Alzheimer's" still count.

    Parameters
    ----------
    text : str
        The text to scan.
    gene_names : list[str], optional
        Gene names to look for; defaults to the module-level ``genes``.
    disease_names : list[str], optional
        Disease names to look for; defaults to the module-level ``diseases``.
    patterns : list[str], optional
        Regular-expression fragments for the relationship phrase, inserted
        as-is (not escaped); defaults to the module-level
        ``relationship_patterns``.

    Returns
    -------
    list[tuple[str, str, str]]
        ``(gene, disease, matched_text)`` tuples, ordered by gene, then
        disease, then pattern. At most one tuple (the first match in the
        text) is produced per combination.
    """
    # Fall back to the notebook-level vocabularies, looked up at call time.
    if gene_names is None:
        gene_names = genes
    if disease_names is None:
        disease_names = diseases
    if patterns is None:
        patterns = relationship_patterns

    relationships = []
    for gene in gene_names:
        # Whole-word, literal match for the gene symbol.
        gene_re = r'\b' + re.escape(gene) + r'\b'
        for disease in disease_names:
            # Literal match anchored at a word start; trailing letters are allowed
            # so plural and possessive forms are still recognised.
            disease_re = r'\b' + re.escape(disease)
            for pattern in patterns:
                regex = (
                    fr'{gene_re}.*{pattern}.*{disease_re}'
                    fr'|{disease_re}.*{pattern}.*{gene_re}'
                )
                match = re.search(regex, text, re.IGNORECASE)
                if match:
                    relationships.append((gene, disease, match.group()))

    return relationships


# Sample passage: one gene-disease statement per line
text = """
BRCA1 is strongly associated with breast cancer and ovarian cancer.
Mutations in the TP53 gene can cause a variety of cancers.
EGFR mutations have been linked to non-small cell lung cancer.
The APOE gene is associated with Alzheimer's disease.
"""

# Pull out every (gene, disease, context) triple the extractor recognises
relationships = extract_gene_disease_relationships(text)

# Print one line per extracted relationship
print("Extracted Gene-Disease Relationships:")
for relation in relationships:
    gene, disease, sentence = relation
    print(f"Gene: {gene}, Disease: {disease}, Context: {sentence}")


Extracted Gene-Disease Relationships:
Gene: BRCA1, Disease: cancer, Context: BRCA1 is strongly associated with breast cancer and ovarian cancer
Gene: EGFR, Disease: cancer, Context: EGFR mutations have been linked to non-small cell lung cancer
Gene: APOE, Disease: alzheimer, Context: APOE gene is associated with Alzheimer


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Toy training set: one sentence per category label
texts = [
    "BRCA1 is associated with breast cancer.",
    "The influenza virus causes the flu.",
]
labels = ["Cancer", "Virus"]

# TF-IDF features feeding a multinomial Naive Bayes classifier, built and fitted in one step
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(texts, labels)

# Classify a sentence the model has not seen
new_text = "TP53 mutation is linked with several cancers."
prediction = model.predict([new_text])
print(f"Predicted category: {prediction[0]}")


Predicted category: Cancer


### Text Mining for Gene-Disease Associations. Identify associations between genes and diseases from scientific literature.

In [16]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from Bio import Entrez
from Bio import Medline

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Sentence to clean up
sentence = "Mutations in the BRCA1 gene are associated with an increased risk of breast cancer."

# Split into tokens, then keep only those whose lowercase form is not a stop word.
# Punctuation tokens are not stop words, so they are kept.
tokens = word_tokenize(sentence)
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]

print(filtered_tokens)


['Mutations', 'BRCA1', 'gene', 'associated', 'increased', 'risk', 'breast', 'cancer', '.']


In [9]:
import nltk
# Fetch the VADER lexicon before importing the analyzer. Re-running is harmless:
# an existing copy is simply reported as already up-to-date.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Sample clinical note (mixes a positive and a negative statement)
note = "The patient shows signs of improvement but is still experiencing severe pain."

# Analyze sentiment: polarity_scores returns a dict with 'neg', 'neu', 'pos'
# and 'compound' entries
sentiment = sia.polarity_scores(note)
print(sentiment)


{'neg': 0.416, 'neu': 0.477, 'pos': 0.106, 'compound': -0.7814}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Sudeep\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Biomedical Literature Summarization: automatically generate summaries of research papers or clinical trial reports.

In [3]:
from transformers import pipeline

# Load the summarization pipeline with the checkpoint pinned explicitly.
# Without `model`/`revision`, transformers falls back to a library default and warns
# that this is not recommended; the values below are the ones that default resolved to,
# so the summary is unchanged but no longer depends on the library's default.
summarizer = pipeline("summarization", model="google-t5/t5-small", revision="d769bba")

# Sample text (abstract of a paper)
text = """
Microbiome data analysis and its interpretation into meaningful biological insights remain very challenging for
numerous reasons, perhaps most prominently, due to the need to account for multiple factors, including
collinearity, sparsity (excessive zeros) and effect size, that the complex experimental workflow and subsequent
downstream data analysis require. Moreover, a meaningful microbiome data analysis necessitates the development
of interpretable models that incorporate inferences across available data as well as background biomedical
knowledge. We developed a multimodal framework that considers sparsity (excessive zeros), lower effect size,
intrinsically microbial correlations, i.e., collinearity, as well as background biomedical knowledge in the form of
a cluster-infused enriched network architecture. Finally, our framework also provides a candidate taxa/Operational
Taxonomic Unit (OTU) that can be targeted for future validation experiments.
"""

# Summarize the text; do_sample=False turns off sampling so repeated runs give the same summary
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)

# The pipeline returns a list with one dict per input; the text is under 'summary_text'
print(summary[0]['summary_text'])


No model was supplied, defaulted to google-t5/t5-small and revision d769bba (https://huggingface.co/google-t5/t5-small).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


a meaningful microbiome data analysis necessitates the development of interpretable models that incorporate inferences across available data as well as background biomedical knowledge .


In [2]:
from textblob import TextBlob

# Example text: a single sentence reporting a study finding
text = "The study found a significant increase in survival rates with the new drug."

# Perform sentiment analysis; the `sentiment` attribute carries both a
# polarity and a subjectivity score
blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Sentiment: {sentiment}")


Sentiment: Sentiment(polarity=0.2556818181818182, subjectivity=0.6647727272727273)
