<a href="https://colab.research.google.com/github/shfarhaan/ml-notebooks/blob/main/Cholera_Data_cleaning_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
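# One-time environment setup: uncomment and run these on a fresh runtime.
# NOTE(review): the two scispaCy model archives below are pinned to different
# releases (v0.5.4 vs v0.5.1) - confirm both load under the installed spaCy.
#!pip install torch transformers
#!pip install scispacy
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz
#!pip install transformers torch pandas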

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import requests
import time
import random
import csv
import datetime
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
import os

import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from google.colab import drive
import scispacy
import spacy
import string


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the merged disease dataset (v2) from Google Drive into `df`

from google.colab import drive

# Drive has to be mounted before any /content/gdrive path can be read
drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

merged_csv_path = '/content/gdrive/MyDrive/Omdena/NLP in drug prediction/all_merged_datasets_v2.csv'
df = pd.read_csv(merged_csv_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
###Data cleaning

# Select all text columns
text_columns = ["Disease name", "Symptoms", "Source", "Treatment", "Diagnosis", "Drugs name"]
# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna('') has nothing left to replace and
# "nan" would flow into the cleaning, de-duplication and NER steps as real text.
df[text_columns] = df[text_columns].fillna('').astype(str)

# 1) Clean Text
# Translation table that deletes every character in string.punctuation.
# Built once so it is not recomputed for every cell of the DataFrame.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise one text value for downstream processing.

    Steps, in order: lowercase, delete ASCII punctuation
    (``string.punctuation``), delete runs of digits, collapse consecutive
    whitespace into a single space and trim both ends.

    Parameters:
    - text (str): The raw text. Must be a string (``.lower()`` is called on it).

    Returns:
    - str: The cleaned text; may be empty.
    """
    text = text.lower()  # Convert to lowercase
    # Remove punctuation. str.translate is used instead of interpolating
    # string.punctuation into a regex character class: unescaped, the "\]"
    # inside it is read as an escaped bracket, so backslashes were never removed.
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Clean every text column element-wise. Series.map per column replaces the
# deprecated DataFrame.applymap and works on both older and newer pandas.
df[text_columns] = df[text_columns].apply(lambda column: column.map(clean_text))



### Delete duplicates: keep one row per (Disease name, Source) pair
# Within each pair keep the row with the longest Symptoms text (the first such
# row on ties), with all of its columns. Rows are selected by index label
# instead of groupby(...).apply(...), which relied on the deprecated behaviour
# of passing the grouping columns into the applied function.
# Missing Symptoms count as length -1 so a group without any Symptoms text
# falls back to its first row.
symptom_lengths = df['Symptoms'].str.len().fillna(-1)
longest_row_labels = symptom_lengths.groupby([df['Disease name'], df['Source']]).idxmax()
df = df.loc[longest_row_labels.to_numpy()].reset_index(drop=True)

  df[text_columns] = df[text_columns].applymap(clean_text)
  df = df.groupby(['Disease name', 'Source'], group_keys=False).apply(


In [None]:
# 2) Extract Medical Entities using SciSpaCy
# Load the spaCy pipeline once; `nlp` is the module-level model that
# extract_entities() calls for every cell of text.
# NOTE(review): going by its name this is the BC5CDR model (chemical / disease
# mentions) - confirm which labels it emits before filtering on any others.
nlp = spacy.load("en_ner_bc5cdr_md")
# Collect the entity mentions carrying one given label from a piece of text
def extract_entities(text, entity_type):
    """Return the entities of ``entity_type`` found in ``text`` as one string.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    entity whose ``label_`` equals ``entity_type`` is kept, in document order.

    Parameters:
    - text: The text to analyse; a missing value (``pd.isna``) yields "".
    - entity_type (str): The entity label to keep.

    Returns:
    - str: Matching entity texts joined with ", ", or "" when there are none.
    """
    # Missing cells short-circuit before the model is invoked
    if pd.isna(text):
        return ""

    matches = []
    for span in nlp(text).ents:
        if span.label_ == entity_type:
            matches.append(span.text)
    # Joining an empty list already yields ""
    return ", ".join(matches)

# Apply entity extraction to each column with the specified entity type
df["Symptoms_SciSpaCy"] = df["Symptoms"].apply(lambda x: extract_entities(x, "DISEASE"))
df["Diagnosis_SciSpaCy"] = df["Diagnosis"].apply(lambda x: extract_entities(x, "DISEASE"))
df["Drugs_SciSpaCy"] = df["Drugs name"].apply(lambda x: extract_entities(x, "CHEMICAL"))
# The BC5CDR model only labels DISEASE and CHEMICAL mentions; filtering on
# "TREATMENT" matched nothing and left this column empty for every row.
# Drug/chemical mentions are what the model can recover from treatment text.
# NOTE(review): confirm CHEMICAL is the intended content for this column.
df["Treatment_SciSpaCy"] = df["Treatment"].apply(lambda x: extract_entities(x, "CHEMICAL"))


In [None]:
# 3) Extract Medical Entities using bioBert
# Use a BioBERT checkpoint fine-tuned for NER (for drug NER). The active
# checkpoint is the one assigned to `model_name` below; going by its name it
# targets chemical entities.
# Alternative checkpoint, kept for reference:
#model_name = "d4data/biobert_cased_ner_bc5cdr"
model_name = "alvaroalon2/biobert_chemical_ner"

# Load tokenizer and model (both from the same checkpoint so vocab and weights match)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a Named Entity Recognition (NER) pipeline.
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (dicts with 'word', 'entity_group', 'score') rather than per-token tags;
# those are the keys read by the extraction function that uses `nlp_ner`.
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to extract named entities
def extract_entities(text):
    """Run the module-level ``nlp_ner`` pipeline on ``text``.

    Parameters:
    - text: The text to analyse. Anything that is not a ``str`` is skipped.

    Returns:
    - list[tuple[str, str, float]] | None: One ``(word, entity_group, score)``
      tuple per entity reported by the pipeline, or ``None`` when ``text`` is
      not a string.
    """
    if not isinstance(text, str):  # Ensure text is a string
        return None

    ner_results = nlp_ner(text)
    # Scores are converted to built-in floats: the pipeline may hand back NumPy
    # scalars, and under NumPy 2 those are written to CSV as "np.float32(...)"
    # when the list is stringified, which makes the saved column hard to parse.
    return [
        (entity['word'], entity['entity_group'], float(entity['score']))
        for entity in ner_results
    ]

# Run the NER function over every 'Treatment' cell to collect drug-name mentions
treatment_text = df["Treatment"]
df["extracted_Treatment_BioBERT"] = treatment_text.apply(extract_entities)

Device set to use cpu


In [None]:
# Persist the enriched table as the v3 dataset; the row index is not written
output_csv_path = '/content/gdrive/MyDrive/Omdena/NLP in drug prediction/all_merged_datasets_v3.csv'
df.to_csv(output_csv_path, index=False)

### Code ends here

In [None]:
# 3) Extract Medical Entities using bioBert

# Load BioBERT NER models
# aggregation_strategy="simple" is the supported replacement for the deprecated
# grouped_entities=True flag and matches the other NER pipeline in this notebook.
# NOTE(review): both checkpoint names look like base (pre-trained) encoders
# rather than NER fine-tunes - confirm they actually emit DISEASE / CHEMICAL /
# TREATMENT entity groups, otherwise every extracted column will be empty.
ner_disease = pipeline("ner", model="dmis-lab/biobert-base-cased-v1.1", aggregation_strategy="simple")  # For Symptoms & Diagnosis
# Same checkpoint and options as ner_disease, so reuse it instead of loading a
# second copy of the model into memory.
ner_drugs = ner_disease  # For Drugs
ner_treatment = pipeline("ner", model="michiyasunaga/BioLinkBERT-base", aggregation_strategy="simple")  # For Treatments



from transformers import AutoTokenizer

# Load tokenizer for truncation.
# This rebinds the module-level name `tokenizer`, replacing any tokenizer
# loaded by an earlier cell.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def extract_entities(text, model, target_types):
    """
    Collect the entities of selected types that an NER pipeline finds in a text.

    The text is first encoded with the module-level ``tokenizer`` (truncated to
    at most 512 tokens) and decoded back to a string; that shortened string is
    what the pipeline receives.

    Parameters:
    - text (str): The input text; missing or whitespace-only values yield "".
    - model (callable): The NER pipeline; called with the shortened text and
      expected to return dicts with 'word' and 'entity_group' keys.
    - target_types (list): The entity_group labels to keep.

    Returns:
    - str: The kept entity words joined with ", ", or "" when there are none.
    """
    # Nothing to analyse for missing or blank cells
    if pd.isna(text):
        return ""
    if not text.strip():
        return ""

    # Round-trip through the tokenizer so over-long inputs are cut to 512 tokens
    encoding = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    first_sequence_ids = encoding["input_ids"][0]
    clipped_text = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

    # Keep only the entities whose group is one of the requested labels
    kept_words = []
    for entity in model(clipped_text):
        if entity['entity_group'] in target_types:
            kept_words.append(entity['word'])

    # Joining an empty list already yields ""
    return ", ".join(kept_words)



# Replace every missing value with an empty string before extraction
df.fillna("", inplace=True)

# One entry per derived column:
# (source column, output column, NER pipeline, entity labels to keep)
extraction_plan = [
    ("Symptoms", "Symptoms_Extracted", ner_disease, ["DISEASE"]),
    ("Diagnosis", "Diagnosis_Extracted", ner_disease, ["DISEASE"]),
    ("Drugs name", "Drugs_Extracted", ner_drugs, ["CHEMICAL"]),
    ("Treatment", "Treatment_Extracted", ner_treatment, ["TREATMENT"]),
]

# Apply each pipeline to its column, in the order listed above
for source_col, target_col, ner_model, wanted_labels in extraction_plan:
    # Defaults bind the current pipeline/labels to the lambda for this iteration
    df[target_col] = df[source_col].apply(
        lambda x, ner_model=ner_model, wanted_labels=wanted_labels: extract_entities(x, ner_model, wanted_labels)
    )


