In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the later cells read and write their
# files under '/content/drive/My Drive/NLP', so this cell has to run first.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import glob
import os

# Path where your CSV files are located in the NLP folder on Google Drive
path = '/content/drive/My Drive/NLP'
# Sorted so the concatenation order (and therefore output.txt) is the same on
# every run; glob itself returns files in arbitrary order.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

with open('/content/drive/My Drive/NLP/output.txt', 'w') as file:  # open the output file in write mode
    for filename in all_files:
        df = pd.read_csv(filename)  # read the CSV file

        # Some inputs name the text column 'SHORT-TEXT'; normalise it to 'TEXT'.
        # Only rename when 'TEXT' is absent so we never create duplicate columns.
        if 'SHORT-TEXT' in df.columns and 'TEXT' not in df.columns:
            df = df.rename(columns={'SHORT-TEXT': 'TEXT'})

        # This folder also receives generated CSVs (e.g. top_30_words.csv, which
        # has only 'word'/'count' columns). Skip anything without a text column
        # instead of crashing with a KeyError when the notebook is re-run.
        if 'TEXT' not in df.columns:
            continue

        # Drop missing values and coerce to str so the .str accessor works even
        # when pandas inferred a non-string dtype for the column.
        combined = df['TEXT'].dropna().astype(str).str.cat(sep=' ')

        # Write the 'TEXT' column to the output file, followed by a separator so
        # the last word of this file is not fused with the first word of the next.
        file.write(combined)
        file.write(' ')


In [None]:
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz
!pip install transformers

Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl.metadata (16 kB)
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-5.0.1-py3-none-any.whl.metadata (21 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pybind11<2.6.2 (from nmslib>=1.7.3.6->scispacy)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl.metadata (8.7 kB)
Downloading scispacy-0.5.4-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━

In [None]:
import pandas as pd
import re
from collections import Counter
from google.colab import files

# Read the combined corpus back from the text file on Drive
with open('/content/drive/My Drive/NLP/output.txt', 'r') as file:
    text = file.read()

# Lower-case the corpus, pull out every run of word characters and tally them
word_pattern = re.compile(r'\w+')
words = word_pattern.findall(text.lower())
counter = Counter(words)
top_30_words = counter.most_common(30)

# Persist the 30 most frequent words as a two-column CSV (no index column)
top_words_csv = '/content/drive/My Drive/NLP/top_30_words.csv'
df = pd.DataFrame(data=top_30_words, columns=['word', 'count'])
df.to_csv(top_words_csv, index=False)

# Optionally, download the CSV file to your local machine
files.download(top_words_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer
from collections import Counter

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
file_path = '/content/drive/My Drive/NLP/output.txt'

# tokenize() works on the string it is handed, so the file has to be read
# first. Passing file_path directly tokenizes the path itself ('/', 'drive',
# 'content', ...) rather than the corpus stored in the file.
with open(file_path, 'r') as file:
    text = file.read()

# Count WordPiece tokens over the whole corpus and show the 30 most frequent.
tokens = tokenizer.tokenize(text)
counter = Counter(tokens)
top_30_tokens = counter.most_common(30)
print(top_30_tokens)

[('/', 5), ('drive', 2), ('content', 1), ('my', 1), ('nl', 1), ('##p', 1), ('output', 1), ('.', 1), ('tx', 1), ('##t', 1)]




In [None]:
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz
!pip install transformers
!pip install torch


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz (17.0 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz (70.1 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import spacy
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline
import pandas as pd
from collections import Counter

# Load SpaCy models
# Both names refer to installed model packages; spacy.load() raises if the
# corresponding pip install has not been run in this environment.
nlp_sci = spacy.load("en_core_sci_sm")
nlp_bc5cdr = spacy.load("en_ner_bc5cdr_md")

# Load BioBERT
# NOTE(review): this cell's own output warns that 'classifier.bias' and
# 'classifier.weight' are newly initialized for this checkpoint, i.e. the
# token-classification head is untrained. Its predictions are therefore not
# meaningful NER labels until the model is fine-tuned or replaced by an
# NER-fine-tuned checkpoint.
tokenizer_biobert = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model_biobert = BertForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
# No `device` argument is passed; per the warning in the cell output the model
# runs on CPU even when a hardware accelerator is available.
nlp_biobert = pipeline("ner", model=model_biobert, tokenizer=tokenizer_biobert)

def extract_entities(text, nlp_model, label, chunk_size=1000000):
    """
    Extract entities of a specific label using SpaCy.

    The text is processed in chunks of at most ``chunk_size`` characters so a
    very long document is never handed to the pipeline in one piece. Chunk
    boundaries are moved back to the nearest whitespace so a word (and any
    entity ending in it) is not cut in half between two chunks. An entity that
    spans the whitespace at a boundary can still be split.

    Args:
        text: The full text to analyse.
        nlp_model: A callable returning an object with an ``ents`` iterable,
            whose items expose ``text`` and ``label_`` (e.g. a loaded SpaCy
            pipeline).
        label: Only entities whose ``label_`` equals this value are returned.
        chunk_size: Maximum number of characters per chunk (default: 1 million).

    Returns:
        A list of entity strings in document order; duplicates are kept.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    entities = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Prefer to cut just after the last whitespace inside the window.
            # If the window holds no whitespace past its first character (one
            # huge "word"), fall back to the hard cut so we always advance.
            cut = max(text.rfind(ws, start, end) for ws in (' ', '\n', '\t'))
            if cut > start:
                end = cut + 1

        doc = nlp_model(text[start:end])
        entities.extend(ent.text for ent in doc.ents if ent.label_ == label)
        start = end

    return entities

def extract_entities_biobert(text, label, chunk_size=510):
    """
    Extract entities using BioBERT.

    The text is tokenized with the module-level ``tokenizer_biobert``, split
    into chunks of at most ``chunk_size`` WordPiece tokens, and each chunk is
    converted back to a string and run through the module-level ``nlp_biobert``
    pipeline.

    Args:
        text: The full text to analyse.
        label: Prefix to match against each prediction's ``'entity'`` value
            (e.g. ``'B-DISEASE'``). Which tags exist depends on the label set
            of the loaded model.
        chunk_size: Maximum WordPiece tokens per chunk. The default is 510
            rather than 512 because the pipeline adds ``[CLS]`` and ``[SEP]``
            around every input and BERT accepts at most 512 positions.

    Returns:
        A list with the ``'word'`` of every matching prediction, in order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    tokens = tokenizer_biobert.tokenize(text)
    total = len(tokens)

    entities = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Do not let the next chunk begin with a '##' continuation piece:
            # a word cut in half is re-tokenized differently after the round
            # trip through convert_tokens_to_string. Back up to the start of
            # that word; if the word fills the whole window, keep the hard cut
            # so the loop always advances.
            boundary = end
            while boundary > start and tokens[boundary].startswith('##'):
                boundary -= 1
            if boundary > start:
                end = boundary

        chunk_text = tokenizer_biobert.convert_tokens_to_string(tokens[start:end])
        results = nlp_biobert(chunk_text)
        entities.extend(ent['word'] for ent in results if ent['entity'].startswith(label))
        start = end

    return entities

# Read text from file
file_path = '/content/drive/My Drive/NLP/output.txt'
with open(file_path, 'r') as file:
    text = file.read()

# Extract entities
# NOTE(review): en_core_sci_sm is a general biomedical pipeline that presumably
# tags every mention with one generic label rather than 'DISEASE'/'DRUG', in
# which case both lists below are empty. Confirm with
# nlp_sci.get_pipe('ner').labels before relying on these numbers.
diseases_sci = extract_entities(text, nlp_sci, 'DISEASE')
drugs_sci = extract_entities(text, nlp_sci, 'DRUG')

# en_ner_bc5cdr_md is trained on the BC5CDR corpus, whose entity types are
# DISEASE and CHEMICAL. There is no 'DRUG' label, so filtering on it always
# returns an empty list; drugs are reported as CHEMICAL.
diseases_bc5cdr = extract_entities(text, nlp_bc5cdr, 'DISEASE')
drugs_bc5cdr = extract_entities(text, nlp_bc5cdr, 'CHEMICAL')

# The BioBERT helper keeps predictions whose 'entity' tag starts with the given
# prefix; which tags exist depends on the label set of the loaded model.
diseases_biobert = extract_entities_biobert(text, 'B-DISEASE')
drugs_biobert = extract_entities_biobert(text, 'B-DRUG')

# Compare results
def compare_entities(entities1, entities2):
    """
    Compare two entity lists by occurrence count.

    Returns a 5-tuple of Counters:
        (counts of entities1,
         counts of entities2,
         multiset intersection - the minimum count of each shared entity,
         entities1 minus entities2 - positive count differences only,
         entities2 minus entities1 - positive count differences only)
    """
    first_counts, second_counts = Counter(entities1), Counter(entities2)
    return (
        first_counts,
        second_counts,
        first_counts & second_counts,
        first_counts - second_counts,
        second_counts - first_counts,
    )

# Perform comparisons
# compare_entities returns five values: (counter of first list, counter of
# second list, common, unique to first, unique to second). Every call must
# unpack all five; unpacking into four names raises ValueError.
diseases_sci_counter, diseases_bc5cdr_counter, common_diseases, unique_diseases_sci, unique_diseases_bc5cdr = compare_entities(diseases_sci, diseases_bc5cdr)
drugs_sci_counter, drugs_bc5cdr_counter, common_drugs, unique_drugs_sci, unique_drugs_bc5cdr = compare_entities(drugs_sci, drugs_bc5cdr)

# SciSpacy vs BioBERT. The first returned value is again the SciSpacy counter
# (same content as above). The last name holds the entities unique to BioBERT;
# the '*_bc5cdr_biobert' names are kept because the print statements use them.
diseases_sci_counter, diseases_biobert_counter, common_diseases_biobert, unique_diseases_sci_biobert, unique_diseases_bc5cdr_biobert = compare_entities(diseases_sci, diseases_biobert)
drugs_sci_counter, drugs_biobert_counter, common_drugs_biobert, unique_drugs_sci_biobert, unique_drugs_bc5cdr_biobert = compare_entities(drugs_sci, drugs_biobert)

# Print results
# Each row is (heading, value); a heading that starts with "\n" opens a new
# section with a blank line, exactly as separate print calls would.
report_rows = [
    ("Diseases detected by SciSpacy:", len(diseases_sci)),
    ("Diseases detected by BC5CDR:", len(diseases_bc5cdr)),
    ("Diseases detected by BioBERT:", len(diseases_biobert)),

    ("\nDrugs detected by SciSpacy:", len(drugs_sci)),
    ("Drugs detected by BC5CDR:", len(drugs_bc5cdr)),
    ("Drugs detected by BioBERT:", len(drugs_biobert)),

    ("\nCommon diseases between SciSpacy and BC5CDR:", common_diseases),
    ("Unique diseases in SciSpacy:", unique_diseases_sci),
    ("Unique diseases in BC5CDR:", unique_diseases_bc5cdr),

    ("\nCommon drugs between SciSpacy and BC5CDR:", common_drugs),
    ("Unique drugs in SciSpacy:", unique_drugs_sci),
    ("Unique drugs in BC5CDR:", unique_drugs_bc5cdr),

    ("\nCommon diseases between SciSpacy and BioBERT:", common_diseases_biobert),
    ("Unique diseases in SciSpacy compared to BioBERT:", unique_diseases_sci_biobert),
    ("Unique diseases in BioBERT compared to SciSpacy:", unique_diseases_bc5cdr_biobert),

    ("\nCommon drugs between SciSpacy and BioBERT:", common_drugs_biobert),
    ("Unique drugs in SciSpacy compared to BioBERT:", unique_drugs_sci_biobert),
    ("Unique drugs in BioBERT compared to SciSpacy:", unique_drugs_bc5cdr_biobert),
]

for heading, value in report_rows:
    print(heading, value)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Installs the v0.5.0 builds of the two scispaCy model packages.
# NOTE(review): scispaCy model packages are presumably tied to the scispacy
# release they were built for. Confirm that v0.5.0 matches the scispacy
# version installed in this environment, and that no other cell installs a
# different model version (the last install to run wins).
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz (15.9 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz (120.2 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en_ner_bc5cdr_md
  Building wheel for en_ner_bc5cdr_md (setup.py) ... [?25l[?25hdone
  Created wheel for en_ner_bc5cdr_md: filename=en_ner_bc5cdr_md-0.5.0-py3-none-any.whl size=120215835 sha256=3da2b6a9f7f90f5bd8a05a0397f1ef952701c36a415ba53cacc24a7be4da85c4
  Stored in directory: /root/.cache/pip/wheels/44/e8/99/517b2d53bb44945cf7a96208d44bae722e13f028736a1f1f4f
Successfully built en_ner_bc5