In [1]:
import zipfile
with zipfile.ZipFile("/content/drive/MyDrive/Classroom/syab_ner.zip", 'r') as zip_ref:
        zip_ref.extractall("/content")

Task 1

In [2]:
import os
import pandas as pd

def extract_text_from_csvs(directory, output_file):
    text_columns = ['SHORT-TEXT', 'TEXT']
    text_data = []

    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            print(f"\nProcessing file: {file}")
            df = pd.read_csv(file_path)
            found_column = None
            for col in text_columns:
                if col in df.columns:
                    found_column = col
                    break

            if found_column:
                non_empty_text = df[found_column].dropna()
                non_empty_text = non_empty_text[non_empty_text.str.strip() != '']
                if not non_empty_text.empty:
                    print(f"Found non-empty text in '{found_column}' column of {file}")
                    text_data.extend(non_empty_text.tolist())
                else:
                    print(f"'{found_column}' column in {file} is empty. Skipping...")
            else:
                print(f"No matching text column found in {file}. Skipping...")

    if text_data:
        with open(output_file, 'w', encoding='utf-8') as f:
            for line in text_data:
                f.write(line + "\n")
        print(f"\nText extracted and saved to {output_file}")
    else:
        print("\nNo non-empty text found in any file.")

directory = '/content'
output_file = 'combined_texts.txt'
extract_text_from_csvs(directory, output_file)


Processing file: CSV4.csv
Found non-empty text in 'TEXT' column of CSV4.csv

Processing file: CSV3.csv
Found non-empty text in 'TEXT' column of CSV3.csv

Processing file: CSV2.csv
Found non-empty text in 'TEXT' column of CSV2.csv

Processing file: CSV1.csv
Found non-empty text in 'SHORT-TEXT' column of CSV1.csv

Text extracted and saved to combined_texts.txt


Task 2


In [6]:
! pip install spacy

! pip install scispacy
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz

! pip install transformers
! pip install torch


Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl.metadata (16 kB)
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-5.0.1-py3-none-any.whl.metadata (21 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pybind11<2.6.2 (from nmslib>=1.7.3.6->scispacy)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl.metadata (8.7 kB)
Downloading scispacy-0.5.4-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz (125.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.1/125.1 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en_ner_bc5cdr_md
  Building wheel for en_ner_bc5cdr_md (setup.py) ... [?25l[?25hdone
  Created wheel for en_ner_bc5cdr_md: filename=en_ner_bc5cdr_md-0.4.0-py3-none-any.whl size=125666862 sha256=d8bdeaa9302410d5d6d65c1d672c47658183d3aadba8bc653c904cd5e9ba6ec2
  Stored in directory: /root/.cache/pip/wheels/c3/f5/32/313d08b812c91abeb6fb1d3b0f8fd69687c30c3a9d38288e4c
Successfully built en_ner_bc5cdr_md
Installing collected packages: en_ner_bc5cdr_md
Successfully installed en_ner_bc5cdr_md-0.4.0


Preprocessing- Removing stopwords and punctuation

In [None]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocess_text(input_file, output_file):
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words and word not in punctuation]
    cleaned_text = ' '.join(cleaned_words)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

# creating another cleaned text file version by removing stopwords and punctuation
input_file = 'combined_texts.txt'
output_file = 'cleaned_text.txt'
preprocess_text(input_file, output_file)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Task 3.1

In [3]:
from collections import Counter
import csv
import re

def is_valid_word(word):
    # Check if the word contains at least one letter
    return bool(re.search('[a-zA-Z]', word))

def clean_word(word):
    # Remove special characters, keeping letters and numbers
    word = re.sub(r'[^a-zA-Z0-9]', '', word)
    # Convert to lowercase
    return word.lower()

def count_top_words(text_file, output_csv, n=30):
    with open(text_file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split into words, clean, and filter
    words = text.split()
    words = [clean_word(word) for word in words if is_valid_word(word)]

    word_counts = Counter(words)
    top_n_words = word_counts.most_common(n)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Word', 'Count'])
        writer.writerows(top_n_words)

text_file = '/content/drive/MyDrive/shihab_ner/cleaned_text.txt'
output_csv = 'top_30_words.csv'
count_top_words(text_file, output_csv)

Task 3.2


In [None]:
from transformers import AutoTokenizer

from collections import Counter

def count_unique_tokens_in_chunks(model_name, text_file, chunk_size=100000):

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_counts = Counter()

    with open(text_file, 'r', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            tokens = tokenizer.tokenize(chunk)
            token_counts.update(tokens)

    top_30_tokens = token_counts.most_common(30)
    return top_30_tokens

model_name = 'dmis-lab/biobert-base-cased-v1.1'
top_30_tokens = count_unique_tokens_in_chunks(model_name, 'cleaned_text.txt')
print(top_30_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[('*', 12189445), ('.', 8664470), ('-', 5886183), (':', 4056236), (',', 3656739), (']', 2774288), ('[', 2774146), (')', 2240298), ('(', 2191960), ('p', 2051941), ('c', 1391393), ('1', 1326035), ('s', 1190697), ('##t', 1121930), ('h', 1098371), ('2', 1067324), ('/', 1037316), ('##g', 1027279), ('##s', 1026796), ('##c', 1020957), ('name', 950920), ('m', 885848), ('##r', 815703), ('mg', 801954), ('3', 742886), ('patient', 726625), ('##d', 710858), ('5', 701630), ('##l', 666635), ('##b', 653322)]


In [None]:
import spacy
import torch
from tqdm import tqdm

spacy.prefer_gpu()
print(f"GPU available: {spacy.require_gpu()}")

nlp_sci = spacy.load("en_core_sci_sm")
nlp_bc5cdr = spacy.load("en_ner_bc5cdr_md")

nlp_sci.max_length = 2000000
nlp_bc5cdr.max_length = 2000000

def extract_entities_in_chunks(text_file, chunk_size=50000):
    sci_entities = []
    bc5cdr_entities = []

    with open(text_file, 'r', encoding='utf-8') as f:
        text = f.read()

    total_chunks = (len(text) + chunk_size - 1) // chunk_size

    for i in tqdm(range(0, len(text), chunk_size), total=total_chunks, desc="Processing chunks"):
        chunk = text[i:i+chunk_size]
        doc_sci = nlp_sci(chunk)
        doc_bc5cdr = nlp_bc5cdr(chunk)

        sci_chunk_entities = [(ent.text, ent.label_) for ent in doc_sci.ents]
        bc5cdr_chunk_entities = [(ent.text, ent.label_) for ent in doc_bc5cdr.ents]

        sci_entities.extend(sci_chunk_entities)
        bc5cdr_entities.extend(bc5cdr_chunk_entities)

    return sci_entities, bc5cdr_entities

if __name__ == "__main__":
    input_file = '/content/drive/MyDrive/shihab_ner/cleaned_text.txt'
    sci_entities, bc5cdr_entities = extract_entities_in_chunks(input_file)

    print(f"Total SciSpacy Entities: {len(sci_entities)}")
    print(f"Total BC5CDR Entities: {len(bc5cdr_entities)}")

    print("\nSample SciSpacy Entities:", sci_entities[:10])
    print("\nSample BC5CDR Entities:", bc5cdr_entities[:10])

    with open('sci_entities.txt', 'w', encoding='utf-8') as f:
        for entity, label in sci_entities:
            f.write(f"{entity}\t{label}\n")

    with open('bc5cdr_entities.txt', 'w', encoding='utf-8') as f:
        for entity, label in bc5cdr_entities:
            f.write(f"{entity}\t{label}\n")

  _C._set_default_tensor_type(t)


GPU available: True


Processing chunks:   1%|          | 99/11719 [03:37<6:34:16,  2.04s/it]