In [1]:
import nltk
import os
import shutil
import subprocess
import sys
import pkg_resources
from pkg_resources import DistributionNotFound, VersionConflict

In [2]:
def install_if_missing(requirement):
    try:
        pkg_resources.require(requirement)
    except (DistributionNotFound, VersionConflict):
        subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])

# Set up NLTK data path
nltk_data_path = '/mnt/data/skanda/MSc_IRD_LLM'
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

# Install required packages
required_packages = [
    'pydantic==2.8.2',
    'spacy==3.7.5',
    'scispacy==0.5.4',
    'nltk',
    'tqdm'
]

for package in required_packages:
    install_if_missing(package)

# Download NLTK resources
nltk_resources = ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger', 'brown']
for resource in nltk_resources:
    nltk.download(resource, download_dir=nltk_data_path, quiet=False)

# Ensure punkt_tab is present
punkt_dir = os.path.join(nltk_data_path, 'tokenizers', 'punkt')
punkt_tab_dir = os.path.join(nltk_data_path, 'tokenizers', 'punkt_tab')
if os.path.exists(punkt_dir) and not os.path.exists(punkt_tab_dir):
    shutil.copytree(punkt_dir, punkt_tab_dir)

# Install spaCy model
model_name = "en_core_sci_md"
model_version = "0.5.4"  # Make sure this matches your sciSpaCy version
model_url = f"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v{model_version}/{model_name}-{model_version}.tar.gz"
print(f"Installing {model_name} from {model_url}")
subprocess.check_call([sys.executable, "-m", "pip", "install", model_url])

# Print setup information
print("\nNLTK resources downloaded successfully.")
print("NLTK data directory contents:")
for root, dirs, files in os.walk(nltk_data_path):
    level = root.replace(nltk_data_path, '').count(os.sep)
    indent = ' ' * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print(f"{subindent}{f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/data/skanda/MSc_IRD_LLM...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/data/skanda/MSc_IRD_LLM...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/data/skanda/MSc_IRD_LLM...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /mnt/data/skanda/MSc_IRD_LLM...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     /mnt/data/skanda/MSc_IRD_LLM...
[nltk_data]   Package brown is already up-to-date!


Installing en_core_sci_md from https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz (119.1 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'

NLTK resources downloaded successfully.
NLTK data directory contents:
MSc_IRD_LLM/
    .gitignore
    pre_processing.ipynb
    .DS_Store
    pdf_to_txt.ipynb
    LICENSE
    list_of_papers.csv
    README.md
    results_facebook_bart/
        summaries.txt
        scores.txt
    tokenizers/
        punkt.zip
        punkt/
            polish.pickle
            norwegian.pickle
            french.pickle
            danish.pickle
            .DS_Store
            swedish.pickle
            german.pickle
            estonian.pickle
          

In [3]:
# Test NLTK
try:
    print("\nTesting NLTK installation:")
    print(nltk.word_tokenize("Testing NLTK installation."))
    print("NLTK is working correctly!")
except Exception as e:
    print(f"Error during tokenization: {e}")

# Print NLTK data path
print("\nNLTK data path:")
print(nltk.data.path)


Testing NLTK installation:
['Testing', 'NLTK', 'installation', '.']
NLTK is working correctly!

NLTK data path:
['/home/skanda/nltk_data', '/mnt/data/skanda/mambaforge/envs/gpu/nltk_data', '/mnt/data/skanda/mambaforge/envs/gpu/share/nltk_data', '/mnt/data/skanda/mambaforge/envs/gpu/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/mnt/data/skanda/MSc_IRD_LLM']


In [4]:
# !pip install pint

In [5]:
# Test spaCy
import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker


print(f"\nspaCy version: {spacy.__version__}")
print(f"sciSpaCy version: {scispacy.__version__}")
try:
    nlp = spacy.load(model_name)
    print(f"Successfully loaded {model_name}")

    # Add UMLS Entity Linker (for mapping to medical concepts)
    linker = UmlsEntityLinker()
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True})

except OSError as e:
    print(f"Error loading {model_name}: {e}")
    print("Please check if the installation was successful.")

print("\nSetup completed.")

  from .autonotebook import tqdm as notebook_tqdm
Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib



spaCy version: 3.7.5
sciSpaCy version: 0.5.4


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Successfully loaded en_core_sci_md


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



Setup completed.
