In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!apt-get update && apt-get install -y poppler-utils

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
            Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,801 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,750 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [4,587 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,067 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Pack

In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [4]:
import stanza
import numpy as np
import re
from collections import defaultdict
import uuid

In [5]:
stanza.download('ta', verbose=False)
stanza.download('en', verbose=False)
nlp_tamil = stanza.Pipeline('ta', processors='tokenize,pos', verbose=False)
nlp_english = stanza.Pipeline('en', processors='tokenize,pos,ner', verbose=False)

In [6]:
pdf_path = '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf'
text_output_path = 'extracted_text.txt'
!pdftotext "{pdf_path}" "{text_output_path}"

In [7]:
def apply_nlp(text, language='en'):
    nlp = nlp_english if language == 'en' else nlp_tamil
    doc = nlp(text)
    return [
        [{
            'text': token.text,
            'pos': token.pos,
            # Access 'ner' attribute only if it exists for the token
            'ner': token.ner if hasattr(token, 'ner') else 'O'
        } for token in sent.words]
        for sent in doc.sentences
    ]

In [8]:
def is_boilerplate_line(line):
    common_noise = [
    r'உடம்பதாட\.',
    r'https://pandianeducationaltrust\.com/-chenkaantal\.html\.',
    r'திருக்குறள்\.',
    r'ஓராண்டிற்கு',
    r'©\.'
]
    return any(re.search(p, line, re.IGNORECASE) for p in common_noise)

In [9]:
def pos_tag_with_stanza(text):
    doc = nlp(text)
    tagged = []
    for sent in doc.sentences:
        for word in sent.words:
            tagged.append((word.text, word.upos))
    return tagged

In [10]:
def remove_boilerplate(text):
    boilerplate_patterns = [
        r'Bi-Yearly Peer-Reviewed Tamil Jowrmal',
        r'Volume - \d+, Issue - \d+, [A-Za-z]+ \d{4}',
        r'E-ISSN: \d{4}-\d{4}',
        r'DOI: 10\.\d+/zenodo\.\d+',
        r'Received \d+ [A-Za-z]+ \d{4};.*Available online \d+ [A-Za-z]+ \d{4}\.',
        r'Author Contribution Statement:.*?\n',
        r'Author Acknowledgement:.*?\n',
        r'Author Declaration:.*?\n',
        r'\(6\) The content of the article is licensed under.*?\n',
        r'Be Eco-Friendly',
        r'Available at: http://nandianeducationaltrust\.com/chenkaantal\.html',
        r'ORCID: https://orcid\.org/\d{4}-\d{4}-\d{4}-\d{4}',
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text.strip()

In [11]:
def remove_non_tamil_content(text):
    # Keep Tamil (Unicode range: \u0B80-\u0BFF), English (ASCII), and basic punctuation
    pattern = r'[^\u0B80-\u0BFFa-zA-Z0-9\s.,;:"\'()-\[\]]'
    text = re.sub(pattern, '', text)
    return text

In [12]:
def remove_numbers_except_years(text):
    # Replace numbers not part of a 4-digit year (1900-2099)
    text = re.sub(r'\b(?!19\d{2}|20\d{2})\d+\b', '', text)
    return text

In [13]:
def process_nlp(text, lang='ta'):
    nlp = nlp_ta
    doc = nlp(text)
    sentences = []
    for sent in doc.sentences:
        tokens = []
        for token in sent.words:
            tokens.append({
                'text': token.text,
                'pos': token.pos,
                'ner': token.ner if hasattr(token, 'ner') else 'O'
            })
        sentences.append(tokens)
    return sentences

In [14]:
def rule_based_ner(sentences):
    literature_entities = ['Thirukkural', 'Tolkappiyam', 'Tholkappiya Porulathikaram', 'Akananooru']
    for sentence in sentences:
        for token in sentence:
            if token['text'] in literature_entities:
                token['ner'] = 'LITERATURE'
    return sentences

In [15]:
def entity_linking(sentences):
    knowledge_base = {
        'Thirukkural': 'A classic Tamil text by Thiruvalluvar',
        'Tolkappiyam': 'An ancient Tamil grammar and literature text',
        'Tholkappiya Porulathikaram': 'A section of Tolkappiyam on poetics',
        'Akananooru': 'A classical Tamil poetic work'
    }
    for sentence in sentences:
        for token in sentence:
            if token['ner'] == 'LITERATURE' and token['text'] in knowledge_base:
                token['entity_link'] = knowledge_base[token['text']]
    return sentences

In [16]:
def deduplicate_sentences(sentences):
    seen = set()
    deduped = []
    for sentence in sentences:
        sent_text = ' '.join([token['text'] for token in sentence])
        if sent_text not in seen:
            seen.add(sent_text)
            deduped.append(sentence)
    return deduped

In [17]:
def nlp_pipeline(text):
    text = remove_boilerplate(text)
    text = remove_non_tamil_content(text)
    text = remove_numbers_except_years(text)
    sentences = process_nlp(text, lang='ta')
    sentences = rule_based_ner(sentences)
    sentences = entity_linking(sentences)
    sentences = deduplicate_sentences(sentences)
    return sentences

In [18]:
def process_nlp(text, lang='ta'):
    # Use nlp_tamil for Tamil language processing
    nlp = nlp_tamil
    doc = nlp(text)
    sentences = []
    for sent in doc.sentences:
        tokens = []
        for token in sent.words:
            tokens.append({
                'text': token.text,
                'pos': token.pos,
                'ner': token.ner if hasattr(token, 'ner') else 'O'
            })
        sentences.append(tokens)
    return sentences

In [19]:
with open(text_output_path, 'r', encoding='utf-8') as f:
    pdf_text = f.read()

In [20]:
processed_sentences = nlp_pipeline(pdf_text)

In [21]:
for i, sentence in enumerate(processed_sentences):
    print(f"Sentence {i+1}:")
    for token in sentence:
        print(f"Token: {token['text']}, POS: {token['pos']}, NER: {token['ner']}, Entity Link: {token.get('entity_link', 'None')}")
    print()

Sentence 1:
Token: -, POS: PUNCT, NER: O, Entity Link: None
Token: -, POS: PUNCT, NER: O, Entity Link: None
Token: (, POS: PUNCT, NER: O, Entity Link: None
Token: -, POS: PUNCT, NER: O, Entity Link: None
Token: ), POS: PUNCT, NER: O, Entity Link: None
Token: kalkiyi, POS: PROPN, NER: O, Entity Link: None
Token: n2, POS: PROPN, NER: O, Entity Link: None
Token: ponniyin, POS: PROPN, NER: O, Entity Link: None
Token: celvan, POS: PROPN, NER: O, Entity Link: None
Token: part-1, POS: NUM, NER: O, Entity Link: None
Token: a, POS: PUNCT, NER: O, Entity Link: None
Token: putu, POS: PUNCT, NER: O, Entity Link: None
Token: veLLam, POS: PROPN, NER: O, Entity Link: None
Token: (, POS: PUNCT, NER: O, Entity Link: None
Token: chapters, POS: PROPN, NER: O, Entity Link: None
Token: -, POS: PUNCT, NER: O, Entity Link: None
Token: ), POS: PUNCT, NER: O, Entity Link: None
Token: in, POS: PUNCT, NER: O, Entity Link: None
Token: tamil, POS: PUNCT, NER: O, Entity Link: None
Token: script, POS: NOUN, NER: O, 

In [22]:
with open(text_output_path, 'r', encoding='utf-8') as f:
    pdf_text = f.read()

In [23]:
# Execute Pipeline
processed_data = nlp_pipeline(pdf_path)

In [24]:
# Display Results
for idx, sentence in enumerate(processed_data, 1):
    print(f"Sentence {idx}:")
    for token in sentence:
        print(f"  Token: {token['text']}, POS: {token['pos']}, NER: {token['ner']}, Entity Link: {token.get('entity_link', 'None')}")
    print()

Sentence 1:
  Token: /content/drive/MyDrive/Journals/Ponniyan, POS: PART, NER: O, Entity Link: None
  Token: Selvan, POS: PROPN, NER: O, Entity Link: None
  Token: Part1.pd, POS: PROPN, NER: O, Entity Link: None
  Token: f, POS: PUNCT, NER: O, Entity Link: None



In [25]:
with open('nlp_output.txt', 'w', encoding='utf-8') as output_file:
    for idx, sentence in enumerate(processed_data, 1):
        output_file.write(f"Sentence {idx}:\n")
        for token in sentence:
            output_file.write(f"  Token: {token['text']}, POS: {token['pos']}, NER: {token['ner']}, Entity Link: {token.get('entity_link', 'None')}\n")
        output_file.write("\n")

In [26]:
!pip install indic-nlp-library pdfplumber wikipedia-api --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m


In [27]:
!python -m indicnlp.resources.manager download_resources

/usr/bin/python3: Error while finding module specification for 'indicnlp.resources.manager' (ModuleNotFoundError: No module named 'indicnlp.resources')


In [28]:
import pdfplumber
import re, wikipediaapi
from indicnlp import loader
from indicnlp.tokenize import sentence_tokenize, indic_tokenize

In [29]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126 (from 1)[K
Receiving objects: 100% (139/139), 149.77 MiB | 24.19 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (28/28), done.


In [30]:
INDIC_NLP_LIB_HOME = '/usr/local/lib/python3.11/dist-packages/indicnlp'
INDIC_NLP_RESOURCES = '/content/indic_nlp_resources'

In [31]:
GAZETTEER_PERSONS = {'அருள்மொழி', 'வந்தியத்தேவன்', 'நந்தினி'}
GAZETTEER_PLACES = {'தஞ்சாவூர்', 'காஞ்சிபுரம்'}

In [32]:
BOILERPLATE_PATTERNS = [
    r'Project Madurai.*?\n', r'Etext.*?\n', r'www.*?\s', r'https?://\S+'
]

In [33]:
pdf_path = '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf'

In [34]:
def extract_text(pdf):
    text = ''
    with pdfplumber.open(pdf) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += '\n' + page_text
    return text

In [35]:
def remove_boilerplate(text):
    for pattern in BOILERPLATE_PATTERNS:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return text

In [36]:
def remove_non_tamil(text):
    return ' '.join([w for w in text.split() if re.search(r'[\u0B80-\u0BFF]', w)])

In [37]:
def remove_numbers_but_keep_years(text):
    return ' '.join([w for w in text.split() if not (w.isdigit() and not 1000 <= int(w) <= 2100)])

In [38]:
NOUN_SUFFIXES, VERB_SUFFIXES, PRONOUNS = ['க்கு', 'இல்'], ['கிறான்'], ['நான்', 'நீ']

In [39]:
def pos_tag(token):
    if token in PRONOUNS: return 'PRONOUN'
    if any(token.endswith(s) for s in NOUN_SUFFIXES): return 'NOUN'
    if any(token.endswith(s) for s in VERB_SUFFIXES): return 'VERB'
    return 'OTHER'

In [40]:
def ner_tag(token, next_token=None):
    if token in GAZETTEER_PERSONS: return 'PERSON'
    if token in GAZETTEER_PLACES: return 'LOCATION'
    if next_token in ['நகரம்', 'மாவட்டம்']: return 'POSSIBLE_LOCATION'
    return 'O'

In [41]:
wiki_ta = wikipediaapi.Wikipedia(user_agent='MyTamilNLPApp/1.0 (https://example.com/myappinfo)', language='ta')

In [42]:
def entity_link(entity):
    page = wiki_ta.page(entity)
    return page.fullurl if page.exists() else None

In [43]:
def deduplicate_sentences(sent_list):
    return list(dict.fromkeys(sent_list))

In [44]:
text = extract_text(pdf_path)
text = remove_boilerplate(text)
text = remove_non_tamil(text)
text = remove_numbers_but_keep_years(text)
sentences = sentence_tokenize.sentence_split(text, lang='ta')
sentences = deduplicate_sentences(sentences)

In [45]:
# Deduplication
def deduplicate_sentences(sentences):
    return list(dict.fromkeys(sentences))

In [46]:
for sent in sentences:
    tokens = indic_tokenize.trivial_tokenize(sent, lang='ta')
    print(f"\nSentence: {sent.strip()}")
    for idx, tok in enumerate(tokens):
        pos = pos_tag(tok)
        next_tok = tokens[idx+1] if idx+1 < len(tokens) else None
        ner = ner_tag(tok, next_tok)
        link = entity_link(tok) if ner in ['PERSON', 'LOCATION'] else None
        print(f"{tok:15} | POS: {pos:8} | NER: {ner:10} | Link: {link}")

In [47]:
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [48]:
!pip install pdfplumber pytesseract indic-nlp-library wikipedia-api --quiet

In [49]:
import pdfplumber
import pytesseract
import re, wikipediaapi
from PIL import Image
from indicnlp import loader
from indicnlp.tokenize import sentence_tokenize, indic_tokenize

In [51]:
INDIC_NLP_LIB_HOME = '/usr/local/lib/python3.11/dist-packages/indicnlp'
INDIC_NLP_RESOURCES = '/usr/local/lib/python3.11/dist-packages/indicnlp/resources'

In [52]:
GAZETTEER_PERSONS = {'அருள்மொழி', 'வந்தியத்தேவன்', 'நந்தினி'}
GAZETTEER_PLACES = {'தஞ்சாவூர்', 'காஞ்சிபுரம்'}

In [67]:
pdf_path = '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf'

In [54]:
def extract_text_with_ocr(pdf):
    text = ''
    with pdfplumber.open(pdf) as pdf:
        for page in pdf.pages:
            # Try text layer first
            page_text = page.extract_text()
            if page_text and len(page_text.strip()) > 20:
                text += '\n' + page_text
            else:
                print(f"Page {page.page_number} has no text — using OCR.")
                pil_image = page.to_image(resolution=300).original
                ocr_text = pytesseract.image_to_string(pil_image, lang='tam')
                text += '\n' + ocr_text
    return text

In [55]:
def remove_boilerplate(text):
    patterns = [r'Project Madurai.*?\n', r'Etext.*?\n', r'www.*?\s', r'https?://\S+']
    for p in patterns: text = re.sub(p, '', text, flags=re.IGNORECASE)
    return text

In [56]:
def remove_non_tamil(text):
    return ' '.join([w for w in text.split() if re.search(r'[\u0B80-\u0BFF]', w)])

In [57]:
def remove_numbers_but_keep_years(text):
    return ' '.join([w for w in text.split() if not (w.isdigit() and not 1000 <= int(w) <= 2100)])

In [58]:
NOUN_SUFFIXES, VERB_SUFFIXES, PRONOUNS = ['க்கு', 'இல்'], ['கிறான்'], ['நான்', 'நீ']

In [59]:
def pos_tag(token):
    if token in PRONOUNS: return 'PRONOUN'
    if any(token.endswith(s) for s in NOUN_SUFFIXES): return 'NOUN'
    if any(token.endswith(s) for s in VERB_SUFFIXES): return 'VERB'
    return 'OTHER'

In [60]:
def ner_tag(token, next_token=None):
    if token in GAZETTEER_PERSONS: return 'PERSON'
    if token in GAZETTEER_PLACES: return 'LOCATION'
    if next_token in ['நகரம்', 'மாவட்டம்']: return 'POSSIBLE_LOCATION'
    return 'O'

In [63]:
wiki_ta = wikipediaapi.Wikipedia(user_agent='MyTamilNLPApp/1.0 (https://example.com/myappinfo)', language='ta')

In [64]:
def entity_link(entity):
    page = wiki_ta.page(entity)
    return page.fullurl if page.exists() else None

In [65]:
def deduplicate_sentences(sents):
  return list(dict.fromkeys(sents))

In [68]:
text = extract_text_with_ocr(pdf_path)
text = remove_boilerplate(text)
text = remove_non_tamil(text)
text = remove_numbers_but_keep_years(text)
sentences = sentence_tokenize.sentence_split(text, lang='ta')
sentences = deduplicate_sentences(sentences)

In [69]:
for sent in sentences:
    tokens = indic_tokenize.trivial_tokenize(sent, lang='ta')
    print(f"\nSentence: {sent.strip()}")
    for idx, tok in enumerate(tokens):
        pos = pos_tag(tok)
        next_tok = tokens[idx+1] if idx+1 < len(tokens) else None
        ner = ner_tag(tok, next_tok)
        link = entity_link(tok) if ner in ['PERSON', 'LOCATION'] else None
        print(f"{tok:15} | POS: {pos:8} | NER: {ner:10} | Link: {link}")