In [None]:
!pip install requests beautifulsoup4 lxml



In [None]:
import os, time, json, unicodedata
from collections import deque
from urllib.parse import urljoin, urlparse, urldefrag
import requests
from bs4 import BeautifulSoup

SEED_URLS = [
    "https://ta.wikisource.org/wiki/%E0%AE%AE%E0%AF%81%E0%AE%95%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AF%81",
    "https://www.tamilvu.org/",
    "https://www.academicjournals.org/",
    "https://www.tamilmanam.net"
]

OUT_FILE = "data_raw.jsonl"
HEADERS = {"User-Agent": "TamilCorpusCollector/0.2 (academic use; contact: your@email)"}

MAX_DEPTH = 2              # 0 = only seed pages
MAX_PAGES_TOTAL = 300      # hard stop
REQUEST_TIMEOUT = 15
DELAY_SECONDS = 0.5        # politeness delay between requests

ALLOWED_HOSTS = {urlparse(u).netloc for u in SEED_URLS}

def same_site(url: str) -> bool:
    host = urlparse(url).netloc
    return any(host.endswith(h) for h in ALLOWED_HOSTS)

def normalize(base: str, href: str) -> str:
    # join, drop URL fragment (#...), strip trailing spaces
    url = urljoin(base, href or "").strip()
    url, _frag = urldefrag(url)
    return url

def is_html(resp: requests.Response) -> bool:
    ctype = resp.headers.get("content-type", "").lower()
    return "text/html" in ctype

def looks_like_doc(url: str) -> bool:
    # Allow folders and common HTML-ish endings; exclude auth/cart/etc.
    u = url.lower()
    if any(s in u for s in ["login", "signup", "account", "cart", "pdfviewer"]):
        return False
    return u.endswith(("/", ".html", ".htm", ".php")) or ("?" in u)  # allow query pages

def clean_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for bad in soup(["script", "style", "noscript", "header", "footer", "nav", "form"]):
        bad.decompose()
    text = soup.get_text("\n", strip=True)
    return unicodedata.normalize("NFC", text)

def save_record(source_url: str, title: str, text: str):
    rec = {
        "title": (title or "").strip(),
        "author": "",
        "source": source_url,
        "text": text,
    }
    with open(OUT_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

def crawl():
    os.makedirs(".", exist_ok=True)
    seen = set()
    q = deque((u, 0) for u in SEED_URLS)
    pages = 0
    session = requests.Session()

    while q and pages < MAX_PAGES_TOTAL:
        url, depth = q.popleft()
        if url in seen or depth > MAX_DEPTH or not same_site(url):
            continue
        seen.add(url)

        try:
            r = session.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
        except Exception as e:
            print("skip:", url, "|", e)
            continue

        if not is_html(r):
            continue

        soup = BeautifulSoup(r.text, "lxml")
        title = soup.title.get_text(strip=True) if soup.title else ""

        if looks_like_doc(url):
            text = clean_text(r.text)
            if len(text) > 200:          # keep only substantive pages
                save_record(url, title, text)
                pages += 1
                print(f"[{pages}] saved:", title or url)

        # queue next links (same site only)
        if depth < MAX_DEPTH:
            for a in soup.find_all("a", href=True):
                nxt = normalize(url, a["href"])
                if nxt.startswith(("mailto:", "javascript:")):
                    continue
                if same_site(nxt) and nxt not in seen:
                    q.append((nxt, depth + 1))

        time.sleep(DELAY_SECONDS)

    print("Done. Pages saved:", pages, "| File:", OUT_FILE)

if __name__ == "__main__":
    crawl()


[1] saved: தமிழ் இணையக் கல்விக்கழகம் TAMIL VIRTUAL ACADEMY | தேமதுரத்  தமிழோசை உலகமெலாம் பரவும் வகை செய்தல் வேண்டும்
skip: https://www.academicjournals.org/ | 403 Client Error: Forbidden for url: https://www.academicjournals.org/
skip: https://www.tamilmanam.net | 404 Client Error: Not Found for url: https://www.tamilmanam.net/
skip: https://ta.wikisource.org/wiki/%E0%AE%89%E0%AE%A4%E0%AE%B5%E0%AE%BF:Introduction | 404 Client Error: Not Found for url: https://ta.wikisource.org/wiki/%E0%AE%89%E0%AE%A4%E0%AE%B5%E0%AE%BF:Introduction
[2] saved: முதற் பக்கம் என்பதற்கான மூலத்தைப் பார் - விக்கிமூலம்
[3] saved: முதற் பக்கம்: திருத்த வரலாறு - விக்கிமூலம்
[4] saved: விக்கிமூலம்
[5] saved: "முதற் பக்கம்" பக்கத்துக்கான தகவல் - விக்கிமூலம்
[6] saved: மேற்கோள் காட்டு - விக்கிமூலம்
[7] saved: பயனர் தடுக்கப்பட்டுள்ளார் - விக்கிமூலம்
[8] saved: பயனர் தடுக்கப்பட்டுள்ளார் - விக்கிமூலம்
[9] saved: முகப்பு - விக்கிமூலம்
skip: https://ta.wikisource.org/wiki/%E0%AE%94%E0%AE%B5%E0%AF%88%E0%AE%AF%E0%AE%BE%E0%

In [None]:
!pip install scrapy readability-lxml

Collecting scrapy
  Downloading scrapy-2.13.3-py3-none-any.whl.metadata (4.4 kB)
Collecting readability-lxml
  Downloading readability_lxml-0.8.4.1-py3-none-any.whl.metadata (4.0 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemadapter>=0.1.0 (from scrapy)
  Downloading itemadapter-0.12.1-py3-none-any.whl.metadata (22 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting protego>=0.1.15 (from scrapy)
  Downloading protego-0.5.0-py3-none-any.whl.metadata (6.4 kB)
Collecting pydispatcher>=2.0.5 (from scrapy)
  Downloading PyDispatcher-2.0.7-py3-none-any.whl.metadata (2.4 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloadi

In [None]:
import hashlib, unicodedata
from urllib.parse import urlparse
import scrapy
from readability import Document

SEED_URLS = [
    "https://ta.wikisource.org/wiki/%E0%AE%AE%E0%AF%81%E0%AE%95%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AF%81",
    "https://www.tamilvu.org/",
    "https://www.academicjournals.org/",
    "https://www.tamilmanam.net"
]
ALLOWED = [urlparse(u).netloc for u in SEED_URLS]

class TamilSpider(scrapy.Spider):
    name = "tamil"
    allowed_domains = ALLOWED
    start_urls = SEED_URLS
    custom_settings = {
        "ROBOTSTXT_OBEY": True,
        "DOWNLOAD_DELAY": 0.5,
        "CONCURRENT_REQUESTS": 8,
        "FEEDS": {
            "data_raw_scrapy/%(name)s-%(time)s.jsonl": {"format": "jsonlines", "encoding": "utf8"},
        },
        "USER_AGENT": "TamilCorpusCollector/0.1 (academic; contact@example.com)"
    }

    def parse(self, response):
        ctype = response.headers.get("Content-Type", b"").decode().lower()
        if "text/html" in ctype:
            try:
                doc = Document(response.text)
                title = doc.short_title()
                main_html = doc.summary()
            except Exception:
                title, main_html = (response.css("title::text").get() or "").strip(), response.text

            text = scrapy.Selector(text=main_html).xpath("string()").get().strip()
            text = unicodedata.normalize("NFC", text)

            if len(text) > 200:
                yield {
                    "title": title,
                    "author": "",
                    "source": response.url,
                    "text": text,
                }

            # follow same-site links
            for href in response.css("a::attr(href)").getall():
                url = response.urljoin(href)
                host = urlparse(url).netloc
                if any(host.endswith(ad) for ad in self.allowed_domains):
                    yield scrapy.Request(url, callback=self.parse)

In [None]:
!pip install readability-lxml



In [None]:
import os, re, json, glob, unicodedata, hashlib
from collections import Counter

IN_GLOB = "data_raw/**/*.jsonl"
OUT_SENT_TXT = "clean/cleaned_tamil_corpus.txt"
OUT_JSONL = "clean/cleaned_tamil_corpus.jsonl"
STATS_PATH = "clean/stats.json"
os.makedirs("clean", exist_ok=True)

TAMIL_BLOCK = (0x0B80, 0x0BFF)

def keep_tamil_punct(s: str) -> str:
    out = []
    for ch in s:
        cp = ord(ch)
        if (TAMIL_BLOCK[0] <= cp <= TAMIL_BLOCK[1]) or ch.isspace() or ch in ".!?;:,()\"'–—-[]{}…":
            out.append(ch)
    s = "".join(out)
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n\s*\n+", "\n\n", s)
    return s.strip()

def rm_headers_footers(s):
    patterns = [
        r"Etext.*?$", r"https?://\S+", r"www\.\S+",
        r"^Page\s*\d+$", r"^\s*\d+\s*$", r"Vol[-– ]\s*\w+"
    ]
    for p in patterns:
        s = re.sub(p, "", s, flags=re.MULTILINE|re.IGNORECASE)
    return re.sub(r"\n\s*\n+", "\n\n", s).strip()

def sentence_split_ta(text: str):
    # basic splitter on Tamil/Latin sentence punctuation
    parts = re.split(r"[\.!\?]|[|।]", text)
    return [p.strip() for p in parts if p.strip()]

def is_mostly_tamil(s: str, thresh=0.5):
    if not s: return False
    total = sum(ch.isalpha() for ch in s)
    ta = sum(0x0B80 <= ord(ch) <= 0x0BFF for ch in s)
    return total == 0 or (ta / max(1,total)) >= thresh

def dedup_sentences(sents):
    seen = set(); out = []
    for s in sents:
        key = re.sub(r"\s+", " ", s.lower()).strip()
        if key and key not in seen:
            seen.add(key); out.append(s)
    return out

def process_one(rec):
    text = unicodedata.normalize("NFC", rec["text"])
    text = rm_headers_footers(text)
    text = keep_tamil_punct(text)
    sents = sentence_split_ta(text)
    sents = [s for s in sents if is_mostly_tamil(s, 0.4)]
    sents = dedup_sentences(sents)
    return sents

def main():
    all_sents = []
    files = glob.glob(IN_GLOB, recursive=True)
    for fp in files:
        with open(fp, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                rec = json.loads(line)
                sents = process_one(rec)
                for s in sents:
                    all_sents.append({
                        "title": rec.get("title",""),
                        "author": rec.get("author",""),
                        "source": rec.get("source",""),
                        "text": s
                    })

    # global dedup by sentence text
    by_text = {}
    for r in all_sents:
        key = re.sub(r"\s+", " ", r["text"].lower())
        by_text.setdefault(key, r)

    out_list = list(by_text.values())

    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for r in out_list:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    with open(OUT_SENT_TXT, "w", encoding="utf-8") as f:
        for r in out_list:
            f.write(r["text"] + "\n")

    stats = {
        "files_read": len(files),
        "sentences_total": len(all_sents),
        "sentences_unique": len(out_list),
    }
    with open(STATS_PATH, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

    print("Saved:", OUT_JSONL, OUT_SENT_TXT)
    print("Stats:", stats)

if __name__ == "__main__":
    main()

Saved: clean/cleaned_tamil_corpus.jsonl clean/cleaned_tamil_corpus.txt
Stats: {'files_read': 0, 'sentences_total': 0, 'sentences_unique': 0}


In [None]:
!apt-get update && apt-get install -y poppler-utils

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 5,484 B/129 kB 4%] [Connecti                                                                               Hit:2 https://cli.github.com/packages stable InRelease
0% [Waiting for headers] [1 InRelease 129 kB/129 kB 100%] [Connected to cloud.r                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
                                                                               Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:7 https://r2u.stat.illinois.edu/ubuntu jam

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
import stanza
import numpy as np
import re
from collections import defaultdict
import uuid

In [None]:
stanza.download('ta', verbose=False)
stanza.download('en', verbose=False)
nlp_tamil = stanza.Pipeline('ta', processors='tokenize,pos', verbose=False)
nlp_english = stanza.Pipeline('en', processors='tokenize,pos,ner', verbose=False)

In [None]:
pdf_path = '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf'
text_output_path = 'extracted_text.txt'
!pdftotext "{pdf_path}" "{text_output_path}"

I/O Error: Couldn't open file '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf': No such file or directory.


In [None]:
def apply_nlp(text, language='en'):
    nlp = nlp_english if language == 'en' else nlp_tamil
    doc = nlp(text)
    return [
        [
            {
                'text': token.text,
                'pos': token.pos,
                # Access 'ner' only if it exists
                'ner': token.ner if hasattr(token, 'ner') else 'O'
            }
            for token in sent.words
        ]
        for sent in doc.sentences
    ]

sample_text = "Murugan is worshipped in Tamil Nadu."
result = apply_nlp(sample_text, language='en')

# Show output
for sent in result:
    for token in sent:
        print(token)


{'text': 'Murugan', 'pos': 'PROPN', 'ner': 'O'}
{'text': 'is', 'pos': 'AUX', 'ner': 'O'}
{'text': 'worshipped', 'pos': 'VERB', 'ner': 'O'}
{'text': 'in', 'pos': 'ADP', 'ner': 'O'}
{'text': 'Tamil', 'pos': 'ADJ', 'ner': 'O'}
{'text': 'Nadu', 'pos': 'PROPN', 'ner': 'O'}
{'text': '.', 'pos': 'PUNCT', 'ner': 'O'}


In [None]:
import re

def is_boilerplate_line(line: str):
    common_noise = [
        r'உடம்பதாட\.',
        r'https://pandianeducationaltrust\.com/-chenkaantal\.html\.',
        r'திருக்குறள்\.',
        r'ஓராண்டிற்கு',
        r'©\.'
    ]
    return any(re.search(p, line, re.IGNORECASE) for p in common_noise)
lines = [
    "இது ஒரு சோதனை வரி.",
    "திருக்குறள்.",
    "https://pandianeducationaltrust.com/-chenkaantal.html."
]

for l in lines:
    print(l, "->", is_boilerplate_line(l))

இது ஒரு சோதனை வரி. -> False
திருக்குறள். -> True
https://pandianeducationaltrust.com/-chenkaantal.html. -> True


In [None]:
import re

def is_boilerplate_line(line):
    common_noise = [
        r'உடம்பதாட\.',
        r'https://pandianeducationaltrust\.com/-chenkaantal\.html\.',
        r'திருக்குறள்\.',
        r'ஓராண்டிற்கு',
        r'©\.'
    ]
    return any(re.search(p, line, re.IGNORECASE) for p in common_noise)


def clean_text_lines(lines):
    return [line for line in lines if not is_boilerplate_line(line.strip())]


def clean_text_block(text):
    lines = text.splitlines()
    cleaned_lines = clean_text_lines(lines)
    return "\n".join(cleaned_lines)
raw_text = """
இது ஒரு சோதனை வரி.
திருக்குறள்.
இது மற்றொரு வரி.
https://pandianeducationaltrust.com/-chenkaantal.html.
©. Copyright line
இது இறுதி வரி.
"""

cleaned = clean_text_block(raw_text)
print(cleaned)


இது ஒரு சோதனை வரி.
இது மற்றொரு வரி.
இது இறுதி வரி.


In [None]:
import stanza
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,pos')

def pos_tag_with_stanza(text):
    doc = nlp(text)
    tagged = []
    for sent in doc.sentences:
        for word in sent.words:
            tagged.append((word.text, word.upos))
    return tagged

sample_text = "Murugan is worshipped in Tamil Nadu."
tags = pos_tag_with_stanza(sample_text)

for token, pos in tags:
    print(f"{token:12} -> {pos}")

Murugan      -> PROPN
is           -> AUX
worshipped   -> VERB
in           -> ADP
Tamil        -> ADJ
Nadu         -> PROPN
.            -> PUNCT


In [None]:
import re

BOILERPLATE_PATTERNS = [
    r'Bi-Yearly Peer-Reviewed Tamil Journal',
    r'Volume\s*-\s*\d+,\s*Issue\s*-\s*\d+,\s*[A-Za-z]+\s+\d{4}',
    r'E-ISSN:\s*\d{4}-\d{4}',
    r'DOI:\s*10\.\d+/zenodo\.\d+',
    r'Received\s+\d+\s+[A-Za-z]+\s+\d{4};.*?Available online\s+\d+\s+[A-Za-z]+\s+\d{4}\.',
    r'Author Contribution Statement:.*?(?=\n\n|\Z)',
    r'Author Acknowledgement:.*?(?=\n\n|\Z)',
    r'Author Declaration:.*?(?=\n\n|\Z)',
    r'\(6\)\s*The content of the article is licensed under.*?(?=\n\n|\Z)',
    r'Be Eco-Friendly',
    r'Available at:\s*https?://\S+',
    r'ORCID:\s*https://orcid\.org/\d{4}-\d{4}-\d{4}-\d{4}',
]
_COMPILED = [re.compile(p, re.IGNORECASE | re.MULTILINE | re.DOTALL) for p in BOILERPLATE_PATTERNS]

def remove_boilerplate(text):
    if not isinstance(text, str):
        return ""
    for rx in _COMPILED:
        text = rx.sub("", text)
    # collapse multiple blank lines
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    return text.strip()

sample = """
Bi-Yearly Peer-Reviewed Tamil Journal
Volume - 12, Issue - 2, March 2023
E-ISSN: 1234-5678
DOI: 10.1234/zenodo.56789

Author Contribution Statement: The author did everything.

Main article content starts here.
"""
cleaned = remove_boilerplate(sample)
print("Preview:\n", cleaned)

in_path  = "/content/drive/MyDrive/tamil_data.txt"
out_path = "/content/drive/MyDrive/tamil_data_cleaned.txt"

with open(in_path, "r", encoding="utf-8", errors="ignore") as f:
    raw = f.read()

cleaned = remove_boilerplate(raw)

with open(out_path, "w", encoding="utf-8") as f:
    f.write(cleaned)

print(f"Saved cleaned text to: {out_path}")
print("Preview:\n", cleaned[:800])

Preview:
 Main article content starts here.
Saved cleaned text to: /content/drive/MyDrive/tamil_data_cleaned.txt
Preview:
 ஒலிம்பிக் போட்டிகள் நடந்த இடங்கள் 1. 1896 - ஏதென்ஸ், கிரீஸ் 2. 1900 - பாரிஸ், பிரான்ஸ் 3. 1904 - செயின் லூயிஸ், அமெரிக்கா 4. 1908 - லண்டன்,பிரிட்டன் 5. 1912 - ஸ்டோக்ஹோம், சுவீடன் 6. 1920 - ஆண்ட்வெர்ப், பெல்ஜியம் 7. 1924 - பாரிஸ், பிரான்ஸ் 8. 1928 - ஆம்ஸ்டர்டாம், ஹாலந்து 9. 1932 - லாஸ், ஏஞ்சல்ஸ் 10. 1936 - பெர்லின், ஜெர்மனி 11. 1948 - லண்டன், இங்கிலாந்து 12. 1952 - ஹல்சின்கி, பின்லாந்து 13. 1956 - மேபோர்ன்,ஆஸ்திரேலியா 14. 1960 - ரோம், இத்தாலி 15. 1964 - டோக்கியோ, ஜப்பான் 16. 1968 - மெக்சிகோ, மெக்ஸிக்கோ 17. 1972 - மியூனிக், ஜெர்மனி 18. 1976 - மான்ட்ரியல், கனடா 19. 1980 - மாஸ்கோ, USSR 20. 1984 - லாஸ் ஏஞ்சல்ஸ், அமெரிக்கா 21. 1988 - சியோல், தென் கொரியா 22. 1992 - பார்சிலோனா, ஸ்பெயின் 23. 1996 - அட்லாண்டா, அமெரிக்கா 24. 2000 - சிட்னி, ஆஸ்திரேலியா 25. 2004 - ஏதென்ஸ், கிரீஸ் 26. 2008 - பீஜிங், சீன


In [None]:
import re

def remove_non_tamil_content(text):
    # Allow Tamil (\u0B80-\u0BFF), English letters, digits, whitespace, and punctuation
    pattern = r'[^\u0B80-\u0BFFa-zA-Z0-9\s.,;:"\'()\-\[\]\n!?%/]'
    cleaned = re.sub(pattern, '', text)

    # Collapse multiple spaces
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)

    return cleaned.strip()
print(remove_non_tamil_content(raw_text))

இது ஒரு சோதனை வரி.
திருக்குறள்.
இது மற்றொரு வரி.
https://pandianeducationaltrust.com/-chenkaantal.html.
. Copyright line
இது இறுதி வரி.


In [None]:
import re

def remove_numbers_except_years(text):
    # Remove any number that is not a 4-digit year (1900–2099)
    text = re.sub(r'\b(?!19\d{2}\b|20\d{2}\b)\d+\b', '', text)

    # Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
sample = """
1995 இல், ஏதோ நடந்தது.

தரவுத்தொகுப்பில் 123 மாதிரிகள் உள்ளன.

அடுத்த மதிப்பாய்வு 2024 இல்.

எண் 56 நீக்கப்பட வேண்டும்.
"""

print(remove_numbers_except_years(sample))

1995 இல், ஏதோ நடந்தது. தரவுத்தொகுப்பில் மாதிரிகள் உள்ளன. அடுத்த மதிப்பாய்வு 2024 இல். எண் நீக்கப்பட வேண்டும்.


In [None]:
import stanza
stanza.download('ta')
nlp_ta = stanza.Pipeline('ta', processors='tokenize,pos')

def process_nlp(text, lang='ta'):
    nlp = nlp_ta
    doc = nlp(text)
    sentences = []
    for sent in doc.sentences:
        tokens = []
        for w in sent.words:
            tokens.append({
                'text': w.text,
                'pos': getattr(w, 'upos', getattr(w, 'xpos', None)),
                'ner': 'O'
            })
        sentences.append(tokens)
    return sentences

text = "முருகன் தமிழ் நாட்டில் வழிபடப்படுகிறார். Murugan is worshipped in Tamil Nadu."
result = process_nlp(text)

for i, sent in enumerate(result, 1):
    print(f"\nSentence {i}")
    for tok in sent:
        print(f"{tok['text']:15}  POS={tok['pos']:<6}  NER={tok['ner']}")



Sentence 1
முருகன்          POS=PROPN   NER=O
தமிழ்            POS=PROPN   NER=O
நாட்டில்         POS=NOUN    NER=O
வழிபடப்படுகிறார்  POS=VERB    NER=O
.                POS=PUNCT   NER=O

Sentence 2
Murugan          POS=PROPN   NER=O
is               POS=PROPN   NER=O
worshipped       POS=PROPN   NER=O
in               POS=PROPN   NER=O
Tamil            POS=PROPN   NER=O
Nadu             POS=NOUN    NER=O
.                POS=PUNCT   NER=O


In [None]:
import re

def rule_based_ner(sentences):
    # Known literature entities + some Tamil-script variants
    literature_entities = {
        "LITERATURE": [
            r"\bThirukkural\b", r"திருக்குறள்",
            r"\bTolkappiyam\b", r"\bTholkappiya Porulathikaram\b",
            r"\bAkananooru\b", r"அகநானூறு"
        ]
    }

    for sentence in sentences:
        for token in sentence:
            for label, patterns in literature_entities.items():
                for pat in patterns:
                    if re.search(pat, token['text'], flags=re.IGNORECASE):
                        token['ner'] = label
    return sentences
sentences = [
    [
        {"text": "Thirukkural", "pos": "NOUN", "ner": "O"},
        {"text": "is", "pos": "AUX", "ner": "O"},
        {"text": "famous", "pos": "ADJ", "ner": "O"}
    ],
    [
        {"text": "அகநானூறு", "pos": "NOUN", "ner": "O"},
        {"text": "poem", "pos": "NOUN", "ner": "O"}
    ]
]

result = rule_based_ner(sentences)
for sent in result:
    print(sent)

[{'text': 'Thirukkural', 'pos': 'NOUN', 'ner': 'LITERATURE'}, {'text': 'is', 'pos': 'AUX', 'ner': 'O'}, {'text': 'famous', 'pos': 'ADJ', 'ner': 'O'}]
[{'text': 'அகநானூறு', 'pos': 'NOUN', 'ner': 'LITERATURE'}, {'text': 'poem', 'pos': 'NOUN', 'ner': 'O'}]


In [None]:
def entity_linking(sentences):
    # Knowledge base: canonical name -> description
    knowledge_base = {
        'Thirukkural': 'A classic Tamil text by Thiruvalluvar',
        'Tolkappiyam': 'An ancient Tamil grammar and literature text',
        'Tholkappiya Porulathikaram': 'A section of Tolkappiyam on poetics',
        'Akananooru': 'A classical Tamil poetic work'
    }

    # Aliases mapping to canonical names
    aliases = {
        'திருக்குறள்': 'Thirukkural',
        'tolkappiyam': 'Tolkappiyam',
        'tholkappiya porulathikaram': 'Tholkappiya Porulathikaram',
        'akananooru': 'Akananooru',
        'அகநானூறு': 'Akananooru'
    }

    for sentence in sentences:
        for token in sentence:
            if token.get('ner') == 'LITERATURE':
                text_norm = token['text'].strip()
                # Try direct match
                if text_norm in knowledge_base:
                    token['entity_link'] = knowledge_base[text_norm]
                # Try alias match (case-insensitive)
                elif text_norm.lower() in aliases:
                    canonical = aliases[text_norm.lower()]
                    token['entity_link'] = knowledge_base[canonical]
                else:
                    token['entity_link'] = "Unknown literature reference"
    return sentences
sentences = [
    [
        {"text": "திருக்குறள்", "pos": "NOUN", "ner": "LITERATURE"},
        {"text": "is", "pos": "AUX", "ner": "O"}
    ],
    [
        {"text": "Akananooru", "pos": "NOUN", "ner": "LITERATURE"}
    ]
]

linked = entity_linking(sentences)
for sent in linked:
    for tok in sent:
        print(tok)


{'text': 'திருக்குறள்', 'pos': 'NOUN', 'ner': 'LITERATURE', 'entity_link': 'A classic Tamil text by Thiruvalluvar'}
{'text': 'is', 'pos': 'AUX', 'ner': 'O'}
{'text': 'Akananooru', 'pos': 'NOUN', 'ner': 'LITERATURE', 'entity_link': 'A classical Tamil poetic work'}


In [None]:
def deduplicate_sentences(sentences, case_insensitive=True, return_report=False):
    seen = set()
    deduped = []
    removed = 0

    for sentence in sentences:
        sent_text = ' '.join([tok['text'] for tok in sentence]).strip()
        if case_insensitive:
            sent_text = sent_text.lower()
        sent_text = ' '.join(sent_text.split())  # normalize whitespace

        if sent_text not in seen:
            seen.add(sent_text)
            deduped.append(sentence)
        else:
            removed += 1

    if return_report:
        return deduped, {"kept": len(deduped), "removed": removed, "total": len(sentences)}
    return deduped
sentences = [
    [{"text": "Murugan", "pos": "PROPN", "ner": "O"}],
    [{"text": "Murugan", "pos": "PROPN", "ner": "O"}],
    [{"text": "murugan", "pos": "PROPN", "ner": "O"}],
    [{"text": "Tamil", "pos": "PROPN", "ner": "O"}]
]

deduped, stats = deduplicate_sentences(sentences, return_report=True)
print("Stats:", stats)
for s in deduped:
    print([t['text'] for t in s])

Stats: {'kept': 2, 'removed': 2, 'total': 4}
['Murugan']
['Tamil']


In [None]:
def nlp_pipeline(text):
    text = remove_boilerplate(text)
    text = remove_non_tamil_content(text)
    text = remove_numbers_except_years(text)
    sentences = process_nlp(text, lang='ta')
    sentences = rule_based_ner(sentences)
    sentences = entity_linking(sentences)
    sentences = deduplicate_sentences(sentences)
    return sentences
sample = "முருகன் தமிழ் நாட்டில் வழிபடப்படுகிறார். Thirukkural is a classic text."
result = nlp_pipeline(sample)
print(result)

[[{'text': 'முருகன்', 'pos': 'PROPN', 'ner': 'O'}, {'text': 'தமிழ்', 'pos': 'PROPN', 'ner': 'O'}, {'text': 'நாட்டில்', 'pos': 'NOUN', 'ner': 'O'}, {'text': 'வழிபடப்படுகிறார்', 'pos': 'VERB', 'ner': 'O'}, {'text': '.', 'pos': 'PUNCT', 'ner': 'O'}], [{'text': 'Thirukkural', 'pos': 'PROPN', 'ner': 'LITERATURE', 'entity_link': 'A classic Tamil text by Thiruvalluvar'}, {'text': 'is', 'pos': 'PROPN', 'ner': 'O'}, {'text': 'a', 'pos': 'NOUN', 'ner': 'O'}, {'text': 'classic', 'pos': 'NOUN', 'ner': 'O'}, {'text': 'text', 'pos': 'NOUN', 'ner': 'O'}, {'text': '.', 'pos': 'PUNCT', 'ner': 'O'}]]


In [None]:
import stanza

# Initialize Tamil pipeline (POS only, since NER is not available)
stanza.download('ta')
nlp_tamil = stanza.Pipeline('ta', processors='tokenize,pos')

def process_nlp(text, lang='ta'):
    doc = nlp_tamil(text)
    sentences = []
    for sent in doc.sentences:
        tokens = []
        for w in sent.words:
            tokens.append({
                'text': w.text,
                'pos': w.upos,
                'ner': 'O'
            })
        sentences.append(tokens)
    return sentences
text = "முருகன் தமிழ் நாட்டில் வழிபடப்படுகிறார்."
result = process_nlp(text)

for sent in result:
    for tok in sent:
        print(f"{tok['text']:15} POS={tok['pos']} NER={tok['ner']}")

முருகன்         POS=PROPN NER=O
தமிழ்           POS=PROPN NER=O
நாட்டில்        POS=NOUN NER=O
வழிபடப்படுகிறார் POS=VERB NER=O
.               POS=PUNCT NER=O


In [None]:
with open(out_path, 'r', encoding='utf-8') as f:
    pdf_text = f.read()

In [None]:
import stanza
import re

# 1. Init Tamil Stanza pipeline (POS only)
stanza.download('ta')
nlp_tamil = stanza.Pipeline('ta', processors='tokenize,pos')

# 2. Define your helpers (short versions here)
def remove_boilerplate(text):
    return re.sub(r'Bi-Yearly Peer-Reviewed Tamil Journal', '', text)

def remove_non_tamil_content(text):
    return re.sub(r'[^\u0B80-\u0BFFa-zA-Z0-9\s.,;:"\'()\-\[\]\n!?%/]', '', text)

def remove_numbers_except_years(text):
    return re.sub(r'\b(?!19\d{2}\b|20\d{2}\b)\d+\b', '', text)

def process_nlp(text, lang='ta'):
    doc = nlp_tamil(text)
    sentences = []
    for sent in doc.sentences:
        tokens = []
        for w in sent.words:
            tokens.append({
                'text': w.text,
                'pos': w.upos,
                'ner': 'O'   # default since Tamil NER is missing
            })
        sentences.append(tokens)
    return sentences

def rule_based_ner(sentences):
    for sentence in sentences:
        for token in sentence:
            if token['text'] in ['Thirukkural','Tolkappiyam','Akananooru']:
                token['ner'] = 'LITERATURE'
    return sentences

def entity_linking(sentences):
    kb = {'Thirukkural':'Classic text by Thiruvalluvar'}
    for sentence in sentences:
        for token in sentence:
            if token['ner']=='LITERATURE' and token['text'] in kb:
                token['entity_link'] = kb[token['text']]
    return sentences

def deduplicate_sentences(sentences):
    seen, deduped = set(), []
    for s in sentences:
        st = ' '.join([t['text'] for t in s])
        if st not in seen:
            seen.add(st)
            deduped.append(s)
    return deduped

# 3. The orchestrator
def nlp_pipeline(text):
    text = remove_boilerplate(text)
    text = remove_non_tamil_content(text)
    text = remove_numbers_except_years(text)
    sentences = process_nlp(text, lang='ta')
    sentences = rule_based_ner(sentences)
    sentences = entity_linking(sentences)
    sentences = deduplicate_sentences(sentences)
    return sentences

In [None]:
pdf_text = "முருகன் தமிழ் நாட்டில் வழிபடப்படுகிறார். Thirukkural is a classic text."
processed_sentences = nlp_pipeline(pdf_text)
for i, sent in enumerate(processed_sentences, 1):
    print(f"\nSentence {i}")
    for tok in sent:
        print(tok)


Sentence 1
{'text': 'முருகன்', 'pos': 'PROPN', 'ner': 'O'}
{'text': 'தமிழ்', 'pos': 'PROPN', 'ner': 'O'}
{'text': 'நாட்டில்', 'pos': 'NOUN', 'ner': 'O'}
{'text': 'வழிபடப்படுகிறார்', 'pos': 'VERB', 'ner': 'O'}
{'text': '.', 'pos': 'PUNCT', 'ner': 'O'}

Sentence 2
{'text': 'Thirukkural', 'pos': 'PROPN', 'ner': 'LITERATURE', 'entity_link': 'Classic text by Thiruvalluvar'}
{'text': 'is', 'pos': 'PROPN', 'ner': 'O'}
{'text': 'a', 'pos': 'NOUN', 'ner': 'O'}
{'text': 'classic', 'pos': 'NOUN', 'ner': 'O'}
{'text': 'text', 'pos': 'NOUN', 'ner': 'O'}
{'text': '.', 'pos': 'PUNCT', 'ner': 'O'}


In [None]:
for i, sentence in enumerate(processed_sentences):
    print(f"Sentence {i+1}:")
    for token in sentence:
        print(f"Token: {token['text']}, POS: {token['pos']}, NER: {token['ner']}, Entity Link: {token.get('entity_link', 'None')}")
    print()

Sentence 1:
Token: முருகன், POS: PROPN, NER: O, Entity Link: None
Token: தமிழ், POS: PROPN, NER: O, Entity Link: None
Token: நாட்டில், POS: NOUN, NER: O, Entity Link: None
Token: வழிபடப்படுகிறார், POS: VERB, NER: O, Entity Link: None
Token: ., POS: PUNCT, NER: O, Entity Link: None

Sentence 2:
Token: Thirukkural, POS: PROPN, NER: LITERATURE, Entity Link: Classic text by Thiruvalluvar
Token: is, POS: PROPN, NER: O, Entity Link: None
Token: a, POS: NOUN, NER: O, Entity Link: None
Token: classic, POS: NOUN, NER: O, Entity Link: None
Token: text, POS: NOUN, NER: O, Entity Link: None
Token: ., POS: PUNCT, NER: O, Entity Link: None



In [None]:
with open(out_path, 'r', encoding='utf-8') as f:
    pdf_text = f.read()

In [None]:
# Display Results
for idx, sentence in enumerate(processed_sentences, 1):
    print(f"Sentence {idx}:")
    for token in sentence:
        print(f"  Token: {token['text']}, POS: {token['pos']}, NER: {token['ner']}, Entity Link: {token.get('entity_link', 'None')}")
    print()
processed_data = nlp_pipeline(pdf_text)

Sentence 1:
  Token: முருகன், POS: PROPN, NER: O, Entity Link: None
  Token: தமிழ், POS: PROPN, NER: O, Entity Link: None
  Token: நாட்டில், POS: NOUN, NER: O, Entity Link: None
  Token: வழிபடப்படுகிறார், POS: VERB, NER: O, Entity Link: None
  Token: ., POS: PUNCT, NER: O, Entity Link: None

Sentence 2:
  Token: Thirukkural, POS: PROPN, NER: LITERATURE, Entity Link: Classic text by Thiruvalluvar
  Token: is, POS: PROPN, NER: O, Entity Link: None
  Token: a, POS: NOUN, NER: O, Entity Link: None
  Token: classic, POS: NOUN, NER: O, Entity Link: None
  Token: text, POS: NOUN, NER: O, Entity Link: None
  Token: ., POS: PUNCT, NER: O, Entity Link: None



In [None]:
with open('nlp_output.txt', 'w', encoding='utf-8') as output_file:
    for idx, sentence in enumerate(processed_data, 1):
        output_file.write(f"Sentence {idx}:\n")
        for token in sentence:
            output_file.write(f"  Token: {token['text']}, POS: {token['pos']}, NER: {token['ner']}, Entity Link: {token.get('entity_link', 'None')}\n")
        output_file.write("\n")

In [None]:
!pip install indic-nlp-library pdfplumber wikipedia-api --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m41.0/42.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m862.1 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m79.9 MB/s[0m

In [None]:
!python -m indicnlp.resources.manager download_resources

/usr/bin/python3: Error while finding module specification for 'indicnlp.resources.manager' (ModuleNotFoundError: No module named 'indicnlp.resources')


In [None]:
import pdfplumber
import re, wikipediaapi
from indicnlp import loader
from indicnlp.tokenize import sentence_tokenize, indic_tokenize

In [None]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126 (from 1)[K
Receiving objects: 100% (139/139), 149.77 MiB | 22.01 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (28/28), done.


In [None]:
INDIC_NLP_LIB_HOME = '/usr/local/lib/python3.11/dist-packages/indicnlp'
INDIC_NLP_RESOURCES = '/content/indic_nlp_resources'

In [None]:
PRONOUNS = {"நான்","நீ","அவர்","அவர்கள்","இது","அது"}
NOUN_SUFFIXES = ["ஐ","க்கு","இல்","இன்","உடைய","ஆல்","இருந்து","கள்"]
VERB_SUFFIXES = ["த்தான்","த்தாள்","த்","கிறார்","கிறார்கள்","கிறேன்","வது"]

In [None]:
GAZETTEER_PERSONS = {'அருள்மொழி', 'வந்தியத்தேவன்', 'நந்தினி'}
GAZETTEER_PLACES = {'தஞ்சாவூர்', 'காஞ்சிபுரம்'}
GAZETTEER_PERSONS = {"திரு","திருமதி","மதி"}
GAZETTEER_PLACES  = {"சென்னை","மதுரை","காஞ்சிபுரம்","கோவை","காஞ்சி"}

In [None]:
import re

BOILERPLATE_PATTERNS = [
    r"Project Madurai.*?$",
    r"Etext.*?$",
    r"www\.\S+",
    r"https?://\S+",
    r"Vol[-– ]\s*\w+",
    r"^\s*\d+\s*$",
    r"Page\s*\d+",
]

def remove_boilerplate(text):
    for pattern in BOILERPLATE_PATTERNS:
        text = re.sub(pattern, "", text, flags=re.MULTILINE | re.IGNORECASE)
    # collapse multiple blank lines
    text = re.sub(r"\n\s*\n+", "\n\n", text)
    return text.strip()
sample = """
Project Madurai Digital Library
Etext prepared by XYZ
www.example.com
Vol- II
12

திருக்குறள்
Page 34
"""

print(remove_boilerplate(sample))

திருக்குறள்


In [None]:
pdf_path = '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf'

In [None]:
import pdfplumber

def extract_text(pdf_path, as_list=False):
    texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                # strip trailing spaces, normalize newlines
                cleaned = page_text.strip()
                texts.append(cleaned)

    if as_list:
        return texts
    return "\n".join(texts)
pdf_text = extract_text("/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf")
print(pdf_text[:1000])   # show first 1000 characters

pages = extract_text("/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf", as_list=True)
print("Number of pages extracted:", len(pages))
print("First page preview:\n", pages[0][:500])

«`ˆ(cid:247) ‚ł‚¢´¢(cid:253)
ƒ(cid:192)¡(cid:253)(cid:201)¢´¢(cid:253) ƒ”ł¯(cid:253)
(cid:192)¡‚ı - 1« - (cid:210)— ƒ¯ß˙ı
(«(cid:242)(cid:190)¢´¡´(cid:237)‚ß 1- 30)
kalkiyin2
ponniyin celvan
part-1a putu veLLam (chapters 1- 30)
in tamil script, TSCII format
Etext in Tamil Script - TSCII format (v. 1.7)
Etext preparation: Mr. Bhaskaran Sankaran and colleagues of Anna University - KBC
Research Center, MIT - Chrompet Campus, Chennai, India.
Proof-reading: Mr. S. Anbumani, Mr. N.D. Logasundaram, Mr. Narayanan Govindarajan,
Ms. Pavithra Srinivasan, Mr. Ramachandran Mahadevan, Ms. Sathya, Mr. Sreeram
Krishnamoorthy, Dr. Sridhar Rathinam, Mrs. Srilatha Rajagopal, Mr. Vinoth Jagannathan
Etext prep. in html/web version: Mr. S. Anbumani, Blacksburg, Virginia, USA
Etext prep in pdf format: Dr. K. Kalyanasundaram, Lausanne, Switzerland
This pdf file is based on TSCInaimathi font embedded in the file. Hence this file can be
viewed and printed on all computer platforms: Windows, Macintosh and Unix
w

In [None]:
import re

BOILERPLATE_PATTERNS = [
    r"Project Madurai.*?$",
    r"Etext.*?$",
    r"www\.\S+",
    r"https?://\S+",
    r"Vol[-– ]\s*\w+",
    r"^\s*\d+\s*$"
]

def remove_boilerplate(text):
    for pattern in BOILERPLATE_PATTERNS:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)
    # collapse multiple blank lines
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    return text.strip()
input_path = "/content/drive/MyDrive/tamil_data.txt"
output_path = "/content/drive/MyDrive/tamil_data_cleaned.txt"

with open(input_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = remove_boilerplate(raw_text)

with open(output_path, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned file saved at: {output_path}")
print("Preview:\n", cleaned_text[:500])  # first 500 chars


Cleaned file saved at: /content/drive/MyDrive/tamil_data_cleaned.txt
Preview:
 ஒலிம்பிக் போட்டிகள் நடந்த இடங்கள் 1. 1896 - ஏதென்ஸ், கிரீஸ் 2. 1900 - பாரிஸ், பிரான்ஸ் 3. 1904 - செயின் லூயிஸ், அமெரிக்கா 4. 1908 - லண்டன்,பிரிட்டன் 5. 1912 - ஸ்டோக்ஹோம், சுவீடன் 6. 1920 - ஆண்ட்வெர்ப், பெல்ஜியம் 7. 1924 - பாரிஸ், பிரான்ஸ் 8. 1928 - ஆம்ஸ்டர்டாம், ஹாலந்து 9. 1932 - லாஸ், ஏஞ்சல்ஸ் 10. 1936 - பெர்லின், ஜெர்மனி 11. 1948 - லண்டன், இங்கிலாந்து 12. 1952 - ஹல்சின்கி, பின்லாந்து 13. 1956 - மேபோர்ன்,ஆஸ்திரேலியா 14. 1960 - ரோம், இத்தாலி 15. 1964 - டோக்கியோ, ஜப்பான் 16. 1968 - மெக்சிகோ, மெக்


In [None]:
import re

def remove_non_tamil(text):
    return re.sub(r"[^ \t\n\r.,;:!?()\-\—–\"'“”‘’0-9\u0B80-\u0BFF]", "", text)

In [None]:
def remove_numbers_but_keep_years(text):
    return ' '.join([w for w in text.split() if not (w.isdigit() and not 1000 <= int(w) <= 2100)])

In [None]:
NOUN_SUFFIXES, VERB_SUFFIXES, PRONOUNS = ['க்கு', 'இல்'], ['கிறான்'], ['நான்', 'நீ']

In [None]:
import re

# Example lexicons (you can expand)
PRONOUNS = {"நான்", "நீ", "அவர்", "அவள்", "அது", "நாம்"}
VERB_SUFFIXES = ["கிறான்", "கிறாள்", "கிறேன்", "கிறோம்", "த்தான்", "வான்"]
NOUN_SUFFIXES = ["ம்", "த்தில்", "ங்கள்", "னால்"]

def pos_tag(token):
    token = token.strip()
    token = re.sub(r'[^\u0B80-\u0BFF]', '', token)   # keep only Tamil chars

    if token in PRONOUNS:
        return 'PRONOUN'
    if any(token.endswith(suf) for suf in VERB_SUFFIXES):
        return 'VERB'
    if any(token.endswith(suf) for suf in NOUN_SUFFIXES):
        return 'NOUN'
    return 'OTHER'

words = ["நான்", "நூலகத்தில்", "படிக்கிறான்", "தமிழ்"]
for w in words:
    print(w, "->", pos_tag(w))

நான் -> PRONOUN
நூலகத்தில் -> NOUN
படிக்கிறான் -> VERB
தமிழ் -> OTHER


In [None]:
import re

# Example gazetteers (expand as needed)
GAZETTEER_PERSONS = {"திருவள்ளுவர்", "கம்பர்", "இளங்கோ"}
GAZETTEER_PLACES  = {"சென்னை", "மதுரை", "திருச்சி", "காஞ்சிபுரம்"}

def normalize_token(token):
    return re.sub(r'[^\u0B80-\u0BFFA-Za-z]', '', token)

def ner_tag(token: str, next_token: str = None) -> str:
    token_norm = normalize_token(token)
    next_norm  = normalize_token(next_token) if next_token else None

    if token_norm in GAZETTEER_PERSONS:
        return "PERSON"
    if token_norm in GAZETTEER_PLACES:
        return "LOCATION"
    if next_norm in {"நகரம்", "மாவட்டம்"}:
        return "POSSIBLE_LOCATION"
    return "O"
tokens = ["திருவள்ளுவர்", "சென்னை", "நகரம்", "தமிழ்"]
for i, tok in enumerate(tokens):
    nxt = tokens[i+1] if i+1 < len(tokens) else None
    print(tok, "->", ner_tag(tok, nxt))


திருவள்ளுவர் -> PERSON
சென்னை -> LOCATION
நகரம் -> O
தமிழ் -> O


In [None]:
wiki_ta = wikipediaapi.Wikipedia(user_agent='MyTamilNLPApp/1.0 (https://example.com/myappinfo)', language='ta')

In [None]:
import wikipediaapi

wiki_ta = wikipediaapi.Wikipedia(
    user_agent="TamilNLPBot/1.0 (https://github.com/yourname; contact@example.com)",
    language="ta"
)

page = wiki_ta.page("திருக்குறள்")
print("Page Title:", page.title)
print("Summary:", page.summary[:500])  # first 500 chars

Page Title: திருக்குறள்
Summary: திருக்குறள் (Tirukkural), சுருக்கமாக குறள் (Kural), ஒரு தொன்மையான தமிழ் மொழி அற இலக்கியமாகும். சங்க இலக்கிய வகைப்பாட்டில் பதினெண்கீழ்க்கணக்கு எனப்படும் பதினெட்டு நூல்களின் திரட்டில் இருக்கும் இந்நூல் குறள் வெண்பா என்னும் பாவகையினாலான 1,330 ஈரடிச் செய்யுள்களைக் கொண்டது. இந்நூல் முறையே அறம், பொருள், இன்பம் ஆகிய மூன்று தொகுப்புகளைக் கொண்டது. இது அடிப்படையில் ஒரு வாழ்வியல் நூல் ஆகும். மாந்தர்கள் தம் அகவாழ்விலும் புற வாழ்விலும் நலமுடன் வாழ்வதற்குத் தேவையான அடிப்படைப் பண்புகளை விளக்குகிறது. இந்நூல் அற


In [None]:
def entity_link(entity):
    entity = entity.strip()
    page = wiki_ta.page(entity)

    if page.exists():
        return {
            "title": page.title,
            "url": page.fullurl,
            "summary": page.summary[:200] + "..." if page.summary else ""
        }
    return None
print(entity_link("திருவள்ளுவர்"))
print(entity_link("மதுரை"))

{'title': 'திருவள்ளுவர்', 'url': 'https://ta.wikipedia.org/wiki/%E0%AE%A4%E0%AE%BF%E0%AE%B0%E0%AF%81%E0%AE%B5%E0%AE%B3%E0%AF%8D%E0%AE%B3%E0%AF%81%E0%AE%B5%E0%AE%B0%E0%AF%8D', 'summary': 'திருவள்ளுவர் (ஆங்கிலம்: Thiruvalluvar) (சுருக்கமாக வள்ளுவர்), பழந்தமிழ் இலக்கியமான திருக்குறளை இயற்றிய தமிழ்ப்புலவர் ஆவார். கடைச்சங்ககால புலவரான இவர் பொ.ஊ.மு 400க்கும் பொ.ஊ. 100க்கும் இடைப்பட்ட காலத்த...'}
{'title': 'மதுரை', 'url': 'https://ta.wikipedia.org/wiki/%E0%AE%AE%E0%AE%A4%E0%AF%81%E0%AE%B0%E0%AF%88', 'summary': 'மதுரை (Madurai) இந்தியாவின், தமிழ்நாடு மாநிலத்தில் அமைந்துள்ள ஒரு தொன்மையான நகரம் ஆகும். இது மதுரை மாவட்டத்தின் தலைநகர் ஆகும். தமிழ்நாட்டின் தூங்கா நகரம் என அழைக்கப்படுகின்றது தமிழ்நாட்டில் உள்ள பெருந...'}


In [None]:
def remove_numbers_but_keep_years(text):
    # Remove 1-2 digit numbers and 5+ digit numbers
    text = re.sub(r"(?<!\d)\d{1,2}(?!\d)", "", text)
    text = re.sub(r"(?<!\d)\d{5,}(?!\d)", "", text)
    return text

In [None]:
raw_text = extract_text(pdf_path)
print("Raw length:", len(raw_text))

text = remove_boilerplate(raw_text)
print("After boilerplate:", len(text))

text = remove_non_tamil(text)
print("After non-Tamil:", len(text))

Raw length: 933396
After boilerplate: 932776
After non-Tamil: 545182


In [None]:
import unicodedata, re
from indicnlp.tokenize import sentence_tokenize

In [None]:
def normalize_text(text):
    # Normalize Unicode (important for Tamil)
    text = unicodedata.normalize("NFC", text)
    # Remove invisible chars
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    # Convert non-breaking space to normal space
    text = text.replace("\u00a0", " ")
    return text.strip()

In [None]:
sentences = [s.strip() for s in text.split("।") if s.strip()]

In [None]:
def deduplicate_sentences(sent_list):
    return list(dict.fromkeys(sent_list))

In [None]:
text = extract_text(pdf_path)
text = remove_boilerplate(text)
text = normalize_text(text)
text = remove_non_tamil(text)
text = remove_numbers_but_keep_years(text)

sentences = sentence_tokenize.sentence_split(text, lang='ta')
sentences = deduplicate_sentences(sentences)


In [None]:
# Deduplication
def deduplicate_sentences(sentences):
    return list(dict.fromkeys(sentences))

In [None]:
#Sentence Tokenization
for sent in sentences:
    tokens = indic_tokenize.trivial_tokenize(sent, lang='ta')
    print(f"\nSentence: {sent.strip()}")
    for idx, tok in enumerate(tokens):
        pos = pos_tag(tok)
        next_tok = tokens[idx+1] if idx+1 < len(tokens) else None
        ner = ner_tag(tok, next_tok)
        link = entity_link(tok) if ner in ['PERSON', 'LOCATION'] else None
        print(f"{tok:15} | POS: {pos:8} | NER: {ner:10} | Link: {link}")
        print(sentences)

In [None]:
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
!pip install pdfplumber pytesseract indic-nlp-library wikipedia-api --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m


In [None]:
import pdfplumber
import pytesseract
import re, wikipediaapi
from PIL import Image
from indicnlp import loader
from indicnlp.tokenize import sentence_tokenize, indic_tokenize

In [None]:
INDIC_NLP_LIB_HOME = '/usr/local/lib/python3.11/dist-packages/indicnlp'
INDIC_NLP_RESOURCES = '/usr/local/lib/python3.11/dist-packages/indicnlp/resources'

In [None]:
GAZETTEER_PERSONS = {'அருள்மொழி', 'வந்தியத்தேவன்', 'நந்தினி'}
GAZETTEER_PLACES = {'தஞ்சாவூர்', 'காஞ்சிபுரம்'}

In [None]:
pdf_path = '/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf'

In [None]:
def extract_text_with_ocr(pdf_path='/content/drive/MyDrive/Journals/Ponniyan Selvan Part1.pdf',ocr_lang= "tam+eng",dpi= 300,min_text_len= 20,verbose = True):
    collected = []

    with pdfplumber.open(pdf_path) as pdf_obj:
        for page in pdf_obj.pages:
            # 1) Try text layer (tweak tolerances to capture more text)
            page_text = page.extract_text(x_tolerance=1.5, y_tolerance=1.5)
            if page_text and len(page_text.strip()) >= min_text_len:
                collected.append(page_text.strip())
                continue

            # 2) OCR fallback
            if verbose:
                print(f"Page {page.page_number}: no reliable text layer — using OCR")
            # Render to image for OCR
            page_image = page.to_image(resolution=dpi).original.convert("RGB")
            # Try to deskew/autorotate
            page_image = _deskew_if_needed(page_image, verbose=verbose)

            # Tesseract config: OEM 3 (LSTM), PSM 4 (block of text) is good for pages
            config = "--oem 3 --psm 4"
            try:
                ocr_text = pytesseract.image_to_string(page_image, lang=ocr_lang, config=config)
            except pytesseract.TesseractError as e:
                if verbose:
                    print(f"  ! Tesseract error on page {page.page_number}: {e}")
                ocr_text = ""

            collected.append(ocr_text.strip())

    return _normalize_text("\n\n".join([t for t in collected if t]))
result = nlp_pipeline(extract_text_with_ocr(pdf_path))

In [None]:
def remove_boilerplate(text: str) -> str:
    lines = [ln for ln in text.splitlines() if ln.strip() and not is_boilerplate_line(ln)]
    text = "\n".join(lines)
    return re.sub(r'\n\s*\n+', '\n\n', text).strip()
print(remove_boilerplate)

<function remove_boilerplate at 0x7f0fce7bfec0>


In [None]:
def is_boilerplate_line(line):
    s = line.strip()
    if not s:
        return True

    for p in BOILERPLATE_PATTERNS:
        if re.search(p, s, flags=re.IGNORECASE):
            return True
    return False
BOILERPLATE_PATTERNS = [
    r"Project Madurai.*$",
    r"Etext.*$",
    r"^\s*\d+\s*$",
]

print(is_boilerplate_line)

<function is_boilerplate_line at 0x7f0ec79e4fe0>


In [None]:
def remove_non_tamil_chars(text):
    out = []
    for ch in text:
        cp = ord(ch)
        if (TAMIL_BLOCK[0] <= cp <= TAMIL_BLOCK[1]) \
           or ch.isspace() \
           or ch in ".,?!;:–—-()[]{}\"'…":
            out.append(ch)
    cleaned = "".join(out)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\n\s*\n+", "\n\n", cleaned)
    return cleaned.strip()
print(remove_non_tamil_chars)

<function remove_non_tamil_chars at 0x7f0ec783f560>


In [None]:
def remove_numbers_but_keep_years(text):
    def keep_token(token):
        # Strip punctuation around token
        core = re.sub(r'[^\d]', '', token)
        if core.isdigit():
            year = int(core)
            return 1000 <= year <= 2100
        return True

    tokens = text.split()
    kept = [w for w in tokens if keep_token(w)]
    return ' '.join(kept)
print(remove_numbers_but_keep_years)

<function remove_numbers_but_keep_years at 0x7f0fce7e00e0>


In [None]:
NOUN_SUFFIXES, VERB_SUFFIXES, PRONOUNS = ['க்கு', 'இல்'], ['கிறான்'], ['நான்', 'நீ']

In [None]:
TAMIL_BLOCK = (0x0B80, 0x0BFF)
def remove_non_tamil_content(text):
    out = []
    for ch in text:
        cp = ord(ch)
        if (TAMIL_BLOCK[0] <= cp <= TAMIL_BLOCK[1]) \
           or ch.isspace() \
           or ch in ".,?!;:–—-()[]{}\"'…":
            out.append(ch)
    cleaned = "".join(out)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\n\s*\n+", "\n\n", cleaned)
    return cleaned.strip()
print(remove_non_tamil_content)

<function remove_non_tamil_content at 0x7f0ec783ef20>


In [None]:
def pos_tag(token):
    token = re.sub(r'[^\u0B80-\u0BFF]', '', token.strip())
    if not token:
        return 'OTHER'

    if token in PRONOUNS:
        return 'PRONOUN'
    if any(token.endswith(suf) for suf in VERB_SUFFIXES):
        return 'VERB'
    if any(token.endswith(suf) for suf in NOUN_SUFFIXES):
        return 'NOUN'
    return 'OTHER'
words = ["நான்", "நூலகத்தில்", "படிக்கிறான்", "தமிழ்", "அவர்கள்"]
for w in words:
    print(w, "->", pos_tag(w))

நான் -> PRONOUN
நூலகத்தில் -> OTHER
படிக்கிறான் -> VERB
தமிழ் -> OTHER
அவர்கள் -> OTHER


In [None]:
def normalize_token(token):
    # Keep only Tamil (\u0B80-\u0BFF), English letters (A-Za-z), and digits (0-9)
    return re.sub(r'[^\u0B80-\u0BFFA-Za-z0-9]', '', token)

def ner_tag(token: str, next_token: str = None) -> str:
    t = normalize_token(token)
    n = normalize_token(next_token) if next_token else None

    if t in GAZETTEER_PERSONS:
        return "PERSON"
    if t in GAZETTEER_PLACES:
        return "LOCATION"
    if n in {"நகரம்", "மாவட்டம்"}:
        return "POSSIBLE_LOCATION"
    return "O"
tokens = ["திருவள்ளுவர்", "சென்னை", "நகரம்", "தமிழ்"]
for i, tok in enumerate(tokens):
    nxt = tokens[i+1] if i+1 < len(tokens) else None
    print(tok, "->", ner_tag(tok, nxt))

திருவள்ளுவர் -> O
சென்னை -> POSSIBLE_LOCATION
நகரம் -> O
தமிழ் -> O


In [None]:
wiki_ta = wikipediaapi.Wikipedia(user_agent='MyTamilNLPApp/1.0 (https://example.com/myappinfo)', language='ta')

In [None]:
def entity_link(entity,fallback_to_en = True):
    entity = entity.strip()
    if not entity:
        return None

    try:
        page = wiki_ta.page(entity)
        if page.exists():
            return {
                "title": page.title,
                "url": page.fullurl,
                "summary": page.summary[:200] + "..." if page.summary else ""
            }

        if fallback_to_en:
            page = wiki_en.page(entity)
            if page.exists():
                return {
                    "title": page.title,
                    "url": page.fullurl,
                    "summary": page.summary[:200] + "..." if page.summary else ""
                }
    except Exception as e:
        return {"error": str(e)}

    return None
print(entity_link)

<function entity_link at 0x7f0fce7e07c0>


In [None]:
def deduplicate_sentences(sents):
  return list(dict.fromkeys(sents))

In [None]:
def nlp_pipeline(text):
    text = remove_boilerplate(text)
    text = remove_non_tamil_content(text)
    text = remove_numbers_but_keep_years(text)
    sentences = sentence_tokenize.sentence_split(text, lang='ta')
    sentences = deduplicate_sentences(sentences)
    return sentences

In [None]:
text = extract_text_with_ocr(pdf_path)
text = remove_boilerplate(text)
text = remove_non_tamil(text)
text = remove_numbers_but_keep_years(text)
sentences = sentence_tokenize.sentence_split(text, lang='ta')
sentences = deduplicate_sentences(sentences)


[]


In [None]:
for sent in sentences:
    tokens = indic_tokenize.trivial_tokenize(sent, lang='ta')
    print(f"\nSentence: {sent.strip()}")
    for idx, tok in enumerate(tokens):
        pos = pos_tag(tok)
        next_tok = tokens[idx+1] if idx+1 < len(tokens) else None
        ner = ner_tag(tok, next_tok)
        link = entity_link(tok) if ner in ['PERSON', 'LOCATION'] else None
        print(f"{tok:15} | POS: {pos:8} | NER: {ner:10} | Link: {link}")

In [None]:
!pip install langdetect --quiet

In [None]:
!pip install indic-nlp-library --quiet

In [None]:
from indicnlp.tokenize.sentence_tokenize import sentence_split
from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
#from indicnlp.morph.analyzer import Analyzer
from indicnlp import common
import unicodedata
import re
from langdetect import detect
import json

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
import stanza
stanza.download('ta')
nlp = stanza.Pipeline('ta', processors='tokenize,pos,lemma', use_gpu=False)

def lemmatize_tamil(text):
    doc = nlp(text)
    lemmas = []
    for sentence in doc.sentences:
        for word in sentence.words:
            lemmas.append(word.lemma)
    return lemmas

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ta/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |
| lemma     | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


In [None]:
text = "அவன் சிறந்த பாடல்கள் எழுதுகிறான்"
print(lemmatize_tamil(text))

['அவன்', 'சிறந்த', 'பாடு', 'எழுது']


In [None]:
INDICNLP_RESOURCES = "/path/to/indicnlp/resources"
common.set_resources_path(INDICNLP_RESOURCES)
#analyzer = Analyzer("ta", INDICNLP_RESOURCES + "/morph/ta/")

In [None]:
# Tamil range regex
tamil_range = re.compile(r'[\u0B80-\u0BFF]+')

In [None]:
# Tamil stopwords (example subset)
stopwords_ta = set(['ஒரு', 'என்று', 'இந்த', 'மற்றும்', 'இது'])

In [None]:
def clean_and_process(text):
    text = unicodedata.normalize('NFKC', text)
    if not tamil_range.search(text):
      return None
    try:
      if detect(text) != 'ta':
        return None
    except:
      return None
    sentences = sentence_split(text, lang='ta')
    cleaned_sentences = []

    for sent in sentences:
        # Word tokenize
        words = trivial_tokenize(sent)
        words = [w for w in words if w not in stopwords_ta]

        lemmatized = []
        for word in words:
            # Without Analyzer, just keep the word
            lemmatized.append(word)

        if lemmatized:
            cleaned_sentences.append(" ".join(lemmatized))

    return cleaned_sentences

In [None]:
input_text = "அவன் ஒரு சிறந்த வீரன். அவன் போரில் வெற்றி பெற்றான்."

In [None]:
output = clean_and_process(input_text)

In [None]:
print(output)

['அவன் சிறந்த வீரன் .', 'அவன் போரில் வெற்றி பெற்றான் .']
