In [1]:
%pip install tqdm

Keyring is skipped due to an exception: 'keyring.backends'
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:

%pip install huggingface_hub

Keyring is skipped due to an exception: 'keyring.backends'
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install indic-nlp-library

Keyring is skipped due to an exception: 'keyring.backends'
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os

# Do not clobber a token that is already exported in the environment; only fall
# back to an empty placeholder when HF_TOKEN is unset. Never paste a real token
# into the notebook itself.
os.environ.setdefault("HF_TOKEN", "")


In [6]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from tqdm import tqdm
import unicodedata
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

In [7]:
# Read the news-articles CSV straight from the Hugging Face Hub through the hf:// URL scheme.
df = pd.read_csv("hf://datasets/SuryaKrishna02/aya-telugu-news-articles/news_articles_dataset.csv")

In [8]:
# Keep the rows whose template_id is 1, drop rows with no "targets" text, and
# expose that text under the column name "sentence" used by the rest of the notebook.
is_template_one = df["template_id"] == 1
df_articles = (
    df.loc[is_template_one]
    .dropna(subset=["targets"])
    .rename(columns={"targets": "sentence"})
)
print("Total Telugu articles:", len(df_articles))
df_articles.head()

Total Telugu articles: 233545


Unnamed: 0,inputs,sentence,template_id,template_lang
2,ఇవ్వబడిన హెడ్లైన్ తో వార్తా కథనాన్ని రాయండి:\n...,మధ్యప్రదేశ్లో 108 కొత్త కేసులను గుర్తించిన తర్...,1,['tel']
3,ఇచ్చిన టైటిల్ తో న్యూస్ కథనాన్ని రాయండి:\nఢిల్...,దేశంలో తాజాగా భిన్నమైన వాతావరణ పరిస్థితులు కొన...,1,['tel']
5,ఇచ్చిన టైటిల్ తో వార్తా కథనాన్ని వ్రాయండి:\n8న...,తిరుచానూరు సమీపంలోని శిల్పారామంలో 8న హోలి సంబర...,1,['tel']
10,ఇచ్చిన టైటిల్ తో వార్తా కథనాన్ని వ్రాయండి:\nఆగ...,దేశవ్యాప్తంగా విమాన సేవలు అందించేందుకు మరో కొత...,1,['tel']
11,ఇవ్వబడిన హెడ్లైన్ తో న్యూస్ కథనాన్ని రాయండి:\n...,బిజెపి నాయకులను ముందస్తుగా అరెస్టు చేసి మహ్మదా...,1,['tel']


In [9]:
# Build the indic-nlp-library normalizer for the "te" language code once, at module
# level, so normalize_telugu can reuse the same instance for every row.
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("te")

def normalize_telugu(text):
    """Return `text` passed through the module-level `normalizer`.

    The input is coerced with str() first, so non-string values are
    normalized as their string representation.
    """
    as_string = str(text)
    normalized = normalizer.normalize(as_string)
    return normalized

In [10]:
def remove_urls_emails(text):
    """Strip HTML-like tags, URLs and e-mail addresses from `text`.

    Args:
        text: Any value; it is coerced with str() before processing.

    Returns:
        The string with every `<...>` tag, every `http...`/`www....` token and
        every `something@something` token removed (replaced by nothing).
    """
    text = str(text)
    # Tags must go first: a URL inside an attribute (e.g. <a href="http://x">)
    # would otherwise be consumed together with the closing '>' by the URL
    # pattern, leaving a dangling '<a href="' fragment that the tag pattern
    # can no longer match.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    return text

In [11]:
def clean_punctuation(text):
    """Normalize repeated sentence punctuation and en dashes in `text`.

    Any run of two or more characters from the set `!`, `?`, `.` becomes a
    single '.', and any run of en dashes becomes a single '-'. The input is
    coerced with str() first.
    """
    repeated_marks = re.compile(r'[!?.]{2,}')
    en_dash_runs = re.compile(r'–+')
    return en_dash_runs.sub('-', repeated_marks.sub('.', str(text)))

In [12]:
def remove_symbols(text):
    """Delete every character outside the allowed set from `text`.

    Allowed characters are the Telugu block (U+0C00-U+0C7F), ASCII letters and
    digits, whitespace, and the punctuation . , ! ? ; : ( ) ' " -
    Removed characters are dropped without a replacement. The input is coerced
    with str() first.
    """
    disallowed = re.compile(r'[^\u0C00-\u0C7FA-Za-z0-9\s.,!?;:()\'"-]')
    return disallowed.sub('', str(text))

In [13]:
def is_telugu_sentence_relaxed(text, min_ratio=0.4):
    """Tell whether `text` is predominantly written in Telugu script.

    Args:
        text: Any value; it is coerced with str() once, so the length and the
            Telugu-character count are always measured on the same string
            (non-string input no longer raises TypeError from len()).
        min_ratio: The share of characters in U+0C00-U+0C7F must be strictly
            greater than this value. Defaults to 0.4.

    Returns:
        True when (Telugu characters / all characters) > min_ratio; False
        otherwise, including for an empty string.
    """
    text = str(text)
    total = len(text)
    if total == 0:
        return False
    tel_chars = len(re.findall(r'[\u0C00-\u0C7F]', text))
    return (tel_chars / total) > min_ratio

In [14]:
def token_length(text):
    """Count the Telugu tokens in `text`.

    A token is a maximal run of characters in U+0C00-U+0C7F; everything else
    acts as a separator. The input is coerced with str() first.
    """
    telugu_runs = re.finditer(r'[\u0C00-\u0C7F]+', str(text))
    return sum(1 for _ in telugu_runs)

In [15]:
# Apply the cleaning stages to the "sentence" column in this fixed order:
# normalization, URL/e-mail/tag removal, punctuation cleanup, symbol removal.
for cleaning_step in (
    normalize_telugu,
    remove_urls_emails,
    clean_punctuation,
    remove_symbols,
):
    df_articles["sentence"] = df_articles["sentence"].apply(cleaning_step)

In [16]:
# Keep only the rows whose cleaned text passes the Telugu-character ratio check.
telugu_mask = df_articles["sentence"].apply(is_telugu_sentence_relaxed)
df_articles = df_articles[telugu_mask]

In [17]:
# De-duplicate on a whitespace-normalized key. Whitespace in "sentence" is only
# collapsed in a later cell, so comparing the raw text here would let rows that
# differ solely in spacing/newlines survive and end up as identical lines in the
# output files. The first occurrence of each key is kept; the "sentence" column
# itself is left unchanged at this stage.
dedup_key = df_articles["sentence"].str.replace(r'\s+', ' ', regex=True).str.strip()
# Rebind to an explicit copy instead of mutating a filtered slice in place.
df_articles = df_articles.loc[~dedup_key.duplicated()].copy()

In [18]:
# Record the Telugu token count per row, then keep rows with more than 2 and
# fewer than 200 tokens.
df_articles["length"] = df_articles["sentence"].apply(token_length)
has_enough_tokens = df_articles["length"] > 2
not_too_long = df_articles["length"] < 200
df_articles = df_articles[has_enough_tokens & not_too_long]

In [19]:
# Collapse every whitespace run (including newlines) to a single space and trim
# the ends, so each text occupies exactly one line when written out.
collapsed_whitespace = df_articles["sentence"].str.replace(r'\s+', ' ', regex=True)
df_articles["sentence"] = collapsed_whitespace.str.strip()

In [20]:
# Row count remaining after cleaning, Telugu-ratio filtering, de-duplication and length filtering.
print(len(df_articles))

214222


In [21]:
def tokenize_telugu(text):
    """Split `text` into Telugu tokens.

    Returns the list of maximal runs of characters in U+0C00-U+0C7F, in order
    of appearance; all other characters are treated as separators. The input
    is coerced with str() first.
    """
    telugu_word = re.compile(r'[\u0C00-\u0C7F]+')
    return telugu_word.findall(str(text))

In [22]:
def utterance_length(sentence):
    """Return how many Telugu tokens `tokenize_telugu` finds in `sentence`."""
    telugu_words = tokenize_telugu(sentence)
    return len(telugu_words)
def mean_word_length(sentence):
    """Return the average length of the Telugu tokens in `sentence`.

    Length is measured with len() on each token (i.e. in code points). When
    the sentence contains no Telugu token the result is 0.
    """
    word_lengths = [len(word) for word in tokenize_telugu(sentence)]
    if not word_lengths:
        return 0
    return np.mean(word_lengths)

In [23]:
# Corpus-wide frequency table: how often each Telugu token occurs across all
# cleaned texts.
word_freq = Counter()
sentence_iter = tqdm(df_articles["sentence"])
for article_text in sentence_iter:
    article_tokens = tokenize_telugu(article_text)
    word_freq.update(article_tokens)
unique_word_count = len(word_freq)
print("\nUnique words:", unique_word_count)
def average_freq(sentence, freq_dict):
    """Return the mean corpus frequency of the Telugu tokens in `sentence`.

    Args:
        sentence: Text to score; tokenized with `tokenize_telugu`.
        freq_dict: Mapping from token to its frequency. Tokens missing from
            the mapping are ignored.

    Returns:
        The mean of `freq_dict[token]` over the tokens present in `freq_dict`,
        or 0 when the sentence has no Telugu token or none of its tokens is in
        `freq_dict`.
    """
    known_counts = [freq_dict[w] for w in tokenize_telugu(sentence) if w in freq_dict]
    # Guard the filtered list, not just the token list: np.mean([]) would
    # return NaN (with a RuntimeWarning) when no token is known.
    if not known_counts:
        return 0
    return np.mean(known_counts)

100%|██████████| 214222/214222 [00:06<00:00, 33888.92it/s]


Unique words: 616554





In [24]:
def get_frames(sentence_list):
    """Count how often each opening frame occurs in `sentence_list`.

    A sentence's frame is its first three Telugu tokens joined by single
    spaces (or all of its tokens when it has fewer than three; the empty
    string when it has none). Returns a plain dict mapping frame -> count.
    """
    frame_counts = Counter(
        ' '.join(tokenize_telugu(text)[:3]) for text in sentence_list
    )
    return dict(frame_counts)

def frame_freq(sentence, frame_dict):
    """Look up how often the opening frame of `sentence` occurs.

    The frame is the first three Telugu tokens joined by single spaces (all
    tokens when there are fewer than three). Returns `frame_dict[frame]`, or 0
    when the frame is not in `frame_dict`.
    """
    leading_words = tokenize_telugu(sentence)[:3]
    frame_key = ' '.join(leading_words)
    return frame_dict.get(frame_key, 0)

In [25]:
# Frame counts over the whole cleaned corpus, then one record of difficulty
# features per text: frame frequency, token count, mean token length and mean
# corpus frequency of its tokens.
frame_dict = get_frames(df_articles["sentence"])

scores = [
    {
        "sentence": text,
        "frame_freq": frame_freq(text, frame_dict),
        "utterance_length": utterance_length(text),
        "mean_word_length": mean_word_length(text),
        "mean_word_freq": average_freq(text, word_freq),
    }
    for text in tqdm(df_articles["sentence"])
]

df_scores = pd.DataFrame(scores)
print(df_scores.shape)

100%|██████████| 214222/214222 [00:26<00:00, 8032.22it/s]


(214222, 5)


In [26]:
# Rank every feature so that a smaller rank number means "earlier in the
# curriculum": frequent frames and frequent words rank first (descending),
# short utterances and short words rank first (ascending).
rank_specs = (
    ("frame_rank", "frame_freq", False),
    ("utterance_rank", "utterance_length", True),
    ("mean_length_rank", "mean_word_length", True),
    ("mean_freq_rank", "mean_word_freq", False),
)
for rank_col, feature_col, rank_ascending in rank_specs:
    df_scores[rank_col] = df_scores[feature_col].rank(ascending=rank_ascending)

# The combined score is the plain sum of the four ranks, added in the order above.
combined_rank = df_scores[rank_specs[0][0]]
for rank_col, _, _ in rank_specs[1:]:
    combined_rank = combined_rank + df_scores[rank_col]
df_scores["final_rank"] = combined_rank

# Lowest combined rank first, with a fresh 0..n-1 index.
ordered_df = df_scores.sort_values(by="final_rank").reset_index(drop=True)

In [27]:
# Show the working directory; the output files in this notebook are written with
# relative paths, so this is where they land.
import os
print(os.getcwd())


/nfs/storage1/home/pulipakv


In [28]:
# Write the ordered texts as plain text, one per line. Series.to_csv is not used
# here because its default CSV quoting wraps any text containing a comma or a
# double quote in "..." (doubling inner quotes), which corrupts a plain-text
# corpus file.
with open("telugu_curriculum.txt", "w", encoding="utf-8") as curriculum_file:
    curriculum_file.writelines(f"{sentence}\n" for sentence in ordered_df["sentence"])
print("Saved 'telugu_curriculum.txt'")

Saved 'telugu_curriculum.txt'


In [29]:
# Re-read the saved curriculum file and count its non-blank lines and the
# Telugu tokens (runs of U+0C00-U+0C7F characters) they contain.
token_pattern = re.compile(r'[\u0C00-\u0C7F]+')
sentence_count = 0
token_count = 0

with open("telugu_curriculum.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped_line = raw_line.strip()
        if stripped_line:
            sentence_count += 1
            token_count += len(token_pattern.findall(stripped_line))

In [30]:
# Report the totals counted from telugu_curriculum.txt, with thousands separators.
print(f"Total Sentences : {sentence_count:,}")
print(f"Total Tokens : {token_count:,}")

Total Sentences : 214,222
Total Tokens : 17,015,797


In [31]:
# Preview the first five texts of ordered_df, i.e. those with the lowest final_rank
# (ordered_df is sorted ascending by final_rank).
print("Easy Sentences:")
print(ordered_df["sentence"].head(5))

Easy Sentences:
0    తెలంగాణ సీఎం కేసీఆర్ కు అరుదైన ఆహ్వానం అందింది...
1    దేశంలో కరోనా కేసులు 15 వేల కన్నా తక్కువే నమోదయ...
2    ఏపీ సీఎం జగన్ పశ్చిమ గోదావరి జిల్లా పర్యటన వాయ...
3    ఈ నెల 17న తెలంగాణ రాష్ట్ర బంద్కు బీసీ సంఘం నేత...
4    భారత్ లో కరోనా కేసులు పెరుగుతున్నాయి. దేశంలో క...
Name: sentence, dtype: object


In [32]:

# Write the curriculum-ordered texts to a second file, stopping as soon as the
# next text would push the running Telugu-token total past TARGET_TOKENS.
token_pattern = re.compile(r'[\u0C00-\u0C7F]+')
TARGET_TOKENS = 17_000_000   # 17M tokens
output_file = "telugu_17M_tokens.txt"
total_tokens = 0
saved_sentences = 0

with open(output_file, "w", encoding="utf-8") as out:
    for sentence in tqdm(ordered_df["sentence"], desc="Saving 17M tokens"):
        # Same Telugu-token definition as the rest of the notebook.
        n_tokens = len(token_pattern.findall(sentence))
        if n_tokens == 0:
            # Nothing countable in this text; skip it without writing.
            continue

        projected_total = total_tokens + n_tokens
        if projected_total > TARGET_TOKENS:
            # Budget would be exceeded: stop here so the output stays a
            # prefix of the ordered list.
            break

        out.write(f"{sentence.strip()}\n")
        total_tokens = projected_total
        saved_sentences += 1

Saving 17M tokens: 100%|█████████▉| 214128/214222 [00:04<00:00, 45128.57it/s]


In [33]:
# Summarize what was written to the token-capped output file.
print(f"Saved sentences : {saved_sentences:,}")
print(f"Saved tokens    : {total_tokens:,}")
print(f"Output file     : {output_file}")

Saved sentences : 214,128
Saved tokens    : 16,999,956
Output file     : telugu_17M_tokens.txt
