In [11]:
from datasets import load_dataset

# Directly stream the Hindi-Devanagari split
hindi_stream = load_dataset(
    "ai4bharat/IndicCorpV2",
    "indiccorp_v2",
    streaming=True,
    split="hin_Deva"
)



In [12]:
pip install datasets



In [13]:
def custom_tokenizer_hindi_with_matras(text):
    # Handle special items
    url_pattern = r'https?://[^\s]+|www\.[^\s]+'
    email_pattern = r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b'

    urls = re.findall(url_pattern, text)
    text = re.sub(url_pattern, '<URL>', text)

    emails = re.findall(email_pattern, text)
    text = re.sub(email_pattern, '<EMAIL>', text)

    tokens = []
    for word in text.split():
        if word == '<URL>':
            tokens.append(urls.pop(0))
        elif word == '<EMAIL>':
            tokens.append(emails.pop(0))
        else:
            # Tokenize Devanagari characters while separating matras
            word_tokens = re.findall(rf'[\u0915-\u0939][\u093e-\u094c\u0902\u0903]?|\d+|[^\s\w]', word)
            tokens.extend(word_tokens)

    return tokens


In [27]:
from itertools import islice

samples = list(islice(hindi_stream, 10))  # change 5 to any number


In [28]:
text = samples[0]['text']


In [29]:
import re

def hindi_tokenizer(text):
    url_pattern = r'https?://[^\s]+|www\.[^\s]+'
    email_pattern = r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b'

    # Save and replace URLs and emails
    urls = re.findall(url_pattern, text)
    text = re.sub(url_pattern, '<URL>', text)

    emails = re.findall(email_pattern, text)
    text = re.sub(email_pattern, '<EMAIL>', text)

    tokens = []
    for token in text.split():
        if token == '<URL>':
            tokens.append(urls.pop(0))
        elif token == '<EMAIL>':
            tokens.append(emails.pop(0))
        else:
            split_tokens = re.findall(
                r'[\u0900-\u097F]+|[a-zA-Z0-9]+|[।.,!?;:()\"\'\-]|[^\s]',
                token
            )
            tokens.extend(split_tokens)

    return tokens

def hindi_sentence_tokenizer(text):
    sentence_end_pattern = r'(?<=[।!?\.])\s+'
    sentences = re.split(sentence_end_pattern, text.strip())
    return [s.strip() for s in sentences if s.strip()]

def detokenize(tokens):
    # Rebuild sentence with proper spacing logic
    sentence = ''
    for i, token in enumerate(tokens):
        if i > 0 and not re.match(r'[।.,!?;:)\]\'\"]', token):
            sentence += ' '
        sentence += token
    return sentence.strip()

def hindi_corpus_statistics(text):
    sentences = hindi_sentence_tokenizer(text)
    all_tokens = []
    reconstructed_sentences = []

    for sentence in sentences:
        tokens = hindi_tokenizer(sentence)
        all_tokens.extend(tokens)

        # For checking reformation
        reconstructed = detokenize(tokens)
        reconstructed_sentences.append(reconstructed)

    num_tokens = len(all_tokens)
    unique_tokens = set(all_tokens)
    total_chars = sum(len(token) for token in all_tokens)

    word_tokens = [t for t in all_tokens if re.match(r'^[\u0900-\u097F\w]+$', t)]
    avg_word_length = sum(len(t) for t in word_tokens) / len(word_tokens) if word_tokens else 0
    type_token_ratio = len(unique_tokens) / num_tokens if num_tokens else 0

    return {
        'sentences': sentences,
        'tokens': all_tokens,
        'num_sentences':len(sentences),

        're_num_sentences':len(reconstructed_sentences),
        'num_tokens': num_tokens,
        'total_characters': total_chars,
        'average_word_length': round(avg_word_length, 2),
        'type_token_ratio': round(type_token_ratio, 3),
        'reconstructed_sentences': reconstructed_sentences
    }


In [30]:
text = ' '.join(sample['text'] for sample in samples)
stats = hindi_corpus_statistics(text)

print("Original Sentences:")
for sentence in stats['sentences']:
    print(sentence)

print("\nReconstructed Sentences:")
for sent in stats['reconstructed_sentences']:
    print(sent)

print("\nTokens:", stats['tokens'])
print("Number of sentences:",stats['num_sentences'])
print("Number of sentences:",stats['re_num_sentences'])
print("Number of Tokens:", stats['num_tokens'])
print("Total Characters:", stats['total_characters'])
print("Average Word Length:", stats['average_word_length'])
print("Type-Token Ratio:", stats['type_token_ratio'])


Original Sentences:
लोगों को बिलों संबंधी सुविधा देना ही उनका काम  इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किया था।
हालांकि तब पार्टी पर देवीलाल की मजबूत पकड़ के चलते पार्टी टूटने से बच गई थी।
1989 में देवीलाल केन्द्र की राजनीति में सक्रिय हो गए थे और उनके उपप्रधानमंत्री बनने के पश्चात् उनके तीन बेटों जगदीश सिंह, रणजीत सिंह और ओमप्रकाश चौटाला में से रणजीत और ओमप्रकाश के बीच हरियाणा में उनकी राजनीतिक विरासत को लेकर जंग शुरू हो गई थी।
उन परिस्थितियों में देवीलाल ने कड़ा निर्णय लेते हुए पार्टी की बागडोर ओमप्रकाश चौटाला के हवाले कर दी थी, जिसके बाद रणजीत की बगावत का असर पार्टी, संगठन और उनकी सरकार पर भी पड़ा था।
उस समय रणजीत की नाराजगी के चलते उनके समर्थन में कई कैबिनेट मंत्रियों ने इस्तीफे दे दिए थे किन्तु तब पार्टी सुप्रीमो चौ.
देवीलाल की हरियाणा की जनता पर इतनी मजबूत पकड़ थी कि ओमप्रकाश चौटाला को उत्तराधिकारी बनाने के उनके फैसले का जनता के बीच कोई खास विरोध नहीं हुआ था लेकिन आज स्थ

In [18]:
import re

def english_tokenizer(text):
    # Define patterns
    url_pattern = r'https?://[^\s]+|www\.[^\s]+'
    email_pattern = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'

    # Save and replace URLs/emails
    urls = re.findall(url_pattern, text)
    text = re.sub(url_pattern, '<URL>', text)

    emails = re.findall(email_pattern, text)
    text = re.sub(email_pattern, '<EMAIL>', text)

    tokens = []
    for token in text.split():
        if token == '<URL>':
            tokens.append(urls.pop(0))
        elif token == '<EMAIL>':
            tokens.append(emails.pop(0))
        else:
            # Tokenize: words, numbers, punctuation
            split_tokens = re.findall(r"[a-zA-Z0-9]+|[.,!?;:'\"()\-]|[^\s]", token)
            tokens.extend(split_tokens)

    return tokens

def english_sentence_tokenizer(text):
    # Split on sentence-ending punctuation followed by space
    sentence_end_pattern = r'(?<=[.!?])\s+'
    sentences = re.split(sentence_end_pattern, text.strip())
    return [s.strip() for s in sentences if s.strip()]

def detokenize(tokens):
    sentence = ''
    for i, token in enumerate(tokens):
        if i > 0 and not re.match(r'[.,!?;:)\]\'\"]', token):
            sentence += ' '
        sentence += token
    return sentence.strip()

def english_corpus_statistics(text):
    sentences = english_sentence_tokenizer(text)
    all_tokens = []
    reconstructed_sentences = []

    for sentence in sentences:
        tokens = english_tokenizer(sentence)
        all_tokens.extend(tokens)

        reconstructed = detokenize(tokens)
        reconstructed_sentences.append(reconstructed)

    num_tokens = len(all_tokens)
    unique_tokens = set(all_tokens)
    total_chars = sum(len(token) for token in all_tokens)

    word_tokens = [t for t in all_tokens if re.match(r'^[a-zA-Z0-9]+$', t)]
    avg_word_length = sum(len(t) for t in word_tokens) / len(word_tokens) if word_tokens else 0
    type_token_ratio = len(unique_tokens) / num_tokens if num_tokens else 0

    return {
        'sentences': sentences,
        'tokens': all_tokens,
        'num_tokens': num_tokens,
        'total_characters': total_chars,
        'average_word_length': round(avg_word_length, 2),
        'type_token_ratio': round(type_token_ratio, 3),
        'reconstructed_sentences': reconstructed_sentences
    }


In [19]:
text = """Hello! I'm Alex. Feel free to drop a message at alex.jordan@gmail.com or visit my blog at www.alexwrites.com. Hope you find it interesting!"""

stats = english_corpus_statistics(text)

print("Original Sentences:")
for s in stats['sentences']:
    print(s)

print("\nReconstructed Sentences:")
for s in stats['reconstructed_sentences']:
    print(s)

print("\nTokens:", stats['tokens'])
print("Number of Tokens:", stats['num_tokens'])
print("Total Characters:", stats['total_characters'])
print("Average Word Length:", stats['average_word_length'])
print("Type-Token Ratio:", stats['type_token_ratio'])


Original Sentences:
Hello!
I'm Alex.
Feel free to drop a message at alex.jordan@gmail.com or visit my blog at www.alexwrites.com.
Hope you find it interesting!

Reconstructed Sentences:
Hello!
I' m Alex.
Feel free to drop a message at alex.jordan@gmail.com or visit my blog at www.alexwrites.com.
Hope you find it interesting!

Tokens: ['Hello', '!', 'I', "'", 'm', 'Alex', '.', 'Feel', 'free', 'to', 'drop', 'a', 'message', 'at', 'alex.jordan@gmail.com', 'or', 'visit', 'my', 'blog', 'at', 'www.alexwrites.com.', 'Hope', 'you', 'find', 'it', 'interesting', '!']
Number of Tokens: 27
Total Characters: 118
Average Word Length: 3.52
Type-Token Ratio: 0.926
