In [1]:
from indicnlp.tokenize import sentence_tokenize, indic_tokenize

def hindi_tokenizer(text):
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    tokenized_sentences = []
    for sentence in sentences:
        words = list(indic_tokenize.trivial_tokenize(sentence))
        tokenized_sentences.append(words)
    return {
        "sentences": sentences,
        "tokenized_sentences": tokenized_sentences,
    }


hindi_text = "भारत एक विशाल देश है, इसकी राजधानी नई दिल्ली है। हिंदी यहाँ की मुख्य भाषाओं में से एक है।"
result = hindi_tokenizer(hindi_text)

print("Tokenized Sentences:")
for sentence in result['sentences']:
    print(sentence)

print("\nTokenized Words:")
for words in result['tokenized_sentences']:
    print(words)

Tokenized Sentences:
भारत एक विशाल देश है, इसकी राजधानी नई दिल्ली है।
हिंदी यहाँ की मुख्य भाषाओं में से एक है।

Tokenized Words:
['भारत', 'एक', 'विशाल', 'देश', 'है', ',', 'इसकी', 'राजधानी', 'नई', 'दिल्ली', 'है', '।']
['हिंदी', 'यहाँ', 'की', 'मुख्य', 'भाषाओं', 'में', 'से', 'एक', 'है', '।']


In [9]:
filename = "data//eng_news_2005_10K-sentences.txt"
with open(filename, "r") as file:
    text = file.read()

text

'The post has just arrived and in it a very nice surprise, the discovery that Jacques Seguela, one-time adviser to President Mitterrand, now close confidant of President and Madame Sarkozy (indeed he intoduced them), and something of a legend in French political communications, has dedicated his latest book to little old moi. With apologies for the missing accents here and in the French bits of the long posting which follows - the dedication to \'Le Pouvoir dans la Peau\' (Power in the skin) reads \'A Alastair Campbell, mon spin doctor prefere\' (three missing accents in one word - mes excuses sinceres). So what did I do for this honour, you are asking? Well, perhaps the fact that he asked me to read his book, and write a \'postface\' assessment both of his writing and of the issues he covers, and the fact that I said yes, has something to do with it. He says some blushmakingly kind things in his \'preface to the postface\', which I will have to leave to French readers of the whole thi

In [19]:
import re
from collections import Counter
from itertools import combinations
from itertools import tee
import emoji

emoji_sentiment_map = {
    "😊": "positive",
    "😄": "positive",
    "😂": "positive",
    "😍": "positive",
    "😢": "negative",
    "😡": "negative",
    "😔": "negative",
    "😎": "neutral",
    "😐": "neutral",
    # Add more emojis as needed
}


def english_tokenizer(text):
    # Define the regular expression pattern
    pattern = r"""
        (?<!\S)[a-zA-Z]+(?:'[a-z]+)?    # Words with optional contractions
        | [.,!?;:"'()]                  # Punctuation marks
        | [@#$%^&*<>/\|~`+=-_]          # Special characters
    """

    # Compile the pattern with verbose flag
    regex = re.compile(pattern, re.VERBOSE)
    # Find all matches
    tokens = regex.findall(text)
    unique_tokens = list(set(tokens))
    return unique_tokens

def generate_bigrams(tokens):
    a, b = tee(tokens)
    next(b, None)
    return list(zip(a, b))

def find_top_n_bigrams(tokens, n):
    bigrams = generate_bigrams(tokens)
    bigram_counts = Counter(bigrams)
    return bigram_counts.most_common(n)

def add_top_n_bigrams_to_vocab(unique_tokens, top_n_bigrams):
    bigram_tokens = [' '.join(bigram) for bigram, _ in top_n_bigrams]
    updated_vocab = unique_tokens + bigram_tokens
    return updated_vocab


tokens = english_tokenizer(text)
print("Total Unique Tokens: ", len(tokens))

top_n_bigrams = find_top_n_bigrams(tokens, n=3)
print("Top 3 Frequent Bigrams:", top_n_bigrams)

# Step 4: Add top N bigrams to the vocabulary
updated_vocab = add_top_n_bigrams_to_vocab(tokens, top_n_bigrams)
print("Updated Vocabulary:", updated_vocab)


Total Unique Tokens:  1702
Top 3 Frequent Bigrams: [(('fact', 'Pen'), 1), (('Pen', 'hiding'), 1), (('hiding', 'measure'), 1)]
Updated Vocabulary: ['fact', 'Pen', 'hiding', 'measure', 'Hazel', 'illuminates', 'strategist', 'more', 'largely', 'changing', 'traumatic', 'hugely', 'People', 'envie', 'violent', 'endorsed', 'agreed', 'Meanwhile', 'local', 'toy', 'she', 'parades', 'two', 'glib', 'Of', 'hard', 'spectacular', 'campaigns', 'authenticity', 'deeper', 'end', 'would', 'announced', 'rational', 'thick', 'live', "country's", 'became', 'retourner', 'it', 'enjoying', 'victories', 'before', 'minority', 'only', 'detailed', "wouldn't", 'planning', '"', 'refer', 'This', 'offers', 'resist', 'though', 'twice', 'Original', 'aside', 'start', 'insists', 'honour', 'debate', 'reasons', 'names', 'still', 'saying', 'Nick', 'learned', 'sure', 'agreement', 'poorest', 'changes', 'books', 'junkies', 'recognisable', 'prefere', 'new', 'bcbg', 'C', 'Unless', 'whenever', 'radically', 'giant', 'deep', 'out', "le