In [1]:
from codemix import cs_metrics
from codemix import codemix_viz as cv
from codemix import tokenize_pos_awesome_align as tpa
import torch

# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
# This string is reused further down when the PoS model and its inputs are moved.
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 9.62MB/s]                    
2025-04-19 16:03:28 INFO: Downloaded file to /home/prashantk/stanza_resources/resources.json
2025-04-19 16:03:28 INFO: Downloading default packages for language: en (English) ...
2025-04-19 16:03:29 INFO: File exists: /home/prashantk/stanza_resources/en/default.zip
2025-04-19 16:03:32 INFO: Finished downloading models and saved to /home/prashantk/stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 34.8MB/s]                    
2025-04-19 16:03:32 INFO: Downloaded file to /home/prashantk/stanza_resources/resources.json
2025-04-19 16:03:32 INFO: Downloading default packages for language: hi (Hindi) ...
2025-04-19 16:03:32 INFO: File exists: /home/prashantk/stanza_resources/hi/default.zip
2025-04-19 16:03:33 INFO: 

In [2]:
# Example sentence written as one (token, language-ID tag, PoS tag) triple per
# token, so the three parallel lists below cannot drift out of alignment.
_annotated_tokens = [
    ('Gully', 'en', 'ADJ'),
    ('cricket', 'en', 'PROPN'),
    ('चल', 'hi', 'VERB'),
    ('रहा', 'hi', 'AUX'),
    ('हैं', 'hi', 'AUX'),
    ('यहां', 'hi', 'ADV'),
    ('"', 'univ', 'PUNCT'),
    ('(', 'univ', 'PUNCT'),
    ('Soniya', 'ne', 'PROPN'),
    (')', 'univ', 'PUNCT'),
    ('Gandhi', 'ne', 'PROPN'),
    ('"', 'univ', 'PUNCT'),
]
tokens, LID_Tags, PoS_Tags = (list(column) for column in zip(*_annotated_tokens))

# Tag inventories handed to CodeMixSentence as lang_tagset / other_tagset.
lang_tags = ['hi', 'en']
other_tags = ['univ', 'ne', 'acro']

# CodeMixed Sentence Object

In [3]:
from codemix.cs_metrics import CodeMixSentence
# Build the sentence object from the pre-tokenized example: no raw sentence
# string is supplied, only the parallel token / LID / PoS lists.
codemixed_sentence = CodeMixSentence(
    lang_tagset=lang_tags,
    other_tagset=other_tags,
    l1='en',
    l2='hi',
    sentence=None,
    tokens=tokens,
    LID_Tags=LID_Tags,
    PoS_Tags=PoS_Tags,
)

In [4]:
# Show the freshly built object; in the recorded output every metric field is still None.
print(codemixed_sentence)

CodeMixSentenceCombined(
    lang_tagset=['hi', 'en'],
    other_tagset=['univ', 'ne', 'acro'],
    l1='en',
    l2='hi',
    sentence='None',
    tokens=['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"'],
    LID_Tags=['en', 'en', 'hi', 'hi', 'hi', 'hi'],
    PoS_Tags=['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT'],
    length=12,
    cmi=None,
    burstiness=None,
    i_index=None,
    lang_entropy=None,
    mindex=None,
    spavg=None,
    symcom_sentence=None
)


# CodeMix Metrics

In [5]:
# Compute the metrics on the sentence object, then print it again: in the
# recorded output cmi, burstiness, i_index, lang_entropy, mindex, spavg and
# symcom_sentence are now populated.
codemixed_sentence.compute_all_metrics()
print(codemixed_sentence)

CodeMixSentenceCombined(
    lang_tagset=['hi', 'en'],
    other_tagset=['univ', 'ne', 'acro'],
    l1='en',
    l2='hi',
    sentence='None',
    tokens=['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"'],
    LID_Tags=['en', 'en', 'hi', 'hi', 'hi', 'hi'],
    PoS_Tags=['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT'],
    length=12,
    cmi=33.333333333333336,
    burstiness=-0.3592455179659185,
    i_index=0.2,
    lang_entropy=0.9182958340544896,
    mindex=0.7999999999999999,
    spavg=1,
    symcom_sentence=0.6666666666666666
)


# Visualization

In [6]:
# Printer object reused by the following cells to display and export the annotated example.
annotation_printer = cv.AnnotatedTextPrinter()

In [7]:
# Pass the example tokens with their LID and PoS tags to the printer.
# NOTE(review): presumably renders the annotated text inline — no output was recorded for this cell; confirm.
annotation_printer.print_sample_st_annot_text(tokens, LID_Tags, PoS_Tags)

In [8]:
# Export the visualization; the recorded output reports the HTML file saved under this name.
annotation_printer.export_html(file_name="codemix_visualization_test.html")

HTML file saved as codemix_visualization_test.html


# LID Tags

In [9]:
from codemix import tags

# Keep the `tags` module and the tagger instance under separate names: binding
# the instance to `tags` would shadow the module it was created from.
lid_analyzer = tags.Tags()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
# In the recorded output analyze() returns a (tokens, LID tags) pair.
lid_tags = lid_analyzer.analyze(sentence)
print("LID tags:", lid_tags)

LID tags: (['मैं', 'Hyderabaed', 'में', 'movie', 'देखने', 'जा', 'रहा', 'हूँ'], ['hi', 'ne', 'hi', 'en', 'hi', 'hi', 'hi', 'hi'])


# NER Tagger

In [10]:
from codemix.compute import NERtagger

# Coarse NER tagging: the recorded output holds 'ne' for a named-entity token
# and False for every other token.
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
ner_tagger = NERtagger(
    model_path="ai4bharat/IndicNER",
    tokenizer_path="ai4bharat/IndicNER",
)
ner_tagger.load_model_tokenizer()
predicted_ner_tags = ner_tagger.predict_ner_sentence(sentence)
predicted_ner_tags

[False, 'ne', False, False, False, False, False, False]

In [11]:
# Same checkpoint with finegrain_labels=True: the recorded output then holds
# labels such as 'O' and 'B-LOC' instead of 'ne' / False.
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
ner_tagger = NERtagger(
    model_path="ai4bharat/IndicNER",
    tokenizer_path="ai4bharat/IndicNER",
    finegrain_labels=True,
)
ner_tagger.load_model_tokenizer()
predicted_ner_tags = ner_tagger.predict_ner_sentence(sentence)
predicted_ner_tags

['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O']

# PoS Tagger

In [12]:
from codemix.compute import PoSTagger

# PoS tagging: model weights and tokenizer are loaded from two different names.
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
postagger = PoSTagger(
    model_path="prakod/en-hi-pos-tagger-symcom",
    tokenizer_path="xlm-roberta-base",
)
postagger.load_model_tokenizer()
predicted_pos_tags = postagger.predict_pos_sentence(sentence)
print("Predicted POS tags:", predicted_pos_tags)


Predicted POS tags: ['PRON', 'PROPN', 'ADP', 'NOUN', 'VERB', 'VERB', 'AUX', 'AUX']


# Unicode-Based LID

In [13]:
from codemix.compute import UnicodeLIDtagger

# Inputs: the raw sentence plus one hand-written NER flag per token
# ('ne' marks a named entity, False everything else).
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
ner_predictions = [False, 'ne', False, False, False, False, False, False]

lid_tagger = UnicodeLIDtagger()
lid_tags = lid_tagger.get_unicode_lid_predictions(
    sentence,
    ner_predictions=ner_predictions,
)
print("LID tags:", lid_tags)



LID tags: (['मैं', 'Hyderabaed', 'में', 'movie', 'देखने', 'जा', 'रहा', 'हूँ'], ['hi', 'ne', 'hi', 'en', 'hi', 'hi', 'hi', 'hi'])


# Normalisation / Romanisation

In [14]:
from codemix.normalize import Normalizer
# Normalize the mixed-script sentence; the recorded result is a lower-cased,
# fully Roman-script string.
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
normalizer = Normalizer()
normalizer.normalize_text(sentence)

Initializing Multilingual model for transliteration


  state = torch.load(f, map_location=torch.device("cpu"))
Loading dicts into RAM: 100%|██████████| 1/1 [00:00<00:00, 23.99it/s]


'main hyderabaed main movie dekhane jaa rahaa hoon'

# Rule-Based Synthetic Code-Mixed Sentence Generation - NOUN, ADJ Replacement

In [15]:
# Input file with the utterances to process; the path is relative, so it is
# resolved against the notebook's working directory.
# NOTE(review): "transltions" looks like a typo, but it must match the file on disk — confirm before renaming.
filename = "unique_utterances_en_hi_transltions.json"
# The recorded result has `en` / `hi` source columns plus tokens, PoS tags,
# alignments and a `codemixed-sentences` column.
df_codemixed = tpa.get_codemix_candidates_for_file(filename)
df_codemixed

0it [00:00, ?it/s]2025-04-19 16:04:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 44.0MB/s]                    
2025-04-19 16:04:07 INFO: Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
2025-04-19 16:04:08 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:L

done





Unnamed: 0,en,hi,lang1,lang1_tokens,lang1_pos,lang2,lang2_tokens,lang2_pos,alignments_awesomealign,token_alignment_map_awesomealign,codemixed-sentences
0,"Okay, how much does it cost?","ठीक है, इसकी लागत कितनी है?",en,"[[Okay, ,, how, much, does, it, cost, ?]]","[[INTJ, PUNCT, ADV, ADV, AUX, PRON, VERB, PUNCT]]",hi,"[[ठीक, है, ,, इसकी, लागत, कितनी, है, ?]]","[[ADJ, AUX, PUNCT, PRON, NOUN, DET, AUX, PUNCT]]",[0-0 0-1 1-2 2-5 3-4 4-6 5-3 6-4 7-7 ],"[{'Okay': 'है', 'ठीक': 'Okay', 'है': 'does', '...","Okay है , इसकी cost कितनी है ?"


# OLD / Prototype / Testing - Delete Later

In [None]:
# Prototype: build the metric helpers directly from cs_metrics instead of
# going through compute_all_metrics() on a sentence object.
metrics = cs_metrics.CodeMixMetrics(lang_tags, other_tags)
symcom = cs_metrics.SyMCoM(LID_Tags, PoS_Tags, 'en', 'hi')
# NOTE(review): unlike the earlier CodeMixSentence(...) call, no lang_tagset /
# other_tagset / l1 / l2 are passed here — confirm the constructor has defaults for them.
cm_sentence = cs_metrics.CodeMixSentence(sentence = None, tokens = tokens, LID_Tags = LID_Tags, PoS_Tags = PoS_Tags)

In [None]:
# Metrics taking only the LID tag sequence.
cmi_score = metrics.cmi(LID_Tags)
mindex_score = metrics.mindex(LID_Tags)
lang_entropy_score = metrics.lang_entropy(LID_Tags)
spavg_score = metrics.spavg(LID_Tags)
i_index_score = metrics.i_index(LID_Tags)
burstiness_score = metrics.burstiness(LID_Tags)

# SyMCoM values taking the sentence object.
symcom_pos_tags = symcom.symcom_pos_tags(cm_sentence)
symcom_sentence = symcom.symcom_sentence(cm_sentence)

# Report: one "<label> <value>" line per scalar metric ...
for label, value in (
    ("CMI Score:", cmi_score),
    ("M-Index Score:", mindex_score),
    ("Lang Entropy Score:", lang_entropy_score),
    ("SPAVG Score:", spavg_score),
    ("I-Index Score:", i_index_score),
    ("Burstiness Score:", burstiness_score),
):
    print(label, value)

# ... a blank separator line, the per-PoS-tag scores, then the sentence-level score.
print("")
for tag, score in symcom_pos_tags.items():
    print(tag, ":", score)
print("SymCom Sentence:", symcom_sentence)

In [None]:
from ai4bharat.transliteration import XlitEngine
# Prototype: call the transliteration engine directly on the example sentence.
# NOTE(review): src_script_type="indic" together with the 'hi' argument presumably
# means Hindi (Indic script) -> Roman; confirm against the XlitEngine documentation.
e = XlitEngine( beam_width=10, src_script_type = "indic")
out = e.translit_sentence("मैं Hyderabaed में movie देखने जा रहा हूँ", 'hi')
print(out)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import numpy as np

def predictposSent(model=None, tokenizer=None, sentence=None):
    """Predict one PoS label per word of ``sentence``.

    The sentence is split into sub-word pieces by ``tokenizer``, the
    token-classification ``model`` is run once, and only the prediction at the
    first sub-word piece of each word is kept.

    Args:
        model: Token-classification model. It is called as
            ``model(**encoding)``, must return a mapping with a ``"logits"``
            entry and must expose ``config.id2label``. When it has a
            ``device`` attribute, the encoded inputs are moved to that device,
            so the function does not depend on a notebook-level ``device``.
        tokenizer: Callable used as ``tokenizer(sentence, return_tensors='pt')``;
            its result must provide ``word_ids()`` and ``to(device)``.
        sentence: Raw input text.

    Returns:
        list: One label from ``model.config.id2label`` per word id reported by
        the tokenizer, in sentence order.

    Raises:
        ValueError: If ``model``, ``tokenizer`` or ``sentence`` is ``None``.
    """
    if model is None or tokenizer is None or sentence is None:
        raise ValueError("model, tokenizer and sentence are all required")

    encoding = tokenizer(sentence, return_tensors='pt')

    # True only at the first sub-word piece of each word. Special tokens have
    # a word id of None and continuation pieces repeat the previous word id.
    is_first_piece = []
    previous_word_id = None
    for word_id in encoding.word_ids():
        is_first_piece.append(word_id is not None and word_id != previous_word_id)
        previous_word_id = word_id

    # Run the inputs on whatever device the model itself lives on.
    model_device = getattr(model, "device", None)
    if model_device is not None:
        encoding = encoding.to(model_device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**encoding)

    # Index the single batch element explicitly; squeeze() would also drop the
    # sequence axis for a length-1 sequence.
    logits = outputs['logits'].detach().cpu().numpy()
    predicted_ids = np.argmax(logits, axis=-1)[0]

    return [
        model.config.id2label[label_id]
        for label_id, keep in zip(predicted_ids, is_first_piece)
        if keep
    ]


# Checkpoints: fine-tuned PoS tagger weights, paired with the base XLM-R tokenizer.
modelpath = "prakod/en-hi-pos-tagger-symcom"
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Load the classifier and move it to the device selected in the first cell.
model = AutoModelForTokenClassification.from_pretrained(modelpath)
model.to(device)

sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"

# One predicted label per word of the sentence.
tags_normalised = predictposSent(model=model, tokenizer=tokenizer, sentence=sentence)

