In [1]:
import torch
# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [2]:
# One (token, language-ID tag, PoS tag) triple per word of the example sentence,
# unzipped into the three parallel lists used by the rest of the notebook.
tokens, LID_Tags, PoS_Tags = map(list, zip(
    ('Gully',   'en',   'ADJ'),
    ('cricket', 'en',   'PROPN'),
    ('चल',      'hi',   'VERB'),
    ('रहा',     'hi',   'AUX'),
    ('हैं',      'hi',   'AUX'),
    ('यहां',    'hi',   'ADV'),
    ('"',       'univ', 'PUNCT'),
    ('(',       'univ', 'PUNCT'),
    ('Soniya',  'ne',   'PROPN'),
    (')',       'univ', 'PUNCT'),
    ('Gandhi',  'ne',   'PROPN'),
    ('"',       'univ', 'PUNCT'),
))

# Tag inventories: the language tags and the remaining tags that appear in LID_Tags.
lang_tags = ['hi', 'en']
other_tags = ['univ', 'ne', 'acro']

# CodeMixed Sentence Object

The `CodeMixSentence` object serves as a comprehensive container for analyzing and quantifying code-mixing. It encapsulates:

- the structural components of a sentence (tokens, language tags, part-of-speech tags), and
- various metrics that measure the degree and patterns of language mixing, such as the Code-Mixing Index (CMI), burstiness, language entropy, and SyMCoM scores.

The CodeMixSentence object provides a standardized framework for computing and storing code-mixing metrics, which is essential for several reasons:

- Standardisation: A standardised format that captures the necessary attributes of a code-mixed sentence makes it easier to share and compare results across different datasets and across the research community.

- Reproducibility & Consistency: By encapsulating all the metrics (CMI, burstiness, I-index, language entropy, etc.) in a single object with well-defined computation methods, it ensures that one can reproduce the same results when analyzing code-mixed text.

- Extensibility: The object provides a clear structure for adding new metrics or modifying existing ones while maintaining compatibility with the established framework. This is important as the field of code-mixing research continues to evolve.


The CodeMixSentence object provides a common ground for analysis.

In [3]:
from codemix.cs_metrics import CodeMixSentence
# Wrap the pre-tokenised example in a CodeMixSentence. No raw sentence string
# is supplied (sentence=None); the tokens and their tags are passed directly.
codemixed_sentence = CodeMixSentence(
    lang_tagset=lang_tags,
    other_tagset=other_tags,
    l1='en',
    l2='hi',
    sentence=None,
    tokens=tokens,
    LID_Tags=LID_Tags,
    PoS_Tags=PoS_Tags,
)

In [4]:
print(codemixed_sentence)

CodeMixSentenceCombined(
    lang_tagset=['hi', 'en'],
    other_tagset=['univ', 'ne', 'acro'],
    l1='en',
    l2='hi',
    sentence='None',
    tokens=['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"'],
    LID_Tags=['en', 'en', 'hi', 'hi', 'hi', 'hi'],
    PoS_Tags=['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT'],
    length=12,
    cmi=None,
    burstiness=None,
    i_index=None,
    lang_entropy=None,
    mindex=None,
    spavg=None,
    symcom_sentence=None
)


# CodeMix Metrics

In [5]:
# Compute all the metrics for the code-mixed sentence object, which is expected
# to already carry the necessary fields such as LID tags and PoS tags.
codemixed_sentence.compute_all_metrics()

# Print the object again so the now-populated metric fields are visible.
print(codemixed_sentence)

CodeMixSentenceCombined(
    lang_tagset=['hi', 'en'],
    other_tagset=['univ', 'ne', 'acro'],
    l1='en',
    l2='hi',
    sentence='None',
    tokens=['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"'],
    LID_Tags=['en', 'en', 'hi', 'hi', 'hi', 'hi'],
    PoS_Tags=['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT'],
    length=12,
    cmi=33.333333333333336,
    burstiness=-0.3592455179659185,
    i_index=0.2,
    lang_entropy=0.9182958340544896,
    mindex=0.7999999999999999,
    spavg=1,
    symcom_sentence=0.6666666666666666
)


# Visualization

In [6]:
from codemix import codemix_viz as cv

annotation_printer = cv.AnnotatedTextPrinter()

In [7]:
annotation_printer.print_sample_st_annot_text(tokens, LID_Tags, PoS_Tags)

In [8]:
annotation_printer.export_html(file_name="codemix_visualization_test.html")

HTML file saved as codemix_visualization_test.html


# NER Tagger

In [9]:
from codemix.compute import NERtagger

# Configure the NER tagger; the same identifier is used for model and tokenizer.
ner_tagger = NERtagger(model_path="ai4bharat/IndicNER", 
                       tokenizer_path="ai4bharat/IndicNER")
# Loading is a separate explicit step after construction.
ner_tagger.load_model_tokenizer()
# Code-mixed Hindi-English example sentence (8 whitespace-separated words).
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
predicted_ner_tags = ner_tagger.predict_ner_sentence(sentence)
# Bare last expression so the notebook displays the result. In the recorded run
# this is a list with one entry per word: 'ne' for "Hyderabaed", False elsewhere.
predicted_ner_tags

  from .autonotebook import tqdm as notebook_tqdm


[False, 'ne', False, False, False, False, False, False]

In [10]:
# Same tagger as before, but with finegrain_labels=True. In the recorded run this
# switches the output from False/'ne' flags to labels such as 'O' and 'B-LOC'.
ner_tagger = NERtagger(model_path="ai4bharat/IndicNER", 
                       tokenizer_path="ai4bharat/IndicNER",
                       finegrain_labels=True)
ner_tagger.load_model_tokenizer()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
# NOTE: this rebinds predicted_ner_tags, replacing any value it held before.
predicted_ner_tags = ner_tagger.predict_ner_sentence(sentence)
# Bare last expression so the notebook displays the label list.
predicted_ner_tags

['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O']

# PoS Tagger

In [11]:
from codemix.compute import PoSTagger

# Configure the PoS tagger; model and tokenizer are given as separate identifiers.
postagger = PoSTagger(model_path="prakod/en-hi-pos-tagger-symcom", 
                      tokenizer_path="xlm-roberta-base")
# Loading is a separate explicit step after construction.
postagger.load_model_tokenizer()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
# In the recorded run this yields one tag per word of the sentence (8 tags).
predicted_pos_tags = postagger.predict_pos_sentence(sentence)
print("Predicted POS tags:", predicted_pos_tags)


Predicted POS tags: ['PRON', 'PROPN', 'ADP', 'NOUN', 'VERB', 'VERB', 'AUX', 'AUX']


# Unicode Based LID

In [12]:
from codemix.compute import UnicodeLIDtagger

lid_tagger = UnicodeLIDtagger()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
# Per-word NER flags for the sentence above, hard-coded here; one entry per
# word, with 'ne' marking "Hyderabaed".
ner_predictions = [False, 'ne', False, False, False, False, False, False]
# NOTE(review): in the recorded run the return value is a (tokens, tags) pair,
# not just the tags, so `lid_tags` holds both lists. The word flagged 'ne' in
# ner_predictions comes back tagged 'ne' rather than with a language.
lid_tags = lid_tagger.get_unicode_lid_predictions(sentence,
                                      ner_predictions = ner_predictions)
print("LID tags:", lid_tags)



LID tags: (['मैं', 'Hyderabaed', 'में', 'movie', 'देखने', 'जा', 'रहा', 'हूँ'], ['hi', 'ne', 'hi', 'en', 'hi', 'hi', 'hi', 'hi'])


# Normalisation / Romanisation

In [13]:
from codemix.normalize import Normalizer
# NOTE: in the recorded run, constructing the Normalizer prints a message about
# initializing a multilingual transliteration model, so this line is presumably
# where that model gets loaded.
normalizer = Normalizer()

sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
# Bare last expression so the notebook displays the result; in the recorded run
# it is a lower-cased, fully romanised version of the sentence.
normalizer.normalize_text(sentence)

Initializing Multilingual model for transliteration


  state = torch.load(f, map_location=torch.device("cpu"))
Loading dicts into RAM: 100%|██████████| 1/1 [00:00<00:00, 18.32it/s]


'main hyderabaed main movie dekhane jaa rahaa hoon'

# Rule Based Synthetic Code-Mixed Sentence Generation - NOUN, ADJ Replacement

In [14]:
from codemix import tokenize_pos_awesome_align as tpa

# Input file of English-Hindi utterance pairs, resolved relative to the working
# directory. NOTE(review): "transltions" looks like a misspelling of
# "translations"; confirm it matches the file on disk before renaming.
filename = "unique_utterances_en_hi_transltions.json"
# In the recorded run this returns a DataFrame whose columns include tokens, PoS
# tags, alignments and a 'codemixed-sentences' column.
df_codemixed = tpa.get_codemix_candidates_for_file(filename)
# Displays the whole frame; consider df_codemixed.head() if the input grows.
df_codemixed

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 23.1MB/s]                    
INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /home/prashantk/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /home/prashantk/stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 24.0MB/s]                    
INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /home/prashantk/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /home/prashantk/stanza_resources
0it [00:00, ?it/s]INFO:stanza:Checking for updates to resources.json in case models have

done





Unnamed: 0,en,hi,lang1,lang1_tokens,lang1_pos,lang2,lang2_tokens,lang2_pos,alignments_awesomealign,token_alignment_map_awesomealign,codemixed-sentences
0,"Okay, how much does it cost?","ठीक है, इसकी लागत कितनी है?",en,"[[Okay, ,, how, much, does, it, cost, ?]]","[[INTJ, PUNCT, ADV, ADV, AUX, PRON, VERB, PUNCT]]",hi,"[[ठीक, है, ,, इसकी, लागत, कितनी, है, ?]]","[[ADJ, AUX, PUNCT, PRON, NOUN, DET, AUX, PUNCT]]",[0-0 0-1 1-2 2-5 3-4 4-6 5-3 6-4 7-7 ],"[{'Okay': 'है', 'ठीक': 'Okay', 'है': 'does', '...","Okay है , इसकी cost कितनी है ?"


# LID Tagger and Normaliser

In [15]:
import requests
import pandas as pd
from IPython.display import display

# Endpoint of the locally running LID + normalisation service.
url = "http://localhost:6000/csnli-lid"
headers = {"Content-Type": "application/json"}
data = {"text": "i thght mosam dfrnt hoga bs fog h"}

# requests has no default timeout, so without one this cell would hang forever
# if the service is down. raise_for_status() turns an HTTP error into a clear
# exception instead of an opaque JSON/KeyError on the next line.
response = requests.post(url, headers=headers, json=data, timeout=30)
response.raise_for_status()
csnli_output = response.json()['csnli_op']

print(f"Sentence: {csnli_output['text_str']}")

# Tabulate the per-token fields. 'text_str' (the full sentence, already printed
# above) is dropped so the table only holds token-level columns.
df = pd.DataFrame(csnli_output).drop(columns=['text_str'])

display(df)

Sentence: i thght mosam dfrnt hoga bs fog h


Unnamed: 0,text_tokenized,norm_text,lid
0,i,i,en
1,thght,thought,en
2,mosam,मौसम,hi
3,dfrnt,different,en
4,hoga,होगा,hi
5,bs,बस,hi
6,fog,fog,en
7,h,है,hi


# Synthetic Code-Mixed Sentence Generation using GCM Toolkit



## Instructions to run the Flask API

- Ensure you are in the "library" folder

- Run these commands in a shell:
  ```bash
  export FLASK_APP=gcmgenerator
  flask run -h 0.0.0.0 -p 6000
  ```
- Change the host and port as required.


In [None]:
import requests

In [None]:
import docker

def is_image_running(image_name="prakod/gcm-codemix-generator"):
    """Report whether any running Docker container uses the given image.

    Args:
        image_name (str): Image name to look for. It is matched as a substring
            of each image tag, so ``"repo/name"`` also matches
            ``"repo/name:latest"``.

    Returns:
        bool: True if at least one running container's image carries a tag
        containing ``image_name``; False otherwise, including when an image
        has no tags at all.
    """
    client = docker.from_env()
    try:
        # containers.list() only returns running containers by default.
        for container in client.containers.list():
            # An image may carry several tags in no particular order, so check
            # every tag rather than only the first one.
            if any(image_name in tag for tag in (container.image.tags or ())):
                return True
        return False
    finally:
        # Release the connections held by the Docker client.
        client.close()


In [None]:
# Report whether the generator image currently has a running container.
print(
    "The image is running."
    if is_image_running(image_name="prakod/gcm-codemix-generator")
    else "The image is not running."
)

The image is running.


In [None]:
# Check APIs are reachable

import requests

def check_api_reachable(url, method='GET', timeout=5, payload=None):
    """
    Probe an API endpoint and report whether it answered with HTTP 200.

    Args:
        url (str): The API endpoint URL.
        method (str): 'POST' (case-insensitive) sends a POST with ``payload``
            as the JSON body; any other value sends a GET.
        timeout (int): Timeout in seconds for the request.
        payload (dict): JSON body used only for POST requests.
    Returns:
        bool: True when the response status is 200. False when the status is
        anything else or the request raised a ``requests`` exception; in both
        cases a short message is printed.
    """
    use_post = method.upper() == 'POST'
    try:
        reply = (
            requests.post(url, json=payload, timeout=timeout)
            if use_post
            else requests.get(url, timeout=timeout)
        )
    except requests.exceptions.RequestException as err:
        print(f"API is not available: {err}")
        return False

    if reply.status_code != 200:
        print(f"API returned status code: {reply.status_code}")
        return False
    return True



## ALIGNER

- Sentences are passed here, and the alignment is generated.

In [None]:
# alignment generation
# Parallel sentence pair: l1 holds the Hindi sentence, l2 the English one.
l1 = "यदि आप तुरंत डॉक्टर से संपर्क करें"
l2 = "contact the doctor immediately if you"
# JSON payload for the aligner, keyed by 'l1' and 'l2'.
td = {'l1':l1, 'l2':l2}
alignment_api_endpoint = "http://127.0.0.1:6000/statistical_aligner_enhi"
# NOTE(review): this cell only prepares the inputs and sends no request. The
# response and alignment shown in the recorded output below it look like
# leftovers from an earlier version of the cell; the POST is issued further down.



<Response [200]>
{'alignment': '0-4 1-5 2-3 3-2 4-0 5-0', 'l1': 'यदि आप तुरंत डॉक्टर से संपर्क करें', 'l2': 'contact the doctor immediately if you'}


In [None]:
# Probe the aligner endpoint with a throwaway sentence pair before using it.
if check_api_reachable("http://127.0.0.1:6000/statistical_aligner_enhi", method='POST', payload={'l1': 'test', 'l2': 'test'}):
    print("The statistical_aligner_enhi API is reachable.")
else:
    print("The statistical_aligner_enhi API is not available.")

In [None]:
# Ask the aligner service for word alignments of the sentence pair in `td`.
# requests has no default timeout, so bound the wait explicitly.
response = requests.post(alignment_api_endpoint, json=td, timeout=60)

print(response)

# Fail with a clear HTTP error instead of an opaque JSON/KeyError further down.
response.raise_for_status()

# Decode the JSON body once and reuse it for both display and extraction.
aligner_output = response.json()
print(aligner_output)

alignments = aligner_output['alignment']

## CODE-MIXED SENTENCE GENERATOR

- Code-mixed sentences are generated from the given sentence pair and its alignment.

### Expected Outputs

- If any error occurs during code-mixed sentence generation, the program fails with the message:
```
fail
```

- Sometimes no alignments can be generated; in that case the program returns an empty array.
- If an alignment error occurs, the generated code-mixed sentence may also skip a few words.


In [None]:
# cm-sentences generation
# `choice` selects the language whose parse tree is generated. With 2, the
# recorded output shows a tree for the L2 sentence.
choice = 2  # choice of language for which parse trees are generated
# Request payload: the sentence pair, the alignment string obtained from the
# aligner, and the parse-tree choice.
# NOTE: `data` is a generic name; assigning it here replaces any earlier value
# held under the same name in the kernel.
data = {
    "lang1": l1,
    "lang2": l2,
    "alignments": alignments,
    "choice": choice
}

gcm_api_endpoint = "http://127.0.0.1:6000/gcm_enhi"
# The code behind this endpoint is found in gcmgenerator.py.



In [None]:
# Probe the gcm_enhi endpoint with the real payload, allowing up to 60 seconds.
if check_api_reachable("http://127.0.0.1:6000/gcm_enhi", method='POST', payload=data, timeout=60):
    print("The gcm_enhi API is reachable.")
else:
    print("The gcm_enhi API is not available.")


The gcm_enhi API is reachable.


In [None]:
# Request the code-mixed sentences. requests has no default timeout, so bound
# the wait; 60 s matches what the reachability check for this endpoint allows.
response = requests.post(gcm_api_endpoint, json=data, timeout=60)
print(response)

# Fail with a clear HTTP error rather than an opaque JSON decoding error.
response.raise_for_status()

retdata = response.json()
print("Sentence 1: ", retdata['lang1'])
print("Sentence 2: ", retdata['lang2'])
print("Alignments: ", retdata['alignments'])

# Each entry is a multi-line text record describing one generated sentence.
for cm_output in retdata['cm_sentences']:
    print(cm_output)

<Response [200]>
Sentence 1:  यदि आप तुरंत डॉक्टर से संपर्क करें
Sentence 2:  contact the doctor immediately if you
Alignments:  ['0-4 1-5 2-3 3-2 4-0 5-0']
[IDX]	0

[L1]	यदि आप तुरंत डॉक्टर से संपर्क करें

[L2]	contact the doctor immediately if you

[L2_Tree]	(ROOT (S (VP (VB contact) (NP (DT the) (NN doctor)) (ADVP (RB immediately)) (SBAR (IN if) (NP (PRP you))))))

Alignments	0-4 1-5 2-3 3-2 4-0 5-0

Theory	ec

[CM]contact the तुरंत if you

[TREE](ROOT (VP_e (VB_e contact) (NP_e (DT_e the)) (ADVP (RB_h तुरंत)) (SBAR (IN_e if) (NP (PRP_e you)))))



[IDX]	0

[L1]	यदि आप तुरंत डॉक्टर से संपर्क करें

[L2]	contact the doctor immediately if you

[L2_Tree]	(ROOT (S (VP (VB contact) (NP (DT the) (NN doctor)) (ADVP (RB immediately)) (SBAR (IN if) (NP (PRP you))))))

Alignments	0-4 1-5 2-3 3-2 4-0 5-0

Theory	ec

[CM]contact the तुरंत if आप

[TREE](ROOT (VP_e (VB_e contact) (NP_e (DT_e the)) (ADVP (RB_h तुरंत)) (SBAR (IN_e if) (NP (PRP_h आप)))))



[IDX]	0

[L1]	यदि आप तुरंत डॉक्टर से संपर्क

# OLD / Prototype / Testing - Delete Later

In [None]:
# Only `CodeMixSentence` is imported from codemix.cs_metrics elsewhere in the
# notebook, so the module name `cs_metrics` itself is unbound on a fresh kernel.
# Import it here so this cell does not fail with a NameError.
from codemix import cs_metrics

metrics = cs_metrics.CodeMixMetrics(lang_tags, other_tags)
symcom = cs_metrics.SyMCoM(LID_Tags, PoS_Tags, 'en', 'hi')
# NOTE(review): unlike the construction near the top of the notebook, no
# lang_tagset / other_tagset / l1 / l2 are passed here; confirm the constructor
# provides defaults for them.
cm_sentence = cs_metrics.CodeMixSentence(sentence = None, tokens = tokens, LID_Tags = LID_Tags, PoS_Tags = PoS_Tags)

In [None]:
# Metrics computed by passing only the language-ID tags.
cmi_score = metrics.cmi(LID_Tags)
mindex_score = metrics.mindex(LID_Tags)
lang_entropy_score = metrics.lang_entropy(LID_Tags)
spavg_score = metrics.spavg(LID_Tags)
i_index_score = metrics.i_index(LID_Tags)
burstiness_score = metrics.burstiness(LID_Tags)

# SyMCoM scores are computed from the sentence object.
symcom_pos_tags = symcom.symcom_pos_tags(cm_sentence)
symcom_sentence = symcom.symcom_sentence(cm_sentence)

# One "label value" line per metric, emitted with a single print call.
print(
    *(
        f"{label} {value!s}"
        for label, value in (
            ("CMI Score:", cmi_score),
            ("M-Index Score:", mindex_score),
            ("Lang Entropy Score:", lang_entropy_score),
            ("SPAVG Score:", spavg_score),
            ("I-Index Score:", i_index_score),
            ("Burstiness Score:", burstiness_score),
        )
    ),
    sep="\n",
)
print("")

# Per-PoS-tag SyMCoM scores, followed by the sentence-level score.
for tag, score in symcom_pos_tags.items():
    print(tag, ":", score)
print("SymCom Sentence:", symcom_sentence)

In [None]:
from ai4bharat.transliteration import XlitEngine
# Transliteration engine configured with a beam width of 10 and an "indic"
# source script type.
e = XlitEngine( beam_width=10, src_script_type = "indic")
# Transliterate the code-mixed example sentence, with 'hi' passed as the
# language code. NOTE(review): confirm the expected direction and output shape
# against the library docs.
out = e.translit_sentence("मैं Hyderabaed में movie देखने जा रहा हूँ", 'hi')
print(out)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import numpy as np

def predictposSent(model = None, tokenizer = None, sentence= None):
  """Predict one label per word of ``sentence`` with a token-classification model.

  The sentence is tokenised into sub-word pieces, the model is run once, and
  the label predicted for the first piece of each word is returned.

  Args:
    model: Callable model returning a mapping with a ``'logits'`` tensor of
      shape (batch, sequence, labels) and exposing ``config.id2label``.
    tokenizer: Callable tokenizer whose result provides ``word_ids()`` and
      ``to(device)`` and can be unpacked as keyword arguments for the model.
    sentence: The text to tag.

  Returns:
    list: Label strings, one per word, in sentence order.
  """
  encoded = tokenizer(sentence, return_tensors='pt')

  # A word may be split into several pieces; remember only the position of the
  # first piece of each word. Special tokens have word id None and are skipped.
  first_piece_positions = []
  previous_word_id = None
  for position, word_id in enumerate(encoded.word_ids()):
    if word_id is not None and word_id != previous_word_id:
      first_piece_positions.append(position)
    previous_word_id = word_id

  # Inference only: no need to record operations for autograd.
  with torch.no_grad():
    outputs = model(**encoded.to(device))

  # Pick the best label per position and select the single sentence in the
  # batch explicitly. Unlike squeeze(), indexing keeps a one-dimensional
  # result even when the sequence has length 1.
  label_ids = outputs['logits'].argmax(dim=-1)[0].tolist()

  return [model.config.id2label[label_ids[position]] for position in first_piece_positions]


# Tokenizer and model are loaded from different identifiers.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

modelpath = "prakod/en-hi-pos-tagger-symcom"
model = AutoModelForTokenClassification.from_pretrained(modelpath)
# Move the model to the device chosen at the top of the notebook.
model.to(device)

sentence ="मैं Hyderabaed में movie देखने जा रहा हूँ"

# Tag the example sentence. The result is stored but not displayed by this cell.
tags_normalised = predictposSent(model = model, 
                                 tokenizer= tokenizer, 
                                 sentence = sentence)

