In [1]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# CodeMixed Sentence Object

The `CodeMixSentence` object serves as a comprehensive container for analyzing and quantifying code-mixing. It encapsulates:
- structural components of a sentence (tokens, language tags, part-of-speech tags) 
- various metrics that measure the degree and patterns of language mixing, such as Code-Mixing Index (CMI), burstiness, language entropy, and SyMCoM scores. 

The CodeMixSentence object provides a standardized framework for computing and storing code-mixing metrics, which is essential for several reasons:

- Standardisation: Having a standardized format, capturing necessary attributes of a code-mixed sentence, makes it easier to share and compare results across different datasets, and across the research community.

- Reproducibility & Consistency: By encapsulating all the metrics (CMI, burstiness, I-index, language entropy, etc.) in a single object with well-defined computation methods, it ensures that one can reproduce the same results when analyzing code-mixed text.

- Extensibility: The object provides a clear structure for adding new metrics or modifying existing ones while maintaining compatibility with the established framework. This is important as the field of code-mixing research continues to evolve.


The CodeMixSentence object provides a common ground for analysis.

In [2]:
tokens = ['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"']
LID_Tags = ['en', 'en', 'hi', 'hi', 'hi', 'hi', 'univ', 'univ', 'ne', 'univ', 'ne', 'univ']
PoS_Tags = ['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT']

lang_tags = ['hi', 'en']
other_tags = ['univ', 'ne', 'acro']

In [3]:
# from codemix.cs_metrics import CodeMixSentence
from codemixtoolkit import CodeMixSentence
codemixed_sentence = CodeMixSentence(lang_tagset = lang_tags, 
                                    other_tagset = other_tags, 
                                    l1 = 'en', 
                                    l2 = 'hi', 
                                    sentence = None, 
                                    tokens = tokens, 
                                    LID_Tags = LID_Tags, 
                                    PoS_Tags = PoS_Tags)

In [4]:
print(codemixed_sentence)

CodeMixSentenceCombined(
    lang_tagset=['hi', 'en'],
    other_tagset=['univ', 'ne', 'acro'],
    l1='en',
    l2='hi',
    sentence='None',
    tokens=['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"'],
    LID_Tags=['en', 'en', 'hi', 'hi', 'hi', 'hi'],
    PoS_Tags=['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT'],
    length=12,
    cmi=None,
    burstiness=None,
    i_index=None,
    lang_entropy=None,
    mindex=None,
    spavg=None,
    symcom_sentence=None
)


# CodeMix Metrics

In [5]:
'''
Computes all the metrics for the code-mixed sentence object which would have necessary fields like LID Tags and PoS Tags.
'''

codemixed_sentence.compute_all_metrics()
print(codemixed_sentence)

CodeMixSentenceCombined(
    lang_tagset=['hi', 'en'],
    other_tagset=['univ', 'ne', 'acro'],
    l1='en',
    l2='hi',
    sentence='None',
    tokens=['Gully', 'cricket', 'चल', 'रहा', 'हैं', 'यहां', '"', '(', 'Soniya', ')', 'Gandhi', '"'],
    LID_Tags=['en', 'en', 'hi', 'hi', 'hi', 'hi'],
    PoS_Tags=['ADJ', 'PROPN', 'VERB', 'AUX', 'AUX', 'ADV', 'PUNCT', 'PUNCT', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT'],
    length=12,
    cmi=33.333333333333336,
    burstiness=-0.3592455179659185,
    i_index=0.2,
    lang_entropy=0.9182958340544896,
    mindex=0.7999999999999999,
    spavg=1,
    symcom_sentence=0.6666666666666666
)


In [7]:
# from codemix.cs_metrics import CodeMixMetrics
from codemixtoolkit import CodeMixMetrics

# Compute CMI (Code-Mixing Index)
cmi_score = CodeMixMetrics.compute_cmi(lid_tags=LID_Tags, lang_tagset=['en', 'hi'])
print(f"CMI Score: {cmi_score}")


m_index = CodeMixMetrics.compute_mindex(lid_tags=LID_Tags, lang_tagset=['en', 'hi'])
print(f"M-index: {m_index}")


# Compute Language Entropy
entropy = CodeMixMetrics.compute_lang_entropy(lid_tags=LID_Tags, lang_tagset=['en', 'hi'])
print(f"Language Entropy: {entropy}")  

# Compute I-index
i_index = CodeMixMetrics.compute_i_index(lid_tags=LID_Tags, other_tagset=other_tags)
print(f"I-index: {i_index}")  

# Compute Burstiness
burstiness = CodeMixMetrics.compute_burstiness(lid_tags=LID_Tags, other_tagset=other_tags)
print(f"Burstiness: {burstiness}") 



from collections import Counter
LID_count_map = dict(Counter(LID_Tags).most_common())
PoS_count_map = dict(Counter(PoS_Tags).most_common())

lid_pos_combined = [pos + "_" + lid for lid, pos in zip(LID_Tags, PoS_Tags)]
LID_POS_count_map = dict(Counter(lid_pos_combined).most_common())

# Compute the per-POS-tag SyMCoM scores from the POS counts and the
# combined "<POS>_<LID>" counts built above.
symcom_pos = CodeMixMetrics.compute_symcom_pos_tags(poS_count_map=PoS_count_map, 
                                                    lid_pos_count_map=LID_POS_count_map, 
                                                    l1='en', l2='hi')
print(f"SyMCoM POS scores: {symcom_pos}")  # Recorded output for this example: {'PROPN_symcom': 1.0, 'AUX_symcom': -1.0, 'ADJ_symcom': 1.0, 'VERB_symcom': -1.0, 'ADV_symcom': -1.0}

# Compute the overall (sentence-level) SyMCoM score.
# `length` is the total number of LID tags (12 for this example), i.e. it
# also counts the 'univ' / 'ne' tokens.
symcom_score = CodeMixMetrics.compute_symcom_sentence(poS_count_map=PoS_count_map, 
                                                      lid_pos_count_map = LID_POS_count_map, 
                                                      l1 = 'en', 
                                                      l2 = 'hi', 
                                                      length = len(LID_Tags))
print(f"Overall SyMCoM score: {symcom_score}")  # Recorded output for this example: ~0.67





CMI Score: 33.333333333333336
M-index: 0.7999999999999999
Language Entropy: 0.9182958340544896
I-index: 0.18181818181818182
Burstiness: -0.3592455179659185
SyMCoM POS scores: {'PROPN_symcom': 1.0, 'AUX_symcom': -1.0, 'ADJ_symcom': 1.0, 'VERB_symcom': -1.0, 'ADV_symcom': -1.0}
Overall SyMCoM score: 0.6666666666666666


# Visualization

In [8]:
from codemixtoolkit import codemix_viz as cv

annotation_printer = cv.AnnotatedTextPrinter()

In [9]:
annotation_printer.print_sample_st_annot_text(tokens, LID_Tags, PoS_Tags)

In [10]:
annotation_printer.export_html(file_name="codemix_visualization_test.html")

HTML file saved as codemix_visualization_test.html


# NER Tagger

In [11]:
from codemixtoolkit.models import NERtagger

ner_tagger = NERtagger(model_path="ai4bharat/IndicNER", 
                       tokenizer_path="ai4bharat/IndicNER")
ner_tagger.load_model_tokenizer()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
predicted_ner_tags = ner_tagger.predict_ner_sentence(sentence)
predicted_ner_tags

[False, 'ne', False, False, False, False, False, False]

In [12]:
ner_tagger = NERtagger(model_path="ai4bharat/IndicNER", 
                       tokenizer_path="ai4bharat/IndicNER",
                       finegrain_labels=True)
ner_tagger.load_model_tokenizer()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
predicted_ner_tags = ner_tagger.predict_ner_sentence(sentence)
predicted_ner_tags

['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O']

# PoS Tagger

In [13]:
from codemixtoolkit.models import PoSTagger

postagger = PoSTagger(model_path="prakod/en-hi-pos-tagger-symcom", 
                      tokenizer_path="xlm-roberta-base")
postagger.load_model_tokenizer()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
predicted_pos_tags = postagger.predict_pos_sentence(sentence)
print("Predicted POS tags:", predicted_pos_tags)


Predicted POS tags: ['PRON', 'PROPN', 'ADP', 'NOUN', 'VERB', 'VERB', 'AUX', 'AUX']


# Normalisation / Romanisation

In [14]:
from codemixtoolkit.models import Romanizer
romanizer = Romanizer()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
romanizer.romanize_text(sentence)


Initializing Multilingual model for transliteration


  state = torch.load(f, map_location=torch.device("cpu"))
Loading dicts into RAM: 100%|██████████| 1/1 [00:00<00:00, 23.13it/s]


'main hyderabaed main movie dekhane jaa rahaa hoon'

# Unicode Based LID

In [15]:
from codemixtoolkit.models import UnicodeLIDtagger

lid_tagger = UnicodeLIDtagger()
sentence = "मैं Hyderabaed में movie देखने जा रहा हूँ"
ner_predictions = [False, 'ne', False, False, False, False, False, False]
lid_tags = lid_tagger.get_unicode_lid_predictions(sentence,
                                      ner_predictions = ner_predictions)
print("LID tags:", lid_tags)


LID tags: (['मैं', 'Hyderabaed', 'में', 'movie', 'देखने', 'जा', 'रहा', 'हूँ'], ['hi', 'ne', 'hi', 'en', 'hi', 'hi', 'hi', 'hi'])


# LID Tagger and Normaliser

In [16]:
from codemixtoolkit.models import CSNLILIDClient

# Initialize the client (defaults to http://localhost:6000)
client = CSNLILIDClient(base_url = "http://localhost:6000")

# Probe the service once and reuse the answer, so the status line printed
# below and the branch taken cannot disagree (and only one probe is sent).
service_available = client.is_service_available()

print("--------------------------------")
print(f"CSNLI Service is available?: {service_available}")
print("--------------------------------")

# Kept so that any other cell relying on the `pprint` name keeps working.
from pprint import pprint

if service_available:
    # Process text and get results as a dictionary
    result = client.get_lid("i thght mosam dfrnt hoga bs fog h")
    # Plain print(): pprint() on a string shows its repr, quotes included.
    print("CSNLI API Result: ")
    print(f"Sentence: {result['text_str']}")
    print(f"Tokenized: {result['text_tokenized']}")
    print(f"Normalized: {result['norm_text']}")
    print(f"LID: {result['lid']}")
    print("--------------------------------")
else:
    print("CSNLI Service is not available. Please check the service is running and the base_url is correct.")


--------------------------------
CSNLI Service is available?: False
--------------------------------
CSNLI Service is not available. Please check the service is running and the base_url is correct.


In [17]:
# Check if service is available
if client.is_service_available():
    # Process text and print results as a pandas DataFrame
    print("--------------------------------")
    df = client.get_lid_and_print("i thght mosam dfrnt hoga bs fog h")
    print(f"CSNLI API Result as DataFrame: \n{df}")
    print("--------------------------------")
else:
    print("CSNLI Service is not available. Please check the service is running and the base_url is correct.")


CSNLI Service is not available. Please check the service is running and the base_url is correct.


# Rule Based Synthetic Code-Mixed Sentence Generation - NOUN, ADJ Replacement

In [19]:
from codemixtoolkit import tokenize_pos_awesome_align as tpa

filename = "unique_utterances_en_hi_transltions.json"
df_codemixed = tpa.get_codemix_candidates_for_file(filename)
df_codemixed

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /home/prashantk/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /home/prashantk/stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /home/prashantk/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /home/prashantk/stanza_resources
0it [00:00, ?it/s]INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /home/prashantk/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package     |
---------------------------
| tokenize  | hdtb        |
| pos       | hdtb_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!
1it [00:03,  3.74s/it]
1it [00:00, 22.29it/s]
1it [00:00, 5857.97it/s]

done





Unnamed: 0,en,hi,lang1,lang1_tokens,lang1_pos,lang2,lang2_tokens,lang2_pos,alignments_awesomealign,token_alignment_map_awesomealign,codemixed-sentences
0,"Okay, how much does it cost?","ठीक है, इसकी लागत कितनी है?",en,"[[Okay, ,, how, much, does, it, cost, ?]]","[[INTJ, PUNCT, ADV, ADV, AUX, PRON, VERB, PUNCT]]",hi,"[[ठीक, है, ,, इसकी, लागत, कितनी, है, ?]]","[[ADJ, AUX, PUNCT, PRON, NOUN, DET, AUX, PUNCT]]",[0-0 0-1 1-2 2-5 3-4 4-6 5-3 6-4 7-7 ],"[{'Okay': 'है', 'ठीक': 'Okay', 'है': 'does', '...","Okay है , इसकी cost कितनी है ?"


# Synthetic Code-Mix Sentences Generated - GCM

- Ensure you are in the "library" folder

- Run these commands:
 ```
 >>> export FLASK_APP=gcmgenerator
 >>> flask run -h 0.0.0.0 -p 6000
 ```
- (change port and host details as required)


In [None]:
import requests

In [None]:
import docker

def is_image_running(image_name="prakod/gcm-codemix-generator"):
    """Report whether a running Docker container uses the given image.

    Args:
        image_name (str): Image name to look for. It is matched as a
            substring of each image tag, so "repo/name" also matches
            "repo/name:latest".

    Returns:
        bool: True if at least one running container's image carries a tag
        containing ``image_name``; False otherwise (including when no
        container is running or the image has no tags).
    """
    client = docker.from_env()
    try:
        # containers.list() with no arguments returns running containers only.
        for container in client.containers.list():
            # An image may carry several tags (or none). Check every tag:
            # the matching one is not guaranteed to be first in the list.
            if any(image_name in tag for tag in container.image.tags):
                return True
        return False
    finally:
        # Release the client's underlying HTTP session/connection pool.
        client.close()


In [None]:
if is_image_running(image_name="prakod/gcm-codemix-generator"):
    print("The image is running.")
else:
    print("The image is not running.")

The image is running.


In [None]:
# Check APIs are reachable

import requests

def check_api_reachable(url, method='GET', timeout=5, payload=None):
    """
    Probe an HTTP endpoint and report whether it answers with status 200.

    Args:
        url (str): Endpoint to probe.
        method (str): 'POST' (case-insensitive) sends a POST request with
            ``payload`` as the JSON body; any other value sends a GET.
        timeout (int): Request timeout in seconds.
        payload (dict): JSON body for the POST request; ignored for GET.
    Returns:
        bool: True only when the request completes with status code 200.
        A different status code or a ``requests`` exception yields False,
        and the reason is printed.
    """
    wants_post = method.upper() == 'POST'

    try:
        response = (
            requests.post(url, json=payload, timeout=timeout)
            if wants_post
            else requests.get(url, timeout=timeout)
        )
    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts, invalid URLs, ...
        print(f"API is not available: {e}")
        return False

    if response.status_code != 200:
        print(f"API returned status code: {response.status_code}")
        return False
    return True



## ALIGNER

- Sentences are passed here, and the alignment is generated.

In [None]:
# alignment generation
l1 = "यदि आप तुरंत डॉक्टर से संपर्क करें"
l2 = "contact the doctor immediately if you"
td = {'l1':l1, 'l2':l2}
alignment_api_endpoint = "http://127.0.0.1:6000/statistical_aligner_enhi"



<Response [200]>
{'alignment': '0-4 1-5 2-3 3-2 4-0 5-0', 'l1': 'यदि आप तुरंत डॉक्टर से संपर्क करें', 'l2': 'contact the doctor immediately if you'}


In [None]:
# alignment generation
l1 = "यदि आप तुरंत डॉक्टर से संपर्क करें"
l2 = "contact the doctor immediately if you"
td = {'l1':l1, 'l2':l2}
alignment_api_endpoint = "http://127.0.0.1:6000/statistical_aligner_enhi"



<Response [200]>
{'alignment': '0-4 1-5 2-3 3-2 4-0 5-0', 'l1': 'यदि आप तुरंत डॉक्टर से संपर्क करें', 'l2': 'contact the doctor immediately if you'}


## CODE-MIXED SENTENCE GENERATOR

- Using the given sentences and alignment, codemixed sentences are generated

### Expected Outputs

- In case of any error during code-mix sentence generation, the program errors out with the message: 
```
fail
```

- Sometimes it is possible that no alignments can be generated, in which case the program returns an empty array.
- If an alignment error occurs, the generated code-mixed sentence may also skip a few words.


In [None]:
# cm-sentences generation
choice = 2  #choice for language to generate parse trees
data = {
    "lang1": l1,
    "lang2": l2,
    "alignments": alignments,
    "choice": choice
}

gcm_api_endpoint = "http://127.0.0.1:6000/gcm_enhi"
#CODE FOUND IN gcmgenerator.py



In [None]:
# check if the gcm_enhi API is reachable
if not check_api_reachable("http://127.0.0.1:6000/gcm_enhi", method='POST', payload=data, timeout=60):
    print("The gcm_enhi API is not available.")
else:
    print("The gcm_enhi API is reachable.")


The gcm_enhi API is reachable.


In [None]:
# Request code-mixed sentences from the GCM service. The timeout matches the
# 60 s used by the reachability check, so a stalled service cannot hang the
# notebook indefinitely.
response = requests.post(gcm_api_endpoint, json=data, timeout=60)
print(response)
# Surface HTTP errors here instead of as a JSON decoding / KeyError below.
response.raise_for_status()

retdata = response.json()
print("Sentence 1: ", retdata['lang1'])
print("Sentence 2: ", retdata['lang2'])
print("Alignments: ", retdata['alignments'])
# Each entry is one generated candidate, printed as returned by the service.
for cm_sentence in retdata['cm_sentences']:
    print(cm_sentence)

<Response [200]>
Sentence 1:  यदि आप तुरंत डॉक्टर से संपर्क करें
Sentence 2:  contact the doctor immediately if you
Alignments:  ['0-4 1-5 2-3 3-2 4-0 5-0']
[IDX]	0

[L1]	यदि आप तुरंत डॉक्टर से संपर्क करें

[L2]	contact the doctor immediately if you

[L2_Tree]	(ROOT (S (VP (VB contact) (NP (DT the) (NN doctor)) (ADVP (RB immediately)) (SBAR (IN if) (NP (PRP you))))))

Alignments	0-4 1-5 2-3 3-2 4-0 5-0

Theory	ec

[CM]contact the तुरंत if you

[TREE](ROOT (VP_e (VB_e contact) (NP_e (DT_e the)) (ADVP (RB_h तुरंत)) (SBAR (IN_e if) (NP (PRP_e you)))))



[IDX]	0

[L1]	यदि आप तुरंत डॉक्टर से संपर्क करें

[L2]	contact the doctor immediately if you

[L2_Tree]	(ROOT (S (VP (VB contact) (NP (DT the) (NN doctor)) (ADVP (RB immediately)) (SBAR (IN if) (NP (PRP you))))))

Alignments	0-4 1-5 2-3 3-2 4-0 5-0

Theory	ec

[CM]contact the तुरंत if आप

[TREE](ROOT (VP_e (VB_e contact) (NP_e (DT_e the)) (ADVP (RB_h तुरंत)) (SBAR (IN_e if) (NP (PRP_h आप)))))



[IDX]	0

[L1]	यदि आप तुरंत डॉक्टर से संपर्क

# Data Module

In [20]:
from codemixtoolkit.data import (
    DATASET_REGISTRY,
    LanguagePair,
    TaskType,
    DatasetInfo
)

In [21]:
# 1. List all available datasets
print("Available datasets:")
print(DATASET_REGISTRY.list_datasets())

Available datasets:
['AcceptabilityEnHiClineGCM', 'AcceptabilityEnHiClineOSN', 'ToDDialogXRISAWOZ', 'SentimentEnHiPrabhuEtAl', 'SentimentEnHiGLUECOS', 'SentimentEnHiSentiMix', 'HateSpeechEnHiBohraEtAl']


In [22]:
# 2. List datasets by task type
print("\nDatasets for sequence classification:")
print(DATASET_REGISTRY.list_datasets_by_task(TaskType.SEQUENCE_CLASSIFICATION))



Datasets for sequence classification:
['ToDDialogXRISAWOZ', 'SentimentEnHiPrabhuEtAl', 'SentimentEnHiGLUECOS', 'SentimentEnHiSentiMix', 'HateSpeechEnHiBohraEtAl']


In [23]:
# 3. List datasets by language pair
print("\nDatasets for English-Hindi:")
print(DATASET_REGISTRY.list_datasets_by_languagepair(LanguagePair.EN_HI))



Datasets for English-Hindi:
-----------DATASET REGISTRY - START---------------------
AcceptabilityEnHiClineGCM
AcceptabilityEnHiClineOSN
ToDDialogXRISAWOZ
SentimentEnHiPrabhuEtAl
SentimentEnHiGLUECOS
SentimentEnHiSentiMix
HateSpeechEnHiBohraEtAl
-----------DATASET REGISTRY - END---------------------



In [24]:
# 4. Load a specific dataset
# Example: Loading the hate speech dataset from Bohra et al. (the variable name `sentiment_dataset` is kept because later cells use it)
sentiment_dataset = DATASET_REGISTRY.get_dataset("HateSpeechEnHiBohraEtAl")
data = sentiment_dataset.load()


Using the latest cached version of the dataset since prakod/hate_speech_enhi_bohraetal couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/prashantk/.cache/huggingface/datasets/prakod___hate_speech_enhi_bohraetal/default/0.0.0/84d0496a1b0c26faa15fb6f93e4ae697fa488438 (last modified on Wed Feb 12 19:34:01 2025).


In [25]:
# 5. Get dataset information
dataset_info = sentiment_dataset.get_info()
print("\nDataset Information:")
print(f"Name: {dataset_info.name}")
print(f"Task Type: {dataset_info.task_type}")
print(f"Language Pair: {dataset_info.language_pair}")
print(f"Input Fields: {dataset_info.input_fields}")
print(f"Label Fields: {dataset_info.label_fields}")
print(f"Metrics: {dataset_info.metrics}")
print(f"Description: {dataset_info.description}")
print(f"Reference: {dataset_info.reference}")



Dataset Information:
Name: hate-speech-en-hi-bohra-et-al
Task Type: TaskType.SEQUENCE_CLASSIFICATION
Language Pair: LanguagePair.EN_HI
Input Fields: ['text']
Label Fields: ['label']
Metrics: ['accuracy']
Description: Hate Speech Dataset - En-Hi - Bohra et al.
Reference: https://arxiv.org/abs/2405.05572


In [26]:
# 6. Working with the loaded data
# The data is returned as a HuggingFace datasets object keyed by split name.
# Index a split first (e.g. 'train'), then a row number, to get one example as a dict.
print("\nSample data from sentiment dataset:")
print(data['train'][0])  # Print first example from training set


Sample data from sentiment dataset:
{'text': 'Ek CM Gaaye Ki Seva Mai laga hua hai, aur baaki 3 CM #Padmavaat ko ban kraane Mai lage hue. Sabse jaada beshram, Haryana CM, Jo har Baar mauka deta hai sabko, kuch na kuch bolne pr, chaiye voh Ram Rahim ho, Brutal Rape ho ya Padmavati ho. I hate these 4 CMs of @BJP4India #Useless', 'label': 0, 'label_text': 'hate', 'label_id_str': 0}


# Evaluation Module

In [27]:
from codemixtoolkit.evaluation import (
    LLMEvaluator,
    PerplexityEvaluator,
    EvaluationMetrics
)
from codemixtoolkit.data import DATASET_REGISTRY

from codemixtoolkit.config import config


In [28]:
# Load the dataset
dataset_obj = DATASET_REGISTRY.get_dataset("HateSpeechEnHiBohraEtAl")

# Load the dataset with HuggingFace token
dataset_data = dataset_obj.load(token=config.HUGGINGFACE_API_KEY)


Using the latest cached version of the dataset since prakod/hate_speech_enhi_bohraetal couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/prashantk/.cache/huggingface/datasets/prakod___hate_speech_enhi_bohraetal/default/0.0.0/84d0496a1b0c26faa15fb6f93e4ae697fa488438 (last modified on Wed Feb 12 19:34:01 2025).


In [29]:
# Define label mapping for the dataset

# Build {label name -> integer id} from the train split's "label" feature.
label_feature = dataset_data["train"].features["label"]
dataset_label_mapping = {}
for label in label_feature.names:
    dataset_label_mapping[label] = label_feature.str2int(label)


instruction_label_mapping = {"hate_speech": 0, "not_hate_speech": 1}

print(f"Dataset label mapping: {dataset_label_mapping}")
print(f"Instruction label mapping: {instruction_label_mapping}")


Dataset label mapping: {'hate': 0, 'non-hate': 1}
Instruction label mapping: {'hate_speech': 0, 'not_hate_speech': 1}


In [30]:
# Select a sample from the dataset
sample = dataset_data["train"][0]
print("-------------------------------- ")
print("\nSample text:", sample["text"])
print(f"True label: {sample['label']}")
print(f"Label as string: {sample['label_text']}")
print("--------------------------------")

-------------------------------- 

Sample text: Ek CM Gaaye Ki Seva Mai laga hua hai, aur baaki 3 CM #Padmavaat ko ban kraane Mai lage hue. Sabse jaada beshram, Haryana CM, Jo har Baar mauka deta hai sabko, kuch na kuch bolne pr, chaiye voh Ram Rahim ho, Brutal Rape ho ya Padmavati ho. I hate these 4 CMs of @BJP4India #Useless
True label: 0
Label as string: hate
--------------------------------


In [31]:
# 1. LLM-based Evaluation Example
# Initialize a zero-shot LLM evaluator for hate speech classification

zero_shot_evaluator = LLMEvaluator(
        tasktype="sequence_classification",
        task="hate_speech",
        name="zero_shot_hate_speech",
        model="openrouter/meta-llama/llama-3.3-70b-instruct:free",
        temperature=0.1,
        max_tokens=10,
        instruction="""You are a hate speech classifier. Given a text, classify it as either 'hate speech' or 'not hate speech'.
Hate speech is defined as any form of expression that attacks or uses pejorative or discriminatory language with reference to a person or a group on the basis of who they are, in other words, based on their religion, ethnicity, nationality, race, color, descent, gender or other identity factor.

Respond with only one word: either 'hate_speech' or 'not_hate_speech'.""",
        dataset_label_mapping=dataset_label_mapping,
        instruction_label_mapping=instruction_label_mapping,
    )


In [32]:
# Get the prediction for the sample and compare it with the ground truth using the zero-shot evaluator
zero_shot_result = zero_shot_evaluator.evaluate_sample(
    sample["text"], sample["label"]
)


Response: hate_speech
Prediction with label mapping: 0
Correct Prediction! Prediction 0 matches ground truth 0


In [33]:
# Evaluate on 10 samples
print("\nEvaluating on 10 samples...")
dataset_eval_10 = zero_shot_evaluator.evaluate_dataset(
    dataset_obj,
    max_eval_samples=10,  # Limit to 10 samples
)


Evaluating on 10 samples...


Evaluating dataset: 100%|██████████| 10/10 [00:16<00:00,  1.64s/it]


In [34]:
print("Classification Metrics:")
print(dataset_eval_10['metrics'])

Classification Metrics:
{'accuracy': 0.6, 'f1': 0.6380952380952382}
