In [1]:
!pip install mirascope pydantic groq

Collecting mirascope
  Downloading mirascope-1.16.7-py3-none-any.whl.metadata (8.5 kB)
Collecting docstring-parser<1.0,>=0.15 (from mirascope)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting jiter>=0.5.0 (from mirascope)
  Downloading jiter-0.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading mirascope-1.16.7-py3-none-any.whl (309 kB)
Downloading docstring_parser-0.16-py3-none-any.whl (36 kB)
Downloading jiter-0.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (345 kB)
Installing collected packages: jiter, docstring-parser, mirascope
Successfully installed docstring-parser-0.16 jiter-0.8.2 mirascope-1.16.7


In [53]:
# Input corpora, one file per language code; the NER cells read them line by line.
gu_input = "./input/train.gu"
mr_input = "./input/train.mr"
te_input = "./input/train.te"
en_input = "./input/train.en"
hi_input = "./input/train.hi"

# Output files for the NER results, one per language code.
# (te/hi previously used the misspelled extension ".josn".)
gu_output = "./output_sentence_ner/out_gu.json"
mr_output = "./output_sentence_ner/out_mr.json"
te_output = "./output_sentence_ner/out_te.json"
en_output = "./output_sentence_ner/out_en.json"
hi_output = "./output_sentence_ner/out_hi.json"

In [20]:
import ast
import csv
import os
import json
import time
import requests
from groq import Groq


In [None]:


# The key is read from the API_KEY environment variable so that it is never
# stored in the notebook; groq_client is the shared client used by the
# Groq-backed NER functions below.
api_key = os.environ.get("API_KEY")
groq_client = Groq(api_key=api_key)

---
$$
\Large\text{Word-level NER}
$$

In [5]:
def perform_ner_ollama_local(sentence, timeout=300):
    """Run word-level NER on ``sentence`` using a local Ollama server.

    Posts a chat-completion request to ``http://localhost:11434`` (model
    ``llama3.1:latest``) and parses the reply as a JSON object that maps each
    tag (``B-PER``, ``I-PER``, ..., ``Others``) to a list of words.

    Args:
        sentence: Text to annotate.
        timeout: Seconds to wait for the HTTP request before giving up.
            Pass ``None`` to wait indefinitely.

    Returns:
        dict: The parsed tag-to-words mapping, or ``{}`` when the request
        fails, the reply is empty or malformed, or it is not a JSON object.
        A message describing the failure is printed in those cases.
    """
    # The example object must itself be valid JSON (note the colon after
    # "Others"), otherwise the model tends to copy the malformed shape.
    system_prompt = """
                Extract named entities (Person, Organization, Location, Event, Product) from the given sentence.
                and give the output in following format
                {
                    "B-PER": [],
                    "I-PER": [],
                    "B-ORG": [],
                    "I-ORG": [],
                    "B-LOC": [],
                    "I-LOC": [],
                    "B-EVT": [],
                    "I-EVT": [],
                    "B-PROD": [],
                    "I-PROD": [],
                    "Others": []
                }
                all the words which are not named entities should be in Others
                Dont write anything besides response
                """
    url = "http://localhost:11434/v1/chat/completions"
    payload = {
        "model": "llama3.1:latest",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": sentence},
        ],
    }

    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Request failed:", e)
        return {}

    try:
        response_json = response.json()
    except ValueError as e:
        # A body that is not JSON surfaces as a ValueError subclass.
        print("Response body is not valid JSON:", e)
        return {}

    try:
        response_content = response_json["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        print("Unexpected response structure:", response_json)
        return {}

    # The content field may be null or missing text; treat both as empty.
    if not isinstance(response_content, str) or not response_content.strip():
        print("Empty response")
        return {}
    response_content = response_content.strip()

    try:
        ner_data = json.loads(response_content)
    except json.JSONDecodeError:
        print("Error parsing JSON response:")
        print(response_content)
        return {}

    # Callers iterate over .items(), so anything but a JSON object is unusable.
    if not isinstance(ner_data, dict):
        print("Error parsing JSON response:")
        print(response_content)
        return {}
    return ner_data


In [6]:
def perform_ner(sentence):
    """Run word-level NER on ``sentence`` through the Groq chat API.

    Sends the sentence to model ``llama3-8b-8192`` via the module-level
    ``groq_client`` and parses the reply as a JSON object that maps each tag
    (``B-PER``, ``I-PER``, ..., ``Others``) to a list of words.

    Args:
        sentence: Text to annotate.

    Returns:
        dict: The parsed tag-to-words mapping, or ``{}`` when the reply is
        empty, is not valid JSON, or is not a JSON object.

    Note:
        Exceptions raised by the API call itself are not caught here and
        propagate to the caller.
    """
    # The example object must itself be valid JSON (note the colon after
    # "Others"), otherwise the model tends to copy the malformed shape.
    system_prompt = """
                Extract named entities (Person, Organization, Location, Event, Product) into two categories (B and  I) from the given sentence.
                and give the output in following format
                {
                    "B-PER": [],
                    "I-PER": [],
                    "B-ORG": [],
                    "I-ORG": [],
                    "B-LOC": [],
                    "I-LOC": [],
                    "B-EVT": [],
                    "I-EVT": [],
                    "B-PROD": [],
                    "I-PROD": [],
                    "Others": []
                }
                all the words which are not named entities should be in Others
                Dont write anything besides response
                """
    response = groq_client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": sentence},
        ],
    )

    try:
        response_content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        print("Empty response")
        return {}

    # message.content can be None; calling .strip() on it would crash.
    if not isinstance(response_content, str) or not response_content.strip():
        print("Empty response")
        return {}
    response_content = response_content.strip()

    try:
        ner_data = json.loads(response_content)
    except json.JSONDecodeError:
        print(response_content)
        print("Error parsing JSON response")
        return {}

    # Callers iterate over .items(), so anything but a JSON object is unusable.
    if not isinstance(ner_data, dict):
        print(response_content)
        print("Error parsing JSON response")
        return {}
    return ner_data


In [7]:
def save_to_csv(data, filename="ner_output.csv"):
    """Append word-level NER results to a CSV file as ``Entity, Type`` rows.

    Args:
        data: Mapping of entity type (e.g. ``"B-PER"``) to the words carrying
            that tag. Each value is normally a list; a bare string or other
            scalar is written as a single entity, and ``None`` or an empty
            string is skipped.
        filename: CSV file to append to. The ``Entity, Type`` header is
            written only when the file is empty.

    Returns:
        None. Nothing is written when ``data`` is empty or not a dict.
    """
    if not data:
        return
    if not isinstance(data, dict):
        # Model output is not guaranteed to be an object; skip instead of crashing.
        print("Skipping NER result that is not a dict:", data)
        return

    with open(filename, mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if file.tell() == 0:
            writer.writerow(["Entity", "Type"])
        for entity_type, entities in data.items():
            if entities is None or entities == "":
                continue
            if not isinstance(entities, (list, tuple, set)):
                # A bare string would otherwise be split into characters.
                entities = [entities]
            for entity in entities:
                writer.writerow([entity, entity_type])

In [8]:
def ner_and_save_to_csv(input_file, output_file, start=0, end=10000, step=1, delay=2):
    """Run word-level NER over a range of lines and append the results to a CSV.

    Lines ``start`` (inclusive) to ``end`` (exclusive) of ``input_file`` are
    processed in batches of ``step`` consecutive lines; each batch is joined
    into one string, passed to ``perform_ner`` and stored with ``save_to_csv``.

    Note: a later cell defines another function with this same name; after
    that cell runs, the name refers to the later definition.

    Args:
        input_file: UTF-8 text file with one sentence per line.
        output_file: CSV file the results are appended to.
        start: Index of the first line to process.
        end: Index one past the last line to process; values beyond the end
            of the file are clamped to the number of lines.
        step: Number of lines merged into each request.
        delay: Seconds to pause after each request (0 disables the pause).
    """
    with open(input_file, mode="r", encoding="utf-8") as file:
        sentences = file.readlines()

    # Never iterate past the file: an out-of-range slice is empty and would
    # only produce pointless requests with an empty prompt.
    stop = min(end, len(sentences))

    for i in range(start, stop, step):
        merged_sentence = "".join(sentences[i:i + step])
        ner_result = perform_ner(merged_sentence)
        save_to_csv(ner_result, output_file)
        # Progress: number of input lines consumed so far.
        print(min(i + step, stop))
        if delay > 0:
            time.sleep(delay)

---
$$
\Large\text{Sentence-level NER}
$$

In [None]:
def perform_ner_sentence(sentence):
    """Tag every token of ``sentence`` with a NER label via the Groq chat API.

    Sends the sentence to model ``llama3-70b-8192`` through the module-level
    ``groq_client`` and expects a Python/JSON-style literal of the form
    ``[[token, ...], [label, ...]]`` where non-entity tokens are labelled ``0``.

    Args:
        sentence: Text to annotate.

    Returns:
        list: ``[tokens, labels]`` with both lists of equal length, or ``[]``
        when the reply is empty, cannot be parsed, has the wrong shape, or the
        two lists differ in length by more than one.

    Note:
        Exceptions raised by the API call itself are not caught here and
        propagate to the caller.
    """
    response = groq_client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[
            {"role": "system", "content": """
                Extract named entities (Person, Organization, Location, Event, Product) into two categories (B and I) from the given sentence.
                and give the output in the following format.
                Example input: "महेन्द्र साळुंखे यांनी ताज प्रोडक्शन कडून पुण्यातील 'मराठी महोत्सव' मध्ये नवीन स्मार्टवॉच प्रदर्शित केली." 
                Output:
                [
                    ["महेन्द्र", "साळुंखे", "यांनी", "ताज", "प्रोडक्शन", "कडून", "पुण्यातील", "'मराठी महोत्सव'", "मध्ये", "नवीन स्मार्टवॉच", "प्रदर्शित", "केली."],
                    ["B-PER", "I-PER", 0, "B-ORG", "I-ORG", 0, "B-LOC", "B-EVENT", 0, "B-PROD", 0, 0]
                ]
                All words that are not named entities should be marked as 0.
                Strictly output nothing but the response.
                """
            },
            {"role": "user", "content": sentence}
        ]
    )

    try:
        response_content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        print("Empty response")
        return []

    # message.content can be None; calling .strip() on it would crash.
    if not isinstance(response_content, str) or not response_content.strip():
        print("Empty response")
        return []
    response_content = response_content.strip()

    if not response_content.startswith("["):
        # The reply is wrapped in extra text (prose, a code fence, ...).
        # Only attempt recovery when it contains exactly the three bracket
        # pairs of the expected [[...], [...]] shape.
        if response_content.count('[') != 3 or response_content.count(']') != 3:
            print("Invalid response format")
            return []
        # Keep the text from the first '[' through the last ']'.
        start = response_content.index('[')
        end = response_content.rindex(']')
        response_content = response_content[start:end + 1]

    try:
        # literal_eval only accepts literals, so model output is never executed.
        ner_data = ast.literal_eval(response_content)
    except Exception as e:
        print(response_content)
        print("Error parsing response:", e)
        return []

    # Validate that ner_data is a list of exactly two lists.
    if not (isinstance(ner_data, list) and len(ner_data) == 2):
        print("Response format error: Expected a list with two elements.")
        return []

    tokens, labels = ner_data
    if not (isinstance(tokens, list) and isinstance(labels, list)):
        print("Response format error: Both elements must be lists.")
        return []

    if len(tokens) != len(labels):
        diff = len(tokens) - len(labels)
        # NOTE(review): an off-by-one mismatch is patched heuristically, which
        # may misalign tokens and labels; larger mismatches are rejected.
        if diff == 1:
            # tokens list is longer by one; append 0 to labels
            labels.append(0)
        elif diff == -1:
            # labels list is longer by one; insert an empty string at the beginning of tokens
            tokens.insert(0, "")
        else:
            print("Response format error: Tokens and labels length mismatch.")
            return []

    return [tokens, labels]


In [45]:
# Smoke test: run sentence-level NER on one sample sentence and print the result.
# Alternative inputs / calls kept for manual experiments:
# sentence = "Barack Obama visited Microsoft headquarters in Seattle during a tech conference."
# sentence = "તમારા ઘરના નેર મારે એક બગીચો છે."
# print(perform_ner(sentence))
sentence = (
    "Elon Musk is the CEO of Tesla, "
    "and he was born in Pretoria, South Africa."
)
print(perform_ner_sentence(sentence))

[
    ["Elon", "Musk", "is", "the", "CEO", "of", "Tesla,", "and", "he", "was", "born", "in", "Pretoria,", "South", "Africa."],
    ["B-PER", "I-PER", 0, 0, 0, 0, "B-ORG", 0, 0, 0, 0, "B-LOC", "I-LOC", "I-LOC", 0]
]
[['Elon', 'Musk', 'is', 'the', 'CEO', 'of', 'Tesla,', 'and', 'he', 'was', 'born', 'in', 'Pretoria,', 'South', 'Africa.'], ['B-PER', 'I-PER', 0, 0, 0, 0, 'B-ORG', 0, 0, 0, 0, 'B-LOC', 'I-LOC', 'I-LOC', 0]]


In [46]:
def save_to_file(ner_result, output_file):
    """Append one NER result to ``output_file`` as a single JSON line.

    Args:
        ner_result: The value to store (expected to be a list of two lists:
            tokens and labels).
        output_file: Path of the UTF-8 file to append to.

    Non-ASCII characters are written as-is rather than as ``\\u`` escapes.
    """
    with open(output_file, "a", encoding="utf-8") as sink:
        serialized = json.dumps(ner_result, ensure_ascii=False)
        sink.write(f"{serialized}\n")


In [None]:
def ner_and_save_to_csv(input_file, output_file, start=0, end=10000, delay=2):
    """Run sentence-level NER over a range of lines and append each result to a file.

    Each line from ``start`` (inclusive) to ``end`` (exclusive) of
    ``input_file`` is passed to ``perform_ner_sentence``; a successful
    ``[tokens, labels]`` result is stored with ``save_to_file``. Despite the
    name, the output is one JSON document per line, not CSV.

    Progress is printed per line: ``S:  <index>`` on success and
    ``E:  <index>`` when the result is unusable or an error occurred.

    Args:
        input_file: UTF-8 text file with one sentence per line.
        output_file: File the results are appended to.
        start: Index of the first line to process.
        end: Index one past the last line to process; values beyond the end
            of the file are clamped to the number of lines.
        delay: Seconds to pause after each sentence (0 disables the pause).
    """
    with open(input_file, mode="r", encoding="utf-8") as file:
        sentences = file.readlines()

    # Clamp to the file length so the default end never indexes past the data.
    stop = min(end, len(sentences))

    for i in range(start, stop):
        try:
            # The model call is inside the try block so that one failed
            # request is reported and skipped instead of aborting the batch.
            ner_result = perform_ner_sentence(sentences[i])
            if len(ner_result) == 2:
                save_to_file(ner_result, output_file)
                print('S: ', i)
            else:
                print('E: ', i)
        except Exception as e:
            print(e)
            print('E: ', i)
        if delay > 0:
            time.sleep(delay)

In [None]:
# Process lines 0-4 of the mr_input file and append the results to mr_output.
ner_and_save_to_csv(mr_input, mr_output, start=0, end=5)