# Dataset Loading and Exploratory Data Analysis (EDA)

In [1]:


import numpy as np 
import pandas as pd 


import os
# Lazily build the full path of every file found under the input directory,
# then print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('../data/input')
    for name in names
)
for path in input_files:
    print(path)


../data/input/NER_Training_Data.xlsx


In [2]:
# Load the NER training spreadsheet into a DataFrame and preview its first 10 rows.
df = pd.read_excel('../data/input/NER_Training_Data.xlsx')
df.head(10)

Unnamed: 0,Messege,username,authorname,Address,bookname,productname,publishername,categoryname,brandname,date,phoneno,orderno,trnx ID
0,ami ei book ta nite chai,,,,,,,,,,,,
1,Boi kinte cai ami,,,,,,,,,,,,
2,"Helo My Name is Rajib, Cell: 1744898066. I nee...",Rajib,"Language Guru, Amy Gillett",,"English Conversation Made Natural, Speak Engli...",,,,,,1744898000.0,,
3,Apnara ki IELTS book sell koren,,,,IELTS,,,,,,,,
4,পদ্মজা black edition tar price koto,,,,পদ্মজা,,,,,,,,
5,amake Higher math 1st paper er guide pdf deoya...,,,,Higher math 1st paper,,,,,,,,
6,সমরেশ মজুমদারের কালবেলা,,সমরেশ মজুমদারের,,কালবেলা,,,,,,,,
7,poddoja black addition hbe.?,,,,poddoja black addition,,,,,,,,
8,Ai boi duta hobe?,,,,,,,,,,,,
9,but cant found this in your website,,,,,,,,,,,,


In [3]:
# Extract the message text column. Note: the column is spelled 'Messege' in the
# source file, and `messages_df` is a pandas Series (not a DataFrame) despite its name.
messages_df = df['Messege']
messages_df

0                                 ami ei book ta nite chai
1                                        Boi kinte cai ami
2        Helo My Name is Rajib, Cell: 1744898066. I nee...
3                          Apnara ki IELTS book sell koren
4                      পদ্মজা black edition tar price koto
                               ...                        
11980      Jafor ikbal r humayun Ahmed ar boi ki pawa jabe
11981        সমরেশ মজুমদারের কালপুরুষ উপন্যাসটার দাম কত,,?
11982     Priyotomo oshukh she by sadat Hossain available?
11983                            Price koto janaben please
11984                      VARSITY A UNIT WRITTEN MEGABOOK
Name: Messege, Length: 11985, dtype: object

## Remove URLs

In [4]:
import pandas as pd
import re

# Matches an http(s) URL: the scheme followed by a run of ASCII URL characters.
# The character class is spelled out explicitly (letters, digits and URL
# punctuation) and includes '#' and '~' so that fragments ("...#section") and
# tilde paths ("/~user") are removed together with the rest of the URL instead
# of being left behind as residue. Non-ASCII text (e.g. Bengali) directly
# after a URL is deliberately NOT consumed.
_URL_PATTERN = re.compile(
    r"https?://[A-Za-z0-9!#$%&'()*+,\-./:;<=>?@\[\\\]^_~]+"
)


def remove_urls(text):
    """Remove http(s) URLs from a message and normalize its whitespace.

    Args:
        text: A single message value. Missing values (None / NaN) are
            returned unchanged; any other value is converted with ``str``.

    Returns:
        The message with every URL removed, runs of whitespace collapsed to
        a single space and leading/trailing whitespace stripped, or the
        original value when it is missing.
    """
    if pd.isna(text):
        return text

    without_urls = _URL_PATTERN.sub('', str(text))

    # Removing a URL can leave doubled or dangling spaces; tidy them up.
    return re.sub(r'\s+', ' ', without_urls).strip()

# Strip URLs from every message; missing values pass through unchanged.
messages_df = messages_df.apply(remove_urls)

messages_df.head(10)

0                             ami ei book ta nite chai
1                                    Boi kinte cai ami
2    Helo My Name is Rajib, Cell: 1744898066. I nee...
3                      Apnara ki IELTS book sell koren
4                  পদ্মজা black edition tar price koto
5    amake Higher math 1st paper er guide pdf deoya...
6                              সমরেশ মজুমদারের কালবেলা
7                         poddoja black addition hbe.?
8                                    Ai boi duta hobe?
9                  but cant found this in your website
Name: Messege, dtype: object

## Remove Empty Strings

In [5]:
import pandas as pd
import re

def clean_messages(messages_df):
    """Drop missing and blank messages after stripping URLs.

    Args:
        messages_df: pandas Series of raw message values.

    Returns:
        A Series of strings containing only the messages that still have
        non-whitespace content once URLs are removed. The original index
        labels of the surviving rows are preserved.
    """
    # Drop missing values first: str(NaN) is the literal text 'nan' (and
    # str(None) is 'None'), which would otherwise survive the blank filter
    # below and be treated as a real message.
    present = messages_df.dropna()

    # Remove URLs
    without_urls = present.astype(str).str.replace(r'http[s]?://\S+', '', regex=True)

    # Remove empty strings and whitespace-only strings
    return without_urls[without_urls.str.strip() != '']

messages_df = clean_messages(messages_df)

# Report a short preview and the surviving row count with a single print call.
print(
    "Sample messages:",
    messages_df.head(),
    f"\nTotal messages after cleaning: {len(messages_df)}",
    sep="\n",
)

Sample messages:
0                             ami ei book ta nite chai
1                                    Boi kinte cai ami
2    Helo My Name is Rajib, Cell: 1744898066. I nee...
3                      Apnara ki IELTS book sell koren
4                  পদ্মজা black edition tar price koto
Name: Messege, dtype: object

Total messages after cleaning: 11923


## Null Value Check

In [6]:
# Count missing values. clean_messages returns only string values, so this is expected to be 0.
messages_df.isna().sum()

0

## Duplicate checking

In [7]:
# Count messages that repeat an earlier message (the first occurrence is not counted).
messages_df.duplicated().sum()

3

## Remove Duplicates

In [8]:
# Rebind to the de-duplicated result rather than mutating in place: the Series
# was derived by filtering, and in-place edits on derived objects hide state
# changes from the reader and can trigger pandas copy warnings.
messages_df = messages_df.drop_duplicates()

## Convert Series to List

In [9]:
# Materialize the cleaned Series as a plain Python list of messages.
messages = messages_df.tolist()

# Display 10 messages from top
messages[:10]

['ami ei book ta nite chai',
 'Boi kinte cai ami',
 'Helo My Name is Rajib, Cell: 1744898066. I need two books below: 1. "English Conversation Made Natural" by Language Guru, 2. Speak English Like an American" by Amy Gillett,',
 'Apnara ki IELTS book sell koren',
 'পদ্মজা black edition tar price koto',
 'amake Higher math 1st paper er guide pdf deoya zabe',
 'সমরেশ মজুমদারের কালবেলা',
 'poddoja black addition hbe.?',
 'Ai boi duta hobe?',
 'but cant found this in your website']

# Load Model

In [10]:
from dotenv import load_dotenv
# Kaggle alternative (disabled): read the key from Kaggle user secrets instead of the environment.
#from kaggle_secrets import UserSecretsClient
#user_secrets = UserSecretsClient()
# Load variables from a .env file into the process environment.
load_dotenv()


#api_key = user_secrets.get_secret("OPENAI_API_KEY")
# Read the OpenAI key from the environment; os.getenv returns None when the variable is unset.
api_key = os.getenv("OPENAI_API_KEY")


In [11]:
from openai import OpenAI

# Create the OpenAI client, authenticated with the key read from the environment.
client = OpenAI(api_key=api_key)

## Define Structured Output Format

In [12]:
from pydantic import BaseModel, Field
from typing import Optional

# Structured-output schema for one customer message. Every field is an optional
# string that defaults to None when no value is provided for it.
# The fields mirror the columns of the training spreadsheet; the sheet's
# 'Messege' and 'trnx ID' columns appear here as `Message` and `trnx_ID`.
class NER(BaseModel):
    # The analyzed text itself.
    Message: Optional[str] = Field(None, description="The message or text to be analyzed.")
    # People and places mentioned by the customer.
    username: Optional[str] = Field(None, description="The user's name entity.")
    authorname: Optional[str] = Field(None, description="The name of the author.")
    Address: Optional[str] = Field(None, description="The address entity.")
    # Catalogue-related entities.
    bookname: Optional[str] = Field(None, description="The name of the book.")
    productname: Optional[str] = Field(None, description="The name of the product.")
    publishername: Optional[str] = Field(None, description="The name of the publisher.")
    categoryname: Optional[str] = Field(None, description="The name of the category.")
    brandname: Optional[str] = Field(None, description="The name of the brand.")
    # Dates and identifiers; all kept as strings.
    date: Optional[str] = Field(None, description="Date information.")
    phoneno: Optional[str] = Field(None, description="The phone number entity.")
    orderno: Optional[str] = Field(None, description="The order number entity.")
    trnx_ID: Optional[str] = Field(None, description="The transaction ID entity.")



# Test a Single Message and Extract Named Entities

In [13]:
# Inspect the sample message (index 2) used for the single-message test.
messages[2]

'Helo My Name is Rajib, Cell: 1744898066. I need two books below: 1. "English Conversation Made Natural" by Language Guru, 2. Speak English Like an American" by Amy Gillett,'

In [14]:
# System instruction sent with every extraction request.
# NOTE(review): the prompt text contains typos ("expart", "will Bengla"). It is
# sent to the model verbatim, so correcting it changes model input and should be
# followed by re-checking extraction quality.
system_prompt = "You are expart to extract the user information with NER. User provided messages will Bengla, English or Both"

In [15]:

# Conversation for the single-message test: system instruction plus the sample message.
sample_conversation = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"This is user message:\n{messages[2]}"},
]

# Request a completion parsed directly into the NER schema.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=sample_conversation,
    response_format=NER,
)

# The parsed NER object of the first choice; shown as the cell output.
usr_ner = completion.choices[0].message.parsed
usr_ner

NER(Message='Helo My Name is Rajib, Cell: 1744898066. I need two books below: 1. "English Conversation Made Natural" by Language Guru, 2. Speak English Like an American" by Amy Gillett,', username='Rajib', authorname=None, Address=None, bookname='English Conversation Made Natural, Speak English Like an American', productname=None, publishername=None, categoryname=None, brandname=None, date=None, phoneno='1744898066', orderno=None, trnx_ID=None)

### Convert It to a Dictionary

In [16]:
import json

# Attribute names of the parsed result, in the order they appear in the JSON output.
ner_fields = (
    "Message",
    "username",
    "authorname",
    "Address",
    "bookname",
    "productname",
    "publishername",
    "categoryname",
    "brandname",
    "date",
    "phoneno",
    "orderno",
    "trnx_ID",
)

# Copy each attribute of the parsed result into a plain dictionary.
ner_dict = {field: getattr(usr_ner, field) for field in ner_fields}

# ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
ner_json = json.dumps(ner_dict, ensure_ascii=False, indent=4)
print(ner_json)

{
    "Message": "Helo My Name is Rajib, Cell: 1744898066. I need two books below: 1. \"English Conversation Made Natural\" by Language Guru, 2. Speak English Like an American\" by Amy Gillett,",
    "username": "Rajib",
    "authorname": null,
    "Address": null,
    "bookname": "English Conversation Made Natural, Speak English Like an American",
    "productname": null,
    "publishername": null,
    "categoryname": null,
    "brandname": null,
    "date": null,
    "phoneno": "1744898066",
    "orderno": null,
    "trnx_ID": null
}


In [20]:
import os

from tqdm import tqdm

# Entity attributes copied from each parsed NER result, in CSV column order.
entity_fields = [
    "username",
    "authorname",
    "Address",
    "bookname",
    "productname",
    "publishername",
    "categoryname",
    "brandname",
    "date",
    "phoneno",
    "orderno",
    "trnx_ID",
]

parsed_events = []
csv_file = "../data/output/users_NER.csv"

try:
    for message in tqdm(messages[:1000], desc="Processing User Messages"):
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"This is user message:\n{message}"},
            ],
            response_format=NER,
        )
        # `parsed` can be None (e.g. when the model refuses to answer); the
        # getattr default below then records the message with empty entities
        # instead of raising AttributeError and aborting the whole run.
        usr_message = completion.choices[0].message.parsed

        # Keep the original message text rather than the model's echo of it.
        usr_message_dict = {"Message": message}
        for name in entity_fields:
            usr_message_dict[name] = getattr(usr_message, name, None)

        parsed_events.append(usr_message_dict)
finally:
    # Always persist what has been collected so far: a failure part-way through
    # a long (and billed) run must not throw away the rows already extracted.
    df = pd.DataFrame(parsed_events, columns=["Message", *entity_fields])

    # Save the DataFrame to a CSV file, creating the output folder if needed.
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)
    df.to_csv(csv_file, index=False)

# Only reached when every message was processed without an exception.
print(f"CSV file '{csv_file}' created successfully.")


Processing User Messages: 100%|██████████| 1000/1000 [40:00<00:00,  2.40s/it]

CSV file '../data/output/users_NER.csv' created successfully.





In [21]:
# `df` now holds the NER results built in the previous cell, not the raw Excel data loaded at the top.
df

Unnamed: 0,Message,username,authorname,Address,bookname,productname,publishername,categoryname,brandname,date,phoneno,orderno,trnx_ID
0,ami ei book ta nite chai,,,,,book,,,,,,,
1,Boi kinte cai ami,,,,,Boi,,,,,,,
2,"Helo My Name is Rajib, Cell: 1744898066. I nee...",Rajib,,,"""English Conversation Made Natural"", ""Speak En...",,,,,,1744898066,,
3,Apnara ki IELTS book sell koren,,,,IELTS book,,,,,,,,
4,পদ্মজা black edition tar price koto,,,,,পদ্মজা black edition,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ei book tar English pawa jabe?,,,,,book,,,,,,,
996,বেসরকারি শিক্ষক নিবন্ধন লিখিত বই পাওয়া যাবে?,,,,বেসরকারি শিক্ষক নিবন্ধন লিখিত বই,,,,,,,,
997,shundoR uponnash er boi dekhte chai,,,,shundoR uponnash,,,boi,,,,,
998,But 299tk order e notebook ta paini,,,,,notebook,,,,,,299tk,
