### Task 1: Ethiomart Data Ingestion and Data Preprocessing

In [1]:
import os
import sys
import pandas as pd
import asyncio
# Resolve the notebook's working directory and the directory that contains it
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Put the parent directory on sys.path (presumably so the `scripts` package
# imported in the next cell can be found — confirm against the repo layout).
# Guarded so that re-running this cell does not append duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Ignore warnings (note: this silences every warning category for the session)
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the TelegramScraper class from the file where it's defined
from scripts.telegram_scraper import TelegramScraper
from scripts.text_preprocessor import  AmharicTextPreprocessor

### Telegram Message Scraping

In [3]:
# Scraper instance used by the connect / fetch / disconnect calls further down
scraper = TelegramScraper()

# Telegram channels to scrape
channels = ["modernshoppingcenter"]

In [4]:
# Define a coroutine that runs the scraper's asynchronous methods in the notebook
async def run_scraper():
    """Connect to Telegram, fetch messages for `channels`, then disconnect.

    Relies on the notebook-level `scraper` and `channels` objects. The fetch
    is requested with ``message_limit=5000`` and
    ``output_file='../data/telegram_messages.csv'``.

    The disconnect is placed in a ``finally`` block so the connection is
    released even when fetching raises; the original exception still
    propagates to the caller.
    """
    await scraper.connect()  # Connect to Telegram
    try:
        # Fetch messages
        await scraper.fetch_messages(
            channels,
            message_limit=5000,
            output_file='../data/telegram_messages.csv',
        )
    finally:
        # Always disconnect, whether or not the fetch succeeded
        await scraper.disconnect()

In [5]:
# Run the scraper coroutine defined in the previous cell. The bare top-level
# `await` is valid here because the notebook's IPython kernel supports awaiting
# directly in a cell; it would be a syntax error in a plain Python script.
await run_scraper()

Connected to Telegram
Joining and fetching messages from modernshoppingcenter...
Saving message 5382 from modernshoppingcenter
Saving message 5381 from modernshoppingcenter
Saving message 5380 from modernshoppingcenter
Saving message 5379 from modernshoppingcenter
Saving message 5378 from modernshoppingcenter
Saving message 5377 from modernshoppingcenter
Saving message 5376 from modernshoppingcenter
Saving message 5375 from modernshoppingcenter
Saving message 5374 from modernshoppingcenter
Saving message 5373 from modernshoppingcenter
Saving message 5372 from modernshoppingcenter
Saving message 5371 from modernshoppingcenter
Saving message 5370 from modernshoppingcenter
Saving message 5369 from modernshoppingcenter
Saving message 5368 from modernshoppingcenter
Saving message 5367 from modernshoppingcenter
Saving message 5366 from modernshoppingcenter
Saving message 5365 from modernshoppingcenter
Saving message 5364 from modernshoppingcenter
Saving message 5363 from modernshoppingcenter

In [7]:
# Load the downloaded CSV of scraped messages into a DataFrame
telegram_data = pd.read_csv(
    '../data/telegram_messages.csv',
)

### Text Preprocessing

##### Initialization: Create an instance of the AmharicTextPreprocessor class.

In [8]:
# Preprocessor instance; its `preprocess_text` method is applied to the
# 'Message' column in the next cell.
text_processor = AmharicTextPreprocessor()

In [9]:
# Run each entry of the 'Message' column through the preprocessor and store
# the result in a new 'Preprocessed_Message' column
telegram_data['Preprocessed_Message'] = (
    telegram_data['Message'].apply(text_processor.preprocess_text)
)

# Show the raw and preprocessed text side by side
telegram_data.loc[:, ['Message', 'Preprocessed_Message']]

Unnamed: 0,Message,Preprocessed_Message
0,,
1,,
2,,
3,**ቴሌግራም****🫐**** **t.me/modernshoppingcenter**...,ቴሌግራም በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን የፍራፍሬ መጭመቂያ ብርቱካን ...
4,,
...,...,...
4788,እጅግ በጣም ኳሊቲ የቡና መፍጫ\nየማድቀቅ ደረጃውን በራሱ መቆጣጠሪያ ያ...,እጅግ በጣም ኳሊቲ የቡና መፍጫ የማድቀቅ ደረጃውን በራሱ መቆጣጠሪያ ያለው...
4789,ኳሊቲ የቡና ስኒ ከነማስቀመጫው፣ከነመስቀያው\nበ950ብር \nአድራሻ ጉርድ...,ኳሊቲ የቡና ስኒ ከነማስቀመጫውከነመስቀያው በ950ብር አድራሻ ጉርድ ሾላ ...
4790,ተንጠልጣይ የሻይ፣የቡና እና የወተት ኮምፕሌት\nበ600ብር ብቻ\nአድራሻ ...,ተንጠልጣይ የሻይየቡና እና የወተት ኮምፕሌት በ600ብር ብቻ አድራሻ ጉርድ...
4791,,


In [10]:
# Keep only the rows whose preprocessed message is not the empty string
telegram_data = telegram_data.loc[
    lambda frame: frame['Preprocessed_Message'] != ""
]
telegram_data

Unnamed: 0,channel,message_id,date,sender,Message,media,Preprocessed_Message
3,modernshoppingcenter,5379,2025-01-17T12:26:50+00:00,-1001649695480,**ቴሌግራም****🫐**** **t.me/modernshoppingcenter**...,MessageMediaPhoto,ቴሌግራም በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን የፍራፍሬ መጭመቂያ ብርቱካን ...
8,modernshoppingcenter,5374,2025-01-17T07:49:42+00:00,-1001649695480,"**ቴሌግራም👉t.me/modernshoppingcenter****\n ""በ...",MessageMediaPhoto,ቴሌግራም በአዲስ ነገር ሁሌም ቀዳሚዏች ነን አዲስ የጤና መጠበቂያ ማሽን ...
12,modernshoppingcenter,5370,2025-01-16T15:18:29+00:00,-1001649695480,**ቴሌግራም****🫐**** **t.me/modernshoppingcenter**...,MessageMediaDocument,ቴሌግራም በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን የልጆች ምግብ መስሪያ በከፍተ...
16,modernshoppingcenter,5366,2025-01-16T07:38:01+00:00,-1001649695480,**ቴሌግራም****🫐**** **t.me/modernshoppingcenter**...,MessageMediaPhoto,ቴሌግራም በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን ️ የፍራፍሬ አና የፓስታ ማጥ...
17,modernshoppingcenter,5365,2025-01-15T08:21:58+00:00,-1001649695480,**🔥**** ተጀመረ ****💡****ተጀመረ****💡**** ተጀመረ ****🔥...,MessageMediaPhoto,ተጀመረ ተጀመረ ተጀመረ ልዩ እዉነተኛ የበዓል ቅናሽ ከታህሳስ 21 እስከ ...
...,...,...,...,...,...,...,...
4785,modernshoppingcenter,11,2022-02-18T17:43:44+00:00,-1001649695480,የተለያየ ማራኪ ቅርጽ ያላችው የቡና ሲኒዎች\nበ600ብር ብቻ\nአድራሻ ጉ...,MessageMediaPhoto,የተለያየ ማራኪ ቅርጽ ያላችው የቡና ሲኒዎች በ600ብር ብቻ አድራሻ ጉርድ...
4787,modernshoppingcenter,9,2022-02-18T17:43:20+00:00,-1001649695480,ለጨው፣ለስኳር ማቅረቢያ\nከንፁህ ሴራሚክ የተሰራ\nበ400ብር ብቻ።\nአድ...,MessageMediaPhoto,ለጨውለስኳር ማቅረቢያ ከንፁህ ሴራሚክ የተሰራ በ400ብር ብቻ አድራሻ ጉር...
4788,modernshoppingcenter,8,2022-02-18T17:42:21+00:00,-1001649695480,እጅግ በጣም ኳሊቲ የቡና መፍጫ\nየማድቀቅ ደረጃውን በራሱ መቆጣጠሪያ ያ...,MessageMediaPhoto,እጅግ በጣም ኳሊቲ የቡና መፍጫ የማድቀቅ ደረጃውን በራሱ መቆጣጠሪያ ያለው...
4789,modernshoppingcenter,6,2022-02-18T17:40:19+00:00,-1001649695480,ኳሊቲ የቡና ስኒ ከነማስቀመጫው፣ከነመስቀያው\nበ950ብር \nአድራሻ ጉርድ...,MessageMediaPhoto,ኳሊቲ የቡና ስኒ ከነማስቀመጫውከነመስቀያው በ950ብር አድራሻ ጉርድ ሾላ ...


### Task 2: Label a Subset of the Dataset in CoNLL Format

In [11]:
# Bring in the EntityLabeler class
from scripts.data_labeling import EntityLabeler

# Batch selection passed to create_conll_format: the starting message and
# the batch size (adjust batch_size to 50, 100, or any other size you want)
start_message = 0
batch_size = 50

# Create the labeler, then generate the CoNLL-formatted output for this batch
entity_labeler = EntityLabeler()
conll_output = entity_labeler.create_conll_format(
    telegram_data,
    start_message,
    batch_size,
)


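# # Generate for the next batch.
# # NOTE(review): with batch_size = 50, starting at 51 yields a file named 51_101 and may skip one
# # message if the first batch covers 50 messages starting at 0 — confirm whether this should be 50.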
# start_message = 51
# batch_size = 50
# conll_output = entity_labeler.create_conll_format(telegram_data, start_message, batch_size)
# entity_labeler.save_conll_to_file(conll_output, f'telegram_data_conll_format_{start_message}_{start_message + batch_size}.txt')

### Save the data in CoNLL format

In [12]:
# Write the generated CoNLL output to a text file whose name records the
# start of the batch and start + batch size
entity_labeler.save_conll_to_file(
    conll_output,
    f'telegram_data_conll_format_{start_message}_{start_message + batch_size}.txt',
)

CoNLL format data has been saved to 'telegram_data_conll_format_0_50.txt'
