In [None]:
# ------------------------
# SETUP AND IMPORTS
# ------------------------

import getpass
import os
import re

import pandas as pd
from dotenv import load_dotenv
from telethon.sync import TelegramClient
from telethon.tl.types import InputMessagesFilterPhotos, InputMessagesFilterDocument

# Read the Telegram API credentials from the environment (a local .env file,
# if present, is loaded first); fall back to interactive prompts when unset.
load_dotenv()
api_id = os.getenv("API_ID")
api_hash = os.getenv("API_HASH")

if not api_id:
    api_id = input("Enter your Telegram API_ID: ")
if not api_hash:
    # The API hash is a secret: getpass keeps the typed value from being
    # echoed into the cell output (and thus saved inside the notebook file).
    api_hash = getpass.getpass("Enter your Telegram API_HASH: ")

# connect to the Telegram client
# Make sure to replace 'ethio_ingestor' with your desired session name
client = TelegramClient('ethio_ingestor', api_id, api_hash)

async def main():
    """Start the module-level Telegram ``client`` and print the logged-in account.

    Awaits ``client.start()`` and then ``client.get_me()``, printing the
    account's first name and numeric ID.

    NOTE(review): on a first run without a saved session, ``client.start()``
    presumably prompts interactively for login details -- confirm against the
    Telethon documentation.
    """
    await client.start()
    print("Client created successfully.")
    # Fetch the authenticated account to confirm the login worked.
    me = await client.get_me()
    print(f"Logged in as: {me.first_name} (ID: {me.id})")
    
# Top-level `await` is only valid inside a notebook/IPython cell; in a plain
# Python script this coroutine has to be driven by an event loop instead.
await main()

Client created successfully.
Logged in as: Sam (ID: 494623506)


In [None]:

# -------------------------------------------------
# INGESTION AND PREPROCESSING TELEGRAM CHANNEL DATA
# --------------------------------------------------

# Tokenize text into Ethiopic-script (Amharic) words
def tokenize_amharic(text):
    """Extract runs of Ethiopic-script characters from ``text``.

    A token is a maximal run of characters in U+1200-U+135F (syllables and
    combining marks) or U+1369-U+137F (Ethiopic numerals). Ethiopic
    punctuation (U+1360-U+1368, e.g. the wordspace U+1361 and the full stop
    U+1362) is deliberately excluded so that it separates tokens instead of
    gluing neighbouring words into a single token. Everything outside the
    Ethiopic block (Latin letters, ASCII digits, emoji, whitespace) also acts
    as a separator and is dropped.

    Args:
        text: The string to tokenize. ``None`` or an empty string is accepted.

    Returns:
        A list of token strings in order of appearance; empty when ``text``
        is ``None``/empty or contains no Ethiopic characters.
    """
    if not text:
        return []
    return re.findall(r'[\u1200-\u135F\u1369-\u137F]+', text)

# Normalize Amharic text by substituting a fixed set of Ethiopic characters
def normalize_amharic(text):
    """Return ``text`` with five specific Ethiopic characters replaced.

    Each character on the left of the mapping below is replaced by the
    character on its right; all other characters pass through unchanged.
    None of the replacement characters is itself a key, so a single
    translation pass is sufficient.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    char_map = str.maketrans({
        "ሀ": "ሃ",
        "ሐ": "ሓ",
        "ኀ": "ኃ",
        "ሰ": "ሠ",
        "ጸ": "ፀ",
    })
    return text.translate(char_map)

async def ingest_and_preprocess(
    channel_names=None,
    limit=500,
    photo_dir="../../data/photos",
    output_csv="../../data/telegram_data.csv",
):
    """Fetch recent messages from Telegram channels and save them as a CSV.

    For every message the raw text, its Amharic tokens, and the normalized
    text are recorded. Photo and document attachments are downloaded into
    ``photo_dir`` and their local paths are stored alongside the message.

    Args:
        channel_names: Iterable of channel usernames to read. Defaults to the
            five e-commerce channels this notebook was written for.
        limit: Maximum number of messages requested per channel.
        photo_dir: Directory that receives downloaded photos and documents;
            created if missing.
        output_csv: Path of the CSV file to write; its parent directory is
            created if missing.

    Returns:
        None. The result is written to ``output_csv``.

    Side effects:
        Uses the module-level ``client`` inside ``async with``.
        NOTE(review): Telethon is expected to disconnect the client when the
        ``async with`` block exits -- confirm if later cells reuse ``client``.
    """
    if channel_names is None:
        channel_names = ["@ethio_brand_collection", "@helloomarketethiopia", "@qnashcom", "@Shewabrand", "@marakibrand"]

    # Fixed column order so the CSV still gets a header row when no messages
    # were fetched (an empty list of dicts would yield a header-less file).
    columns = [
        "channel", "message_id", "sender", "timestamp", "text",
        "tokens", "normalized_text", "photo_path", "document_path",
    ]

    os.makedirs(photo_dir, exist_ok=True)
    # DataFrame.to_csv does not create missing directories.
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)

    data = []
    async with client:
        for channel in channel_names:
            async for msg in client.iter_messages(channel, limit=limit):
                # Media-only posts have no text; normalise that to "".
                text = msg.text or ""
                tokens = tokenize_amharic(text)
                norm_text = normalize_amharic(text)
                photo_path = None
                if msg.photo:
                    photo_path = await msg.download_media(file=photo_dir)
                document_path = None
                if msg.document:
                    # Documents share the same download directory as photos.
                    document_path = await msg.download_media(file=photo_dir)
                data.append({
                    "channel": channel,
                    "message_id": msg.id,
                    # msg.sender may be None or lack a username.
                    "sender": getattr(msg.sender, "username", None),
                    "timestamp": msg.date,
                    "text": text,
                    "tokens": tokens,
                    "normalized_text": norm_text,
                    "photo_path": photo_path,
                    "document_path": document_path
                })
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

# Top-level `await` is only valid inside a notebook/IPython cell; in a plain
# Python script this coroutine has to be driven by an event loop instead.
await ingest_and_preprocess()