In [2]:
!pip install telethon pandas python-dotenv pillow asyncio nest_asyncio




In [1]:
# Import all necessary libraries
import os
import pandas as pd
import asyncio
import nest_asyncio
from telethon import TelegramClient
from telethon.errors import SessionPasswordNeededError
from datetime import datetime, timezone
import re
import json
from pathlib import Path
import time

# Allow nested async loops (needed for Jupyter notebooks)
nest_asyncio.apply()

In [3]:
# Create a .env file in your project directory with these values:
"""
API_ID= 22693668
API_HASH=11fe37717321eac9c54c5c2a1a3e220a
PHONE_NUMBER=+251936728385
"""

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Get API credentials from environment variables
API_ID = os.getenv('API_ID')
API_HASH = os.getenv('API_HASH')
PHONE_NUMBER = os.getenv('PHONE_NUMBER')

# If you don't want to use .env file, you can set them directly here:
# WARNING: Don't share these credentials publicly!
if not API_ID:
    API_ID = '22693668'  # Replace with your actual API ID
    API_HASH = '11fe37717321eac9c54c5c2a1a3e220a'  # Replace with your actual API hash
    PHONE_NUMBER = '+251936728385'  # Replace with your phone number

print("✅ Configuration loaded successfully!")

✅ Configuration loaded successfully!


In [4]:

# List of Ethiopian e-commerce Telegram channels to scrape
TELEGRAM_CHANNELS = [
    '@helloomarketethiopia',
    '@modernshoppingcenter',
    '@qnashcom',
    '@classybrands',
    '@marakibrand',
    '@aradabrand2'
]

# Create directories for storing data
DATA_DIR = Path('data')
IMAGES_DIR = DATA_DIR / 'images'
CSV_OUTPUT = DATA_DIR / 'telegram_messages.csv'

# Create directories if they don't exist
DATA_DIR.mkdir(exist_ok=True)
IMAGES_DIR.mkdir(exist_ok=True)

print(f"📁 Data will be stored in: {DATA_DIR}")
print(f"🖼️ Images will be stored in: {IMAGES_DIR}")


📁 Data will be stored in: data
🖼️ Images will be stored in: data/images


In [5]:
def clean_amharic_text(text):
    """
    Clean and normalize Amharic text
    This function removes unnecessary characters and normalizes the text
    """
    if not text:
        return ""

    # Remove excessive emojis and special characters
    # Keep Amharic characters, numbers, and basic punctuation
    cleaned_text = re.sub(r'[^\u1200-\u137F\u0000-\u007F\s]', ' ', text)

    # Remove multiple spaces and normalize whitespace
    cleaned_text = ' '.join(cleaned_text.split())

    # Remove excessive dots and special characters
    cleaned_text = re.sub(r'\.{3,}', '...', cleaned_text)
    cleaned_text = re.sub(r'💥{2,}', '💥', cleaned_text)

    return cleaned_text.strip()

def extract_metadata(message):
    """
    Extract useful metadata from Telegram message
    Returns a dictionary with structured information
    """
    return {
        'message_id': message.id,
        'timestamp': message.date.isoformat() if message.date else None,
        'views': getattr(message, 'views', 0),
        'sender_id': message.from_id.user_id if message.from_id else None,
        'is_forwarded': message.fwd_from is not None,
        'has_media': message.media is not None
    }

def format_timestamp(dt):
    """
    Format datetime object to match the required output format
    """
    if dt:
        return dt.strftime('%Y-%m-%d %H:%M:%S+00:00')
    return None

In [6]:
class EthioMartTelegramScraper:
    """
    Main class for scraping Telegram channels
    This handles all the telegram API interactions
    """

    def __init__(self, api_id, api_hash, phone_number):
        self.api_id = api_id
        self.api_hash = api_hash
        self.phone_number = phone_number
        self.client = None
        self.scraped_data = []

    async def initialize_client(self):
        """
        Initialize and authenticate the Telegram client
        """
        print("🔄 Initializing Telegram client...")

        # Create client with session file
        self.client = TelegramClient('ethiomart_session', self.api_id, self.api_hash)

        # Start the client
        await self.client.start(phone=self.phone_number)

        print("✅ Telegram client initialized successfully!")

        # Check if we're authorized
        if await self.client.is_user_authorized():
            print("✅ User is authorized!")
        else:
            print("❌ User is not authorized. Please check your credentials.")

    async def scrape_channel(self, channel_username, limit=100):
        """
        Scrape messages from a specific channel

        Args:
            channel_username: The channel username (e.g., '@channelname')
            limit: Maximum number of messages to fetch
        """
        print(f"🔄 Scraping channel: {channel_username}")

        try:
            # Get the channel entity
            channel = await self.client.get_entity(channel_username)
            print(f"📢 Found channel: {channel.title}")

            # Counter for messages processed
            message_count = 0

            # Iterate through messages
            async for message in self.client.iter_messages(channel, limit=limit):
                try:
                    # Extract basic message data
                    message_data = {
                        'message_id': message.id,
                        'channel': channel_username,
                        'text': clean_amharic_text(message.text) if message.text else '',
                        'timestamp': format_timestamp(message.date),
                        'views': getattr(message, 'views', 0) or 0,
                        'sender': getattr(message.from_id, 'user_id', channel.id) if message.from_id else channel.id,
                        'image_path': ''
                    }

                    # Handle media (images)
                    if message.media and hasattr(message.media, 'photo'):
                        image_filename = f"telegram_image_{channel_username.replace('@', '')}_{message.id}.jpg"
                        image_path = IMAGES_DIR / image_filename

                        try:
                            # Download the image
                            await self.client.download_media(message.media, file=str(image_path))
                            message_data['image_path'] = f"../data/images/{image_filename}"
                            print(f"📷 Downloaded image: {image_filename}")
                        except Exception as e:
                            print(f"⚠️ Failed to download image: {e}")

                    # Add to our dataset
                    self.scraped_data.append(message_data)
                    message_count += 1

                    # Show progress every 10 messages
                    if message_count % 10 == 0:
                        print(f"📊 Processed {message_count} messages from {channel_username}")

                except Exception as e:
                    print(f"⚠️ Error processing message {message.id}: {e}")
                    continue

            print(f"✅ Completed scraping {channel_username}: {message_count} messages")

        except Exception as e:
            print(f"❌ Error scraping channel {channel_username}: {e}")

    async def scrape_all_channels(self, message_limit=100):
        """
        Scrape all configured channels

        Args:
            message_limit: Maximum messages per channel
        """
        print(f"🚀 Starting to scrape {len(TELEGRAM_CHANNELS)} channels...")

        for channel in TELEGRAM_CHANNELS:
            await self.scrape_channel(channel, limit=message_limit)

            # Add a small delay between channels to be respectful
            print("⏳ Waiting 2 seconds before next channel...")
            await asyncio.sleep(2)

        print("🎉 All channels scraped successfully!")

    def save_to_csv(self, filename=None):
        """
        Save scraped data to CSV file
        """
        if not filename:
            filename = CSV_OUTPUT

        if not self.scraped_data:
            print("⚠️ No data to save!")
            return

        # Create DataFrame
        df = pd.DataFrame(self.scraped_data)

        # Ensure proper column order to match the required output
        columns_order = ['message_id', 'channel', 'text', 'timestamp', 'views', 'sender', 'image_path']
        df = df[columns_order]

        # Save to CSV
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"💾 Data saved to: {filename}")
        print(f"📊 Total messages saved: {len(df)}")

        # Show sample of the data
        print("\n📋 Sample data:")
        print(df.head())

        return df

    async def close(self):
        """
        Close the Telegram client connection
        """
        if self.client:
            await self.client.disconnect()
            print("🔌 Telegram client disconnected")

In [7]:
async def main():
    """
    Main function to run the scraping process
    """
    print("🎯 EthioMart Telegram Data Ingestion System")
    print("=" * 50)

    # Initialize scraper
    scraper = EthioMartTelegramScraper(API_ID, API_HASH, PHONE_NUMBER)

    try:
        # Initialize client
        await scraper.initialize_client()

        # Scrape all channels (adjust limit as needed)
        await scraper.scrape_all_channels(message_limit=5000)  # Increased to scrape around 30000 messages

        # Save data to CSV
        df = scraper.save_to_csv()

        print("\n🎉 Data ingestion completed successfully!")
        print(f"📊 Total messages collected: {len(scraper.scraped_data)}")
        print(f"📁 Data saved to: {CSV_OUTPUT}")

    except Exception as e:
        print(f"❌ Error during execution: {e}")

    finally:
        # Always close the client
        await scraper.close()

In [8]:

def analyze_collected_data(csv_file=CSV_OUTPUT):
    """
    Analyze the collected data and show statistics
    """
    if not os.path.exists(csv_file):
        print(f"❌ CSV file not found: {csv_file}")
        return

    # Load the data
    df = pd.read_csv(csv_file)

    print("📊 DATA ANALYSIS REPORT")
    print("=" * 30)
    print(f"Total messages: {len(df)}")
    print(f"Channels scraped: {df['channel'].nunique()}")
    print(f"Messages with images: {len(df[df['image_path'] != ''])}")
    print(f"Messages with text: {len(df[df['text'] != ''])}")

    print("\n📈 Messages per channel:")
    channel_counts = df['channel'].value_counts()
    for channel, count in channel_counts.items():
        print(f"  {channel}: {count} messages")

    print("\n📋 Sample messages:")
    print(df[['channel', 'text', 'timestamp']].head())

    return df

In [9]:
# Run the main scraping function
await main()

🎯 EthioMart Telegram Data Ingestion System
🔄 Initializing Telegram client...
Signed in successfully as Ruhama; remember to not break the ToS or you will risk an account ban!
✅ Telegram client initialized successfully!
✅ User is authorized!
🚀 Starting to scrape 6 channels...
🔄 Scraping channel: @helloomarketethiopia
📢 Found channel: HellooMarket
📷 Downloaded image: telegram_image_helloomarketethiopia_4562.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4561.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4560.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4559.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4558.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4557.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4556.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4555.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4554.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_4553.jpg
📊 P



📷 Downloaded image: telegram_image_helloomarketethiopia_3287.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3286.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3285.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3284.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3283.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3282.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3281.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3280.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3279.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3278.jpg
📊 Processed 1240 messages from @helloomarketethiopia
📷 Downloaded image: telegram_image_helloomarketethiopia_3277.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3276.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3275.jpg
📷 Downloaded image: telegram_image_helloomarketethiopia_3274.jpg
📷 Downloaded image: telegram_image_he



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
📷 Downloaded image: telegram_image_modernshoppingcenter_4903.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4902.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4901.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4900.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4899.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4897.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4896.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4895.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4894.jpg
📊 Processed 1620 messages from @modernshoppingcenter
📷 Downloaded image: telegram_image_modernshoppingcenter_4893.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4891.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4889.jpg
📷 Downloaded image: telegram_image_modernshoppingcenter_4888.jpg
📷 Downloaded image: telegram_image_mo



📷 Downloaded image: telegram_image_qnashcom_1817.jpg
📷 Downloaded image: telegram_image_qnashcom_1816.jpg
📷 Downloaded image: telegram_image_qnashcom_1815.jpg
📷 Downloaded image: telegram_image_qnashcom_1812.jpg
📊 Processed 2690 messages from @qnashcom
📷 Downloaded image: telegram_image_qnashcom_1806.jpg
📷 Downloaded image: telegram_image_qnashcom_1804.jpg
📷 Downloaded image: telegram_image_qnashcom_1802.jpg
📷 Downloaded image: telegram_image_qnashcom_1801.jpg
📷 Downloaded image: telegram_image_qnashcom_1800.jpg
📊 Processed 2700 messages from @qnashcom
📷 Downloaded image: telegram_image_qnashcom_1798.jpg
📷 Downloaded image: telegram_image_qnashcom_1796.jpg
📷 Downloaded image: telegram_image_qnashcom_1787.jpg
📷 Downloaded image: telegram_image_qnashcom_1786.jpg
📷 Downloaded image: telegram_image_qnashcom_1784.jpg
📷 Downloaded image: telegram_image_qnashcom_1783.jpg
📊 Processed 2710 messages from @qnashcom
📷 Downloaded image: telegram_image_qnashcom_1781.jpg
📷 Downloaded image: telegram_



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
📷 Downloaded image: telegram_image_marakibrand_8211.jpg
📷 Downloaded image: telegram_image_marakibrand_8210.jpg
📷 Downloaded image: telegram_image_marakibrand_8209.jpg
📷 Downloaded image: telegram_image_marakibrand_8208.jpg
📷 Downloaded image: telegram_image_marakibrand_8207.jpg
📷 Downloaded image: telegram_image_marakibrand_8206.jpg
📷 Downloaded image: telegram_image_marakibrand_8205.jpg
📊 Processed 3260 messages from @marakibrand
📷 Downloaded image: telegram_image_marakibrand_8204.jpg
📷 Downloaded image: telegram_image_marakibrand_8203.jpg
📷 Downloaded image: telegram_image_marakibrand_8202.jpg
📷 Downloaded image: telegram_image_marakibrand_8201.jpg
📷 Downloaded image: telegram_image_marakibrand_8200.jpg
📷 Downloaded image: telegram_image_marakibrand_8199.jpg
📷 Downloaded image: telegram_image_marakibrand_8197.jpg
📷 Downloaded image: telegram_image_marakibrand_8196.jpg
📷 Downloaded image: telegram_image_marakibrand_8195



📷 Downloaded image: telegram_image_aradabrand2_9285.jpg
📷 Downloaded image: telegram_image_aradabrand2_9284.jpg
📷 Downloaded image: telegram_image_aradabrand2_9283.jpg
📷 Downloaded image: telegram_image_aradabrand2_9282.jpg
📷 Downloaded image: telegram_image_aradabrand2_9281.jpg
📷 Downloaded image: telegram_image_aradabrand2_9280.jpg
📷 Downloaded image: telegram_image_aradabrand2_9279.jpg
📷 Downloaded image: telegram_image_aradabrand2_9278.jpg
📊 Processed 2870 messages from @aradabrand2
📷 Downloaded image: telegram_image_aradabrand2_9277.jpg
📷 Downloaded image: telegram_image_aradabrand2_9276.jpg
📷 Downloaded image: telegram_image_aradabrand2_9275.jpg
📷 Downloaded image: telegram_image_aradabrand2_9272.jpg
📷 Downloaded image: telegram_image_aradabrand2_9268.jpg
📷 Downloaded image: telegram_image_aradabrand2_9267.jpg
📷 Downloaded image: telegram_image_aradabrand2_9266.jpg
📷 Downloaded image: telegram_image_aradabrand2_9265.jpg
📷 Downloaded image: telegram_image_aradabrand2_9261.jpg
📷 Do



📷 Downloaded image: telegram_image_aradabrand2_9002.jpg
📷 Downloaded image: telegram_image_aradabrand2_9001.jpg
📷 Downloaded image: telegram_image_aradabrand2_9000.jpg
📷 Downloaded image: telegram_image_aradabrand2_8999.jpg
📷 Downloaded image: telegram_image_aradabrand2_8998.jpg
📊 Processed 3070 messages from @aradabrand2
📷 Downloaded image: telegram_image_aradabrand2_8997.jpg
📷 Downloaded image: telegram_image_aradabrand2_8996.jpg
📷 Downloaded image: telegram_image_aradabrand2_8995.jpg
📷 Downloaded image: telegram_image_aradabrand2_8994.jpg
📷 Downloaded image: telegram_image_aradabrand2_8993.jpg
📷 Downloaded image: telegram_image_aradabrand2_8980.jpg
📷 Downloaded image: telegram_image_aradabrand2_8979.jpg
📷 Downloaded image: telegram_image_aradabrand2_8978.jpg
📷 Downloaded image: telegram_image_aradabrand2_8974.jpg
📷 Downloaded image: telegram_image_aradabrand2_8972.jpg
📊 Processed 3080 messages from @aradabrand2
📷 Downloaded image: telegram_image_aradabrand2_8971.jpg
📷 Downloaded ima

In [11]:
# Analyze the collected data to confirm it works
df = analyze_collected_data()

📊 DATA ANALYSIS REPORT
Total messages: 27520
Channels scraped: 6
Messages with images: 27520
Messages with text: 27520

📈 Messages per channel:
  @modernshoppingcenter: 5000 messages
  @classybrands: 5000 messages
  @aradabrand2: 5000 messages
  @marakibrand: 5000 messages
  @qnashcom: 3911 messages
  @helloomarketethiopia: 3609 messages

📋 Sample messages:
                 channel                                               text  \
0  @helloomarketethiopia  **ሁለት **ሲገዙ **5%** ቅናሽ ሙሉ በሙሉ የመኝታ ቤትዎን ውበት የሚ...   
1  @helloomarketethiopia  የልጆች ወብ ባማረ ዲዛይን የተሰራ ቀሚስ ለማዘዝ 0974312223 ይደውሉ...   
2  @helloomarketethiopia  አስደናቄ አትዮጵያዊ ምስልሎች ያላቸው የአረቄ መለኪያ ለማዘዝ 0974312...   
3  @helloomarketethiopia  ብዙ ተወዳጀነትን የተረፈ የቃልኪዳን ጉዞ ለሁሉም እድሜ የሚሆን የአማረኛ ...   
4  @helloomarketethiopia  ለቤትዎ ደምቀት የሚጨምሩ ሻማዎች በተለያየ መጠን እና ቀለም ለማዘዝ 097...   

                   timestamp  
0  2025-05-23 13:34:44+00:00  
1  2025-05-16 13:18:16+00:00  
2  2025-05-12 13:45:37+00:00  
3  2025-05-10 09:47:16+00:00  
4  2025-05