In [24]:
import os
import sys
import asyncio
import json
import pandas as pd

# Add the parent of the current working directory to sys.path so that the
# `src` package imported on the next line can be found (presumably the
# notebook lives one level below the project root — TODO confirm).
sys.path.append(os.path.abspath(os.path.join('..')))
from src.scraper import fetch_messages, save_messages_to_json

# NOTE(review): nest_asyncio is applied before any asyncio.run() call in this
# notebook; presumably this is required because the Jupyter kernel already
# runs an event loop and asyncio.run() would otherwise refuse to nest — confirm.
import nest_asyncio
nest_asyncio.apply()

In [25]:
# Telegram channel handles to process. Each handle is also used to build the
# per-channel file name "<handle>.json", so every handle must be listed
# exactly once (a repeated entry would be fetched and written twice).
CHANNELS = [
    "@ZemenExpress",
    "@nevacomputer",
    "@meneshayeofficial",
    "@ethio_brand_collection",
    "@Leyueqa",
    "@sinayelj",
    "@Shewabrand",
    "@helloomarketethiopia",
    "@modernshoppingcenter",
    "@qnashcom",
    "@Fashiontera",
    "@kuruwear",
    "@gebeyaadama",
    "@MerttEka",
    "@forfreemarket",
    "@classybrands",
    "@marakibrand",
    "@aradabrand2",
    "@marakisat2",
    "@belaclassic",
    "@AwasMart",
]

# Directory (relative to the notebook's working directory) that holds one raw
# JSON file per channel.
RAW_DATA_PATH = '../data/raw_scraped/'

# Maximum number of messages requested from each channel.
MESSAGES_PER_CHANNEL = 500

# Create the output directory up front so the first save cannot fail merely
# because the folder does not exist yet (no-op when it is already there).
os.makedirs(RAW_DATA_PATH, exist_ok=True)

# dict.fromkeys() removes repeated handles while keeping the original order,
# so a channel that appears twice in CHANNELS is fetched only once.
for channel in dict.fromkeys(CHANNELS):
    print("-" * 20)
    messages = asyncio.run(fetch_messages(channel, limit=MESSAGES_PER_CHANNEL))
    # Only write a file when the fetch returned something.
    if messages:
        output_file = os.path.join(RAW_DATA_PATH, f"{channel}.json")
        save_messages_to_json(messages, output_file)

print("\nScraping complete.")

--------------------
Fetching messages from Zemen Express®...
Fetched 210 messages.
Saved messages to ../data/raw_scraped/@ZemenExpress.json
--------------------
Fetching messages from NEVA COMPUTER®...
Fetched 150 messages.
Saved messages to ../data/raw_scraped/@nevacomputer.json
--------------------
Fetching messages from መነሻዬ...
Fetched 469 messages.
Saved messages to ../data/raw_scraped/@meneshayeofficial.json
--------------------
Fetching messages from EthioBrand®...
Fetched 499 messages.
Saved messages to ../data/raw_scraped/@ethio_brand_collection.json
--------------------
Fetching messages from ልዩ እቃ...
Fetched 272 messages.
Saved messages to ../data/raw_scraped/@Leyueqa.json
--------------------
Fetching messages from SINA KIDS/ሲና ኪድስⓇ...
Fetched 122 messages.
Saved messages to ../data/raw_scraped/@sinayelj.json
--------------------
Fetching messages from Shewa Brand...
Fetched 424 messages.
Saved messages to ../data/raw_scraped/@Shewabrand.json
--------------------
Fetching m

In [30]:
# --- Load ---
# Pool the records of every per-channel JSON file that exists on disk.
# dict.fromkeys() skips repeated handles (order preserved) so no file is
# read twice.
all_data = []
for channel in dict.fromkeys(CHANNELS):
    file_path = os.path.join(RAW_DATA_PATH, f"{channel}.json")
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            all_data.extend(json.load(f))

# Without any records the frame would have no 'text' column and the
# preprocessing below would fail with an opaque KeyError; fail clearly instead.
if not all_data:
    raise RuntimeError(
        f"No scraped messages found under {RAW_DATA_PATH}; run the scraping cell first."
    )

raw_df = pd.DataFrame(all_data)

# --- Basic Preprocessing ---
# Built as one chain that returns a new frame at each step: the raw frame is
# left untouched (the cell can be re-run safely) and `clean_text` is never
# assigned onto a filtered slice, which avoids SettingWithCopyWarning.
df = (
    raw_df
    # 1. Drop duplicates just in case (first occurrence of each text is kept)
    .drop_duplicates(subset=['text'])
    # 2. Remove rows with very short text; rows whose text is missing compare
    #    as False here and are dropped as well
    .loc[lambda frame: frame['text'].str.len() > 15]
    # 3. Simple text cleaning: newlines -> spaces, strip #hashtags, trim ends
    .assign(
        clean_text=lambda frame: (
            frame['text']
            .str.replace('\n', ' ', regex=False)
            .str.replace(r'#\w+', '', regex=True)
            .str.strip()
        )
    )
)

# Save the processed data
PROCESSED_DATA_PATH = '../data/processed/consolidated_messages.csv'
# Make sure the target folder exists before writing the CSV.
os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)
df.to_csv(PROCESSED_DATA_PATH, index=False)

print(f"Consolidated and saved {len(df)} messages to {PROCESSED_DATA_PATH}")
df.head()

Consolidated and saved 4366 messages to ../data/processed/consolidated_messages.csv


Unnamed: 0,channel,message_id,text,date,views,clean_text
0,@ZemenExpress,7007,💥💥...................................💥💥\n\n🎯 L...,2025-06-24T11:49:18+00:00,2577,💥💥...................................💥💥 🎯 LCD...
3,@ZemenExpress,7004,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,2025-06-23T14:55:46+00:00,2827,💥💥👀 ...........💥💥 📌 Electric Charcoal Burner ...
6,@ZemenExpress,6995,💥💥...................................💥💥\n\n📌Fo...,2025-06-23T08:23:14+00:00,3114,💥💥...................................💥💥 📌Food...
7,@ZemenExpress,6991,💥💥...................................💥💥\n\n📌Sa...,2025-06-21T16:35:51+00:00,3688,💥💥...................................💥💥 📌Saac...
8,@ZemenExpress,6987,💥💥...................................💥💥\n\n3pc...,2025-06-21T08:07:31+00:00,3604,💥💥...................................💥💥 3pcs ...
