In [None]:
# Import required libraries
import sys
import asyncio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime

# Make the project's code importable from this notebook.
# The imports below use the `src.` package prefix, which only resolves when the
# directory that CONTAINS `src/` (the project root, one level up) is on
# sys.path, so that directory is added as an absolute path.
# The original '../src' entry is kept for backward compatibility.
# NOTE(review): presumably some modules import siblings without the `src.`
# prefix and rely on it — confirm, otherwise it can be dropped.
# Entries are added only when missing, so re-running this cell does not keep
# growing sys.path.
for _import_root in (str(Path('..').resolve()), str(Path('../src'))):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

# Import our custom modules
from src.data_ingestion import TelegramDataIngestion, ETHIOPIAN_ECOMMERCE_CHANNELS
from src.data_preprocessing import EcommerceDataPreprocessor, AmharicTextPreprocessor
from src.config import PATHS
from src.utils import load_json_file

# Set up plotting
# Global plotting configuration for the rest of the notebook: matplotlib's
# 'default' style sheet and seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")
# IPython line magic (not plain Python): requests inline rendering of figures.
%matplotlib inline

# Sanity-check output: confirms the cell ran to this point and reports how many
# channels were imported from src.data_ingestion and which data directory
# src.config's PATHS mapping points at.
print("✅ Libraries imported successfully!")
print(f"📊 Channels to analyze: {len(ETHIOPIAN_ECOMMERCE_CHANNELS)}")
print(f"📁 Data directory: {PATHS['data_dir']}")


In [None]:
# Demo: Amharic Text Processing
# Runs each AmharicTextPreprocessor helper used below over a handful of
# hand-written messages and prints what every helper returns.
amharic_processor = AmharicTextPreprocessor()

# Sample Ethiopian e-commerce messages
sample_messages = [
    "አዲስ ስልክ ለሽያጭ ዋጋ 15000 ብር በጣም ጥራት ያለው ማግኘት ይቻላል @mystore",
    "New iPhone 13 for sale, price 25000 ETB, excellent condition! Contact +251911234567",
    "ሽያጭ በርካታ ዕቃዎች አሉ ዋጋ ተመጣጣኝ ነው፣ ለበለጠ መረጃ @shopethiopia ይጠይቁ",
    "Ladies shoes collection ለሴቶች ጫማ ቅናሽ 30% off! ዋጋ 2000-5000 ብር ይጀምራል",
    "Electronics store በአዲስ አበባ መጋዝን ኮምፒዩተሮች፣ ስልኮች እና ተዛማጅ ዕቃዎች አሉ"
]

print("🔤 AMHARIC TEXT PROCESSING DEMO")
print("=" * 50)

for i, text in enumerate(sample_messages, start=1):
    print(f"\n📝 Sample Message {i}:")
    print(f"Original: {text}")

    # Script detection and normalisation, printed exactly as returned
    is_amharic = amharic_processor.is_amharic_text(text)
    print(f"Is Amharic: {is_amharic}")
    normalized = amharic_processor.normalize_amharic_text(text)
    print(f"Normalized: {normalized}")

    # Prices are reported only when the extractor returned something truthy
    prices = amharic_processor.extract_prices(text)
    if prices:
        price_values = [price['value'] for price in prices]
        print(f"💰 Prices found: {price_values} ETB")

    # Contact info is shown when either of the two entries is truthy
    contact = amharic_processor.extract_contact_info(text)
    if any(contact[key] for key in ('phone_numbers', 'telegram_usernames')):
        print(f"📞 Contact info: {contact}")

    # Tokenize and preview only the first 10 tokens
    tokens = amharic_processor.tokenize_amharic(text)
    first_tokens = tokens[:10]
    print(f"🔤 Tokens: {first_tokens}...")
    print("-" * 40)


In [None]:
# Check for existing scraped data and build `df_sample` from the best source:
#   1. the most recently modified combined raw JSON file, else
#   2. the most recently modified processed CSV file, else
#   3. a small synthetic dataset generated here for demonstration.
raw_data_dir = PATHS['raw_data_dir']
processed_data_dir = PATHS['processed_data_dir']

SAMPLE_MESSAGES_PER_CHANNEL = 10  # size of the synthetic fallback, per channel
SAMPLE_DATA_SEED = 42             # fixed seed so the fallback data is reproducible


def _latest_by_mtime(files):
    """Return the most recently modified path in ``files``, or None when empty."""
    return max(files, key=lambda path: path.stat().st_mtime, default=None)


# Look for combined data files. The results are sorted because glob makes no
# ordering promise; "latest" is always decided by modification time below,
# never by list position.
combined_files = sorted(raw_data_dir.glob("combined_data_*.json"))
processed_files = sorted(processed_data_dir.glob("processed_*.csv"))

latest_raw_file = _latest_by_mtime(combined_files)
latest_processed_file = _latest_by_mtime(processed_files)

print("📊 DATA AVAILABILITY CHECK")
print(f"Raw data files found: {len(combined_files)}")
print(f"Processed data files found: {len(processed_files)}")

# Report the same file that will actually be loaded further down.
if latest_raw_file is not None:
    print(f"\n📁 Latest raw data file: {latest_raw_file.name}")
elif latest_processed_file is not None:
    print(f"\n📁 Latest processed data file: {latest_processed_file.name}")
else:
    print("\n⚠️  No scraped data found. To scrape data, run:")
    print("   python scripts/telegram_scrapper.py")
    print("   or")
    print("   python scripts/run_data_ingestion.py")

if latest_raw_file is not None:
    # Load actual raw data (preferred over processed data when both exist)
    latest_file = latest_raw_file
    print(f"\n📊 Loading data from: {latest_file.name}")

    with open(latest_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df_sample = pd.DataFrame(data)

elif latest_processed_file is not None:
    latest_file = latest_processed_file
    print(f"\n📊 Loading processed data from: {latest_file.name}")
    df_sample = pd.read_csv(latest_file)

else:
    # Create sample data if no real data is available
    print("\n🎯 Creating sample data for demonstration...")

    # A local, seeded generator keeps the demo data identical across
    # "Restart & Run All" without touching NumPy's global random state.
    rng = np.random.default_rng(SAMPLE_DATA_SEED)

    sample_data = []
    for i, channel in enumerate(ETHIOPIAN_ECOMMERCE_CHANNELS):
        for j in range(SAMPLE_MESSAGES_PER_CHANNEL):
            sample_data.append({
                'message_id': f"{i}_{j}",
                'channel_title': f"Sample Channel {i+1}",
                'channel_username': channel,
                'text': f"Sample message {j+1} from {channel}",
                'date': datetime.now().isoformat(),
                # Same ranges as before: upper bounds are exclusive.
                'views': int(rng.integers(10, 1000)),
                'forwards': int(rng.integers(0, 50)),
                'replies': int(rng.integers(0, 20)),
                'has_media': bool(rng.choice([True, False])),
                'scraped_at': datetime.now().isoformat()
            })

    df_sample = pd.DataFrame(sample_data)
    print(f"✅ Sample dataset created with {len(df_sample)} messages")

print(f"\n📈 Dataset shape: {df_sample.shape}")
print(f"🔍 Columns: {list(df_sample.columns)}")
