In [4]:
# notebooks/01_data_exploration.ipynb

# ---  Import Libraries and Set Up Paths ---
import os
import json
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# The notebook is expected to be executed from the 'notebooks/' folder, so the
# parent of the current working directory is taken as the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make the project root importable so that 'scripts.config' (which supplies the
# data paths used below) can be resolved; avoid adding a duplicate sys.path entry.
import sys
if project_root not in sys.path:
    sys.path.append(project_root)
from scripts.config import RAW_MESSAGES_JSONL, RAW_DATA_DIR, IMAGES_DIR, DOCUMENTS_DIR

# Echo the resolved locations so a wrong working directory is obvious at a glance.
for label, location in (
    ("Project Root", project_root),
    ("Raw Messages JSONL", RAW_MESSAGES_JSONL),
    ("Images Directory", IMAGES_DIR),
    ("Documents Directory", DOCUMENTS_DIR),
):
    print(f"{label}: {location}")

Project Root: d:\10academy\week_4_building_an_amharic_e_commerce_data_extractor
Raw Messages JSONL: d:\10academy\week_4_building_an_amharic_e_commerce_data_extractor\data\raw\telegram_messages.jsonl
Images Directory: d:\10academy\week_4_building_an_amharic_e_commerce_data_extractor\data\raw\images
Documents Directory: d:\10academy\week_4_building_an_amharic_e_commerce_data_extractor\data\raw\documents


In [6]:
# --- Load Raw Scraped Data ---
# Reads the JSONL file one record per line into `raw_messages`, then builds
# `df_raw` from it. Lines that are not valid JSON are reported and skipped.
# When the file is missing, `df_raw` is an empty DataFrame so that the
# `df_raw.empty` guards in later sections still work.
print("Loading raw scraped messages...")
raw_messages = []
if not os.path.exists(RAW_MESSAGES_JSONL):
    print(f"Error: {RAW_MESSAGES_JSONL} not found. Please run the data ingestion pipeline first.")
    df_raw = pd.DataFrame() # Create empty DataFrame to avoid errors
else:
    with open(RAW_MESSAGES_JSONL, 'r', encoding='utf-8') as jsonl_file:
        for raw_line in jsonl_file:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as decode_error:
                print(f"Skipping malformed JSON line: {raw_line.strip()} - Error: {decode_error}")
            else:
                raw_messages.append(record)
    df_raw = pd.DataFrame(raw_messages)
    print(f"Loaded {len(df_raw)} messages.")
    if df_raw.empty:
        print("No messages loaded. The JSONL file might be empty or problematic.")
    else:
        print("\nFirst 5 rows of raw data:")
        display(df_raw.head())
        print("\nDataFrame Info:")
        df_raw.info()


Loading raw scraped messages...
Error: d:\10academy\week_4_building_an_amharic_e_commerce_data_extractor\data\raw\telegram_messages.jsonl not found. Please run the data ingestion pipeline first.


In [None]:
# --- Basic Data Overview ---
# Prints summary statistics, per-column null counts and per-channel message
# volume for the raw frame. Skipped entirely when no messages were loaded.
if not df_raw.empty:
    print("\nBasic Statistics:")
    # The `datetime_is_numeric` keyword was removed from DataFrame.describe in
    # pandas 2.0 (passing it raises TypeError there); datetime columns are now
    # always summarised numerically, so the argument is simply dropped.
    print(df_raw.describe(include='all'))

    print("\nMissing Values:")
    print(df_raw.isnull().sum())

    print("\nUnique Channels:")
    unique_channels = df_raw['channel_id'].nunique()
    print(f"Number of unique channels scraped: {unique_channels}")

    print("\nMessages per Channel (Top 10):")
    # value_counts() sorts by frequency (descending), so head(10) is the top 10.
    messages_per_channel = df_raw['channel_id'].value_counts().head(10)
    print(messages_per_channel)


# --- Analyze Message Text Length ---
# Adds a `text_length` column to `df_raw` (missing text counts as length 0),
# prints its summary statistics and plots a histogram with a KDE overlay.
if not df_raw.empty:
    text_filled = df_raw['text'].fillna('')
    df_raw['text_length'] = text_filled.map(len)
    print("\nText Length Distribution:")
    print(df_raw['text_length'].describe())

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df_raw['text_length'], bins=50, kde=True, ax=ax)
    ax.set_title('Distribution of Message Text Lengths')
    ax.set_xlabel('Text Length (Characters)')
    ax.set_ylabel('Number of Messages')
    plt.show()


# --- Analyze Media Presence ---
# Counts messages flagged as having a photo and/or a document, spot-checks on a
# random sample whether the referenced files exist on disk, and plots the counts.
if not df_raw.empty:
    print("\nMedia Presence:")
    media_counts = df_raw[['has_photo', 'has_document']].sum()
    print(media_counts)

    total_messages = len(df_raw)
    messages_with_photo = df_raw['has_photo'].sum()
    messages_with_document = df_raw['has_document'].sum()
    messages_with_both = df_raw[(df_raw['has_photo'] == True) & (df_raw['has_document'] == True)].shape[0]

    print(f"\nTotal messages: {total_messages}")
    print(f"Messages with photos: {messages_with_photo} ({messages_with_photo/total_messages:.2%})")
    print(f"Messages with documents: {messages_with_document} ({messages_with_document/total_messages:.2%})")
    print(f"Messages with both photo and document: {messages_with_both}")

    # Check actual file existence for a sample of up to 100 messages. The seed is
    # fixed so the same rows are inspected on every run.
    sample_media_check = df_raw.sample(min(100, len(df_raw)), random_state=42)
    sample_size = len(sample_media_check)
    # Only rows that actually carry a path are tested against the filesystem.
    # NOTE(review): os.path.exists resolves relative paths against the current
    # working directory - confirm the stored paths are absolute or relative to it.
    sampled_image_paths = sample_media_check['image_path'].dropna()
    sampled_document_paths = sample_media_check['document_path'].dropna()
    existing_images = sampled_image_paths.apply(os.path.exists).sum()
    existing_documents = sampled_document_paths.apply(os.path.exists).sum()

    # Report the real sample size: it is smaller than 100 when the dataset is.
    print(f"\nSample Check ({sample_size} messages) for downloaded media:")
    print(f"Images existing on disk: {existing_images}")
    print(f"Documents existing on disk: {existing_documents}")
    print("Note: 'File reference expired' errors during scraping mean the path exists in JSON but file does not.")

    # Visualize media presence. The photo and document bars both include the
    # messages counted under "both"; the no-media bar uses inclusion-exclusion
    # so those messages are not subtracted twice.
    messages_with_any_media = messages_with_photo + messages_with_document - messages_with_both
    media_data = pd.DataFrame({
        'Category': ['Messages with Photo', 'Messages with Document', 'Messages with Both', 'Messages with No Media'],
        'Count': [
            messages_with_photo,
            messages_with_document,
            messages_with_both,
            total_messages - messages_with_any_media
        ]
    })
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Category', y='Count', data=media_data)
    plt.title('Distribution of Media in Messages')
    plt.ylabel('Number of Messages')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


# --- Time Series Analysis (Messages over time) ---
# Parses the `date` column into `date_obj`, derives the calendar day in
# `date_only`, and plots how many messages fall on each day.
if not df_raw.empty:
    df_raw['date_obj'] = pd.to_datetime(df_raw['date'])
    df_raw['date_only'] = df_raw['date_obj'].dt.date
    daily_counts = df_raw['date_only'].value_counts()
    messages_per_day = daily_counts.sort_index()  # chronological order

    fig, ax = plt.subplots(figsize=(14, 7))
    messages_per_day.plot(kind='line', ax=ax)
    ax.set_title('Number of Messages Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Messages')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    print("\nRecent messages (last 7 days):")
    # tail(7) yields the seven most recent dates that have at least one message.
    last_7_days = messages_per_day.tail(7)
    print(last_7_days)

    oldest_message = df_raw['date_obj'].min()
    newest_message = df_raw['date_obj'].max()
    print("\nOldest and Newest Message Dates:")
    print(f"Oldest: {oldest_message}")
    print(f"Newest: {newest_message}")


# --- Sample Messages for Manual Inspection ---
# Prints up to five randomly chosen messages from two groups: messages that
# have text plus media, and messages that have text only.
if not df_raw.empty:
    print("\n--- Sample Messages (with text and media status) ---")

    # Boolean masks built with .eq(True) so a missing flag counts as "no media"
    # and negation with ~ is always applied to a real boolean Series.
    has_text = df_raw['text'].notna()
    has_photo = df_raw['has_photo'].eq(True)
    has_document = df_raw['has_document'].eq(True)

    # Sample messages that have text and media.
    # The sample size is capped by the size of the *filtered* subset: asking
    # DataFrame.sample for more rows than exist raises ValueError.
    text_media_rows = df_raw[has_text & (has_photo | has_document)]
    sample_text_media = text_media_rows.sample(min(5, len(text_media_rows)), random_state=1).to_dict('records')
    print("\nMessages with text and media:")
    for msg in sample_text_media:
        print(f"  ID: {msg['message_id']}, Date: {msg['date']}")
        print(f"  Text: {msg['text'][:200]}...") # Print first 200 chars
        print(f"  Has Photo: {msg['has_photo']}, Image Path: {msg['image_path']}")
        print(f"  Has Document: {msg['has_document']}, Document Path: {msg['document_path']}")
        print("-" * 50)

    # Sample messages with only text (neither photo nor document).
    text_only_rows = df_raw[has_text & ~has_photo & ~has_document]
    sample_text_only = text_only_rows.sample(min(5, len(text_only_rows)), random_state=2).to_dict('records')
    print("\nMessages with text only:")
    for msg in sample_text_only:
        print(f"  ID: {msg['message_id']}, Date: {msg['date']}")
        print(f"  Text: {msg['text'][:200]}...")
        print("-" * 50)