<a href="https://colab.research.google.com/github/prkrptr/colab_nbs/blob/main/Week_2_Five_V's_of_Big_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# DAT204M - Week 1: Five V's of Big Data
#
# This assignment is designed to help you practice basic data processing
# in Python, a fundamental skill for working with big data. You will work
# with different "types" of data and perform simple analysis.
#
# Remember the five Vs of big data: Volume, Velocity, Veracity, Value, and Variety.
# This assignment focuses on Variety, Veracity, and data processing for Value.
# ==============================================================================

import json
import random
import uuid
from datetime import datetime, timedelta, timezone

# --- Part 1: Generating Synthetic Data ---

def generate_structured_data(num_records):
    """
    Generates a list of dictionaries simulating structured event data.

    Each record contains:
      - "user_id":    a random integer between 1000 and 2000 (inclusive)
      - "event_type": one of "page_view", "click" or "purchase"
      - "timestamp":  an ISO-8601 UTC time from the last hour, suffixed with "Z"

    Args:
        num_records: Number of records to generate. Zero or a negative
            number yields an empty list.

    Returns:
        A list of `num_records` event dictionaries.
    """
    event_types = ["page_view", "click", "purchase"]
    records = []
    for _ in range(num_records):
        user_id = random.randint(1000, 2000)
        event_type = random.choice(event_types)
        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # "now" instead and drop tzinfo so isoformat() does not append
        # "+00:00" in front of the explicit "Z" suffix added below.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        event_time = now_utc - timedelta(seconds=random.randint(0, 3600))
        timestamp = event_time.isoformat() + "Z"
        records.append({"user_id": user_id, "event_type": event_type, "timestamp": timestamp})
    return records

def generate_semi_structured_data(num_records):
    """
    Builds a list of strings that mostly contain JSON documents.

    Every record whose index is a multiple of 10 (0, 10, 20, ...) is a plain
    sentence instead of JSON, so that consumers have malformed input to deal
    with (the Veracity aspect). All other records are JSON objects with the
    keys "id" (a random UUID4 string), "content" and "tags"; content and tags
    are each drawn at random, independently of one another.
    """
    malformed_record = 'This is not valid JSON.'
    content_snippets = (
        "The quick brown fox jumped over the lazy dog.",
        "Python is a powerful programming language.",
        "Big data is characterized by the 5 Vs.",
        "Data can be structured, unstructured, or semi-structured.",
        "The big data ecosystem is complex.",
        "DataOps and governance are important.",
        "AI technologies are used to analyze data.",
    )
    tags_list = (
        ["animal", "story"],
        ["programming", "python"],
        ["big data", "lecture", "5vs"],
        ["data", "types"],
        ["big data", "ecosystem", "trends"],
        ["data", "governance", "trends"],
        ["AI", "analytics", "learning"],
    )

    def build_document():
        # Dict values are evaluated top to bottom: id, then content, then tags.
        payload = {
            "id": str(uuid.uuid4()),
            "content": random.choice(content_snippets),
            "tags": random.choice(tags_list),
        }
        return json.dumps(payload)

    return [
        malformed_record if index % 10 == 0 else build_document()
        for index in range(num_records)
    ]

def generate_unstructured_data(num_records):
    """
    Produces `num_records` short pseudo-sentences built from lecture keywords.

    Each sentence holds one to three randomly chosen keywords; after each
    keyword a random filler word is inserted about half of the time. The words
    are joined with spaces, passed through str.capitalize() (first letter
    upper-case, everything else lower-case) and terminated with a period.
    """
    lecture_keywords = ["data", "structured", "unstructured", "semi-structured", "volume", "velocity", "variety", "mining", "DataOps", "learning", "AI", "analytics", "trends"]
    base_fillers = ("is", "the", "a", "and", "or", "in", "with", "from")

    output = []
    for _ in range(num_records):
        picked = [random.choice(lecture_keywords) for _ in range(random.randint(1, 3))]

        # Start from the same filler ordering for every sentence, then shuffle.
        fillers = list(base_fillers)
        random.shuffle(fillers)

        tokens = []
        for keyword in picked:
            tokens.append(keyword)
            if random.random() <= 0.5:
                continue  # no filler after this keyword
            tokens.append(random.choice(fillers))

        output.append(" ".join(tokens).capitalize() + ".")
    return output

# --- Part 2: Assignment Tasks ---
# Note: These functions are the same as before and will now operate on much larger datasets.

def count_user_events(data):
    """
    Tallies 'page_view' and 'click' events per user.

    Records with any other event type are ignored, and a user only appears in
    the result once at least one tracked event has been seen for them.

    Returns a dictionary mapping user_id to its per-event counts.
    Example output: {101: {'page_view': 1, 'click': 2}}
    """
    tracked = ('click', 'page_view')
    tallies = {}
    for entry in data:
        uid, kind = entry['user_id'], entry['event_type']
        if kind not in tracked:
            continue
        per_user = tallies.setdefault(uid, {'page_view': 0, 'click': 0})
        per_user[kind] += 1
    return tallies

def find_multi_tagged_documents(data):
    """
    Safely parses JSON strings and finds documents that have more than one tag.

    A record is skipped (never raising) when it:
      - is not parseable as JSON, or is not a string/bytes value at all;
      - parses to something other than a JSON object (e.g. a list or number);
      - has no "tags" list, or a "tags" list with fewer than two entries;
      - has multiple tags but no "id" to report.
    A message is printed for skipped records that are malformed or
    structurally unexpected; documents that merely have too few tags are
    skipped silently.

    Args:
        data: Iterable of JSON strings.

    Returns:
        A list of {"id": ..., "tags": [...]} dictionaries, one per valid
        multi-tagged document, in input order.
    """
    valid_documents = []
    for record_str in data:
        # Keep the try body to the one call that can fail on bad input.
        # ValueError covers json.JSONDecodeError (and undecodable bytes);
        # TypeError covers inputs that are not str/bytes, such as None.
        try:
            record = json.loads(record_str)
        except (ValueError, TypeError):
            print(f"Skipping malformed JSON record: '{record_str}'")
            continue

        # Valid JSON is not necessarily an object: "[1, 2]" and "42" parse fine.
        if not isinstance(record, dict):
            print(f"Skipping non-object JSON record: '{record_str}'")
            continue

        tags = record.get('tags')
        # Require a real list so that null or a plain string is not measured
        # with len() and mistaken for a collection of tags.
        if not isinstance(tags, list) or len(tags) <= 1:
            continue

        if 'id' not in record:
            print(f"Skipping multi-tagged record without an id: '{record_str}'")
            continue

        valid_documents.append({"id": record['id'], "tags": tags})
    return valid_documents

def find_most_frequent_keyword(data):
    """
    Finds the most frequent keyword from the lecture in the unstructured data.

    Matching is case-insensitive and ignores commas and periods, so "AI",
    "ai" and "Ai." all count towards the keyword "AI". The keyword is
    returned in its canonical spelling from the keyword list.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A tuple of the keyword and its count, e.g. ('data', 3). If several
        keywords share the highest count, the one listed first wins. If no
        keyword occurs at all, ("", 0) is returned.
    """
    keywords = ["data", "structured", "unstructured", "semi-structured", "volume", "velocity", "variety", "mining", "DataOps", "learning", "AI", "analytics"]
    # Sentences are lower-cased before matching, so look words up through a
    # lower-case index; otherwise mixed-case keywords such as "DataOps" and
    # "AI" could never be counted.
    canonical = {keyword.lower(): keyword for keyword in keywords}
    keyword_counts = {keyword: 0 for keyword in keywords}

    for sentence in data:
        for word in sentence.lower().split():
            word = word.replace(",", "").replace(".", "")
            keyword = canonical.get(word)
            if keyword is not None:
                keyword_counts[keyword] += 1

    # Strict ">" keeps the earliest-listed keyword on ties and leaves the
    # ("", 0) placeholder in place when nothing matched.
    most_frequent = ("", 0)
    for keyword, count in keyword_counts.items():
        if count > most_frequent[1]:
            most_frequent = (keyword, count)

    return most_frequent

# --- Part 3: Running the Assignment ---

if __name__ == "__main__":
    # Driver: generate the three synthetic datasets, run each task on its
    # dataset, and print a short summary of every result.

    # --- Instructions for students ---
    print("--- Big Data Programming Assignment ---")
    print("The datasets are now being generated synthetically.")
    print("This simulates the 'Volume' characteristic of big data.")

    # Generate large datasets (e.g., 1000 records each).
    # Increase NUM_RECORDS to experiment with larger volumes.
    NUM_RECORDS = 1000
    structured_data = generate_structured_data(NUM_RECORDS)
    semi_structured_data = generate_semi_structured_data(NUM_RECORDS)
    unstructured_data = generate_unstructured_data(NUM_RECORDS)

    print(f"\nSuccessfully generated {len(structured_data)} structured records.")
    print(f"Successfully generated {len(semi_structured_data)} semi-structured records.")
    print(f"Successfully generated {len(unstructured_data)} unstructured records.")

    print("\n--- Your Results ---")

    # Run Task 1 (structured data) and print the result.
    user_event_counts = count_user_events(structured_data)
    print("Task 1: User event counts for the first 5 users:")
    # Print a small subset to avoid overwhelming the output.
    # "First 5" means the first five users encountered in the data
    # (dict insertion order), not the five lowest user ids.
    for user_id, counts in list(user_event_counts.items())[:5]:
        print(f"  - User {user_id}: {counts}")
    print(f"(Total unique users: {len(user_event_counts)})")

    # Run Task 2 (semi-structured data) and print the result.
    # find_multi_tagged_documents prints one line per malformed record it skips,
    # so those messages appear before the Task 2 heading below.
    multi_tagged_docs = find_multi_tagged_documents(semi_structured_data)
    print("\nTask 2: Valid documents with multiple tags (first 5 found):")
    for doc in multi_tagged_docs[:5]:
        print(f"  - Document ID: {doc['id']}, Tags: {doc['tags']}")
    # NOTE: this total counts the multi-tagged documents returned above,
    # not every record that parsed as valid JSON.
    print(f"(Total valid documents found: {len(multi_tagged_docs)})")

    # Run Task 3 (unstructured data) and print the result.
    top_keyword, keyword_count = find_most_frequent_keyword(unstructured_data)
    print(f"\nTask 3: The most frequent keyword is '{top_keyword}' with a count of {keyword_count}.")

    print("\n--- End of Assignment ---")
    print("Feel free to add more test cases or explore the data further!")