In [None]:
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader, TextLoader
import json
import re

def detect_sensitive_data(traffic_data: str) -> str:
    """
    Build the LLM prompt that asks for a privacy review of captured app traffic.

    The traffic text is placed verbatim into the ``{traffic_data}`` slot of the
    prompt template, so the model sees exactly what was captured.

    :param traffic_data: Raw traffic data (JSON or text).
    :return: A formatted prompt for an LLM to analyze the data.
    """
    # Literal braces that belong to the template itself (the JSON examples) are
    # doubled so the formatter emits them as single braces; only
    # {traffic_data} is a real placeholder.
    sensitive_data_prompt = PromptTemplate(
        input_variables=["traffic_data"],
        template="""
        You are a cybersecurity expert specializing in **privacy and data security**. 
        Your task is to analyze the outgoing network traffic of an Android application 
        and identify any **sensitive information** that could pose privacy risks.

        ### **Given Traffic Data:**
        ```
        {traffic_data}
        ```

        ### **Your Task:**
        1. Identify any **sensitive data** being transmitted, including but not limited to:
           - **Personal Identifiable Information (PII)**: Email, phone number, IP address, user credentials.
           - **Device Information**: Model, OS version, manufacturer, fingerprint, MCC/MNC.
           - **Authentication Data**: Firebase authentication tokens, API keys, session IDs.
           - **Tracking & Advertising Data**: Firebase installation ID, advertising ID, package name.
           - **Location & Network Data**: Country, locale, timezone, network connection type.
           - **Any other potentially sensitive data** that could lead to privacy risks.

        2. Evaluate the potential **privacy risks** associated with each piece of sensitive data.

        3. Provide **recommendations** on how to mitigate privacy concerns, such as encryption, anonymization, or restricting unnecessary data transmission.

        ---

        ### **Expected JSON Output Format:**
        ```json
        {{
            "sensitive_data_detected": [
                {{
                    "type": "Device ID",
                    "value": "...",
                    "risk": "This can be used to track users across applications."
                }},
                {{
                    "type": "Firebase Authentication Token",
                    "value": "...",
                    "risk": "May allow unauthorized access if leaked."
                }}
            ],
            "recommendations": "The app should encrypt this data before transmission or avoid sending it if unnecessary."
        }}
        ```
        
        If you **cannot detect any sensitive information**, the expected output is:
        ```json
        {{
            "sensitive_data_detected": "no sent-out sensitive data", 
            "recommendations": "The app should encrypt this data before transmission or avoid sending it if unnecessary."
        }}
        ```
        
        Please analyze the given traffic data and return a **structured JSON output** as described.
        """
    )

    # Values substituted into a format field are inserted as-is and are not
    # re-scanned for braces, so the capture needs no escaping. Doubling its
    # braces or backslash-escaping its quotes would only corrupt the data the
    # model is asked to inspect.
    return sensitive_data_prompt.format(traffic_data=traffic_data)

def load_traffic_data(directory_path: str):
    """
    Read every ``*.txt`` traffic log found under a directory (recursively).

    Each file's text is stripped of surrounding whitespace. Text that parses
    as JSON is re-serialized with a two-space indent; anything else is kept
    as the stripped raw text.

    :param directory_path: Path to the directory containing traffic log files.
    :return: A list of loaded traffic data as strings.
    """
    def _normalise(raw_text):
        # Pretty-print JSON payloads; fall back to the stripped text otherwise.
        text = raw_text.strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return text
        return json.dumps(parsed, indent=2)

    documents = DirectoryLoader(
        directory_path, glob="**/*.txt", loader_cls=TextLoader
    ).load()

    return [_normalise(document.page_content) for document in documents]

# Example usage: build and print one analysis prompt per captured traffic log.
# The r"" prefix already keeps backslashes literal, so each Windows path
# separator is written once (doubling them puts "\\" into the actual path).
directory_path = r"E:\wearable-capture-traffic\wearable-standalone\todolist.scheduleplanner.dailyplanner.todo.reminders"
traffic_data_list = load_traffic_data(directory_path)
for traffic_data in traffic_data_list:
    print("===================================================================")
    print("traffic_data: ", traffic_data)
    formatted_prompt = detect_sensitive_data(traffic_data)
    print("formatted_prompt: ", formatted_prompt)
