In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm 

# Load Data and Clean Text

In [2]:
def load_content_json(file_path):
    """Load a content.json file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def extract_text(sections):
    """Extract and clean text from the 'sections' field."""
    text = ""
    for section in sections:
        if section["type"] == "text":
            text += section["content"] + "\n"
    return text.strip()

def extract_images(images):
    """Extract image metadata."""
    image_paths = []
    for image in images:
        if "local_path" in image:
            image_paths.append(image["local_path"])
    return image_paths

def extract_pdfs(pdfs):
    """Extract PDF metadata."""
    pdf_paths = []
    for pdf in pdfs:
        if "local_path" in pdf:
            pdf_paths.append(pdf["local_path"])
    return pdf_paths

def extract_tables(tables):
    """Extract and convert tables to structured text (markdown format)."""
    table_texts = []
    for table in tables:
        # Convert table to markdown format
        markdown_table = "| " + " | ".join(table["headers"]) + " |\n"
        markdown_table += "| " + " | ".join(["---"] * len(table["headers"])) + " |\n"
        for row in table["rows"]:
            markdown_table += "| " + " | ".join(row) + " |\n"
        table_texts.append(markdown_table.strip())
    return table_texts

In [3]:
def preprocess_data(base_dir):
    """Preprocess data from content.json files and return a DataFrame."""
    # Initialize a list to store processed data
    data = []

    # Walk through the directory structure
    for root, dirs, files in tqdm(os.walk(base_dir), desc="Processing folders"):
        for file in files:
            if file == "content.json":
                file_path = os.path.join(root, file)
                
                # Load the content.json file
                content = load_content_json(file_path)
                
                # Extract text
                text = extract_text(content["content"]["sections"])
                
                # Extract images
                image_paths = extract_images(content["content"]["images"])
                
                # Extract PDFs
                pdf_paths = extract_pdfs(content["content"]["pdfs"])
                
                # Extract tables
                tables = extract_tables(content["content"]["tables"])
                
                # Extract metadata
                metadata = {
                    "title": content["metadata"]["title"],
                    "url": content["metadata"]["url"],
                    "last_updated": content["metadata"]["last_updated"],
                    "extracted_at": content["metadata"]["extracted_at"]
                }
                
                # Append to the data list
                data.append({
                    "document_id": os.path.basename(root),  # Use folder name as document ID
                    "text": text,
                    "image_paths": image_paths,
                    "pdf_paths": pdf_paths,
                    "tables": tables,
                    "metadata": metadata
                })

    # Convert the data list to a Pandas DataFrame
    df = pd.DataFrame(data)
    return df

In [None]:
# Define the base directory where content.json files are stored
base_dir = os.getcwd()

# Preprocess the data
df = preprocess_data(base_dir)

# Display the first few rows of the DataFrame
df.head()

Processing folders: 4973it [00:00, 17457.94it/s]


Unnamed: 0,document_id,text,image_paths,pdf_paths,tables,metadata
0,c48783d6d76b9379871904763cacdbf0,Question:\nDoes RH850 have a different instruc...,[],[],[| |\n| |\n| RH850/F1x |],{'title': 'Does RH850 have a different instruc...
1,856760f77c114b8887aaba975f1b24dc,Question:\nHow do I register for a MyRenesas a...,[],[],[| |\n| |\n| RH850/F1x |],{'title': 'RH850/F1x: How do I register for a ...
2,054c47ab3ab556189283c9a4b75af90f,Question:\nWhere can I find FIT/MTBF and Diagn...,[],[],[| |\n| |\n| RL78 |],"{'title': 'Reliability report, FIT/MTBF for RH..."
3,f95dbeb26c41257522a18c8f6463c248,Question:\nI am unable to find rh850 device fi...,[data/categories/rh850_family/rh850_general/f9...,[],[| |\n| |\n| RH850 |],"{'title': 'RH850 find dvf file for CS+', 'url'..."
4,c9853fd939b403fdf8979c36a7fddc1d,Question:\nI couldn't find a possible way to e...,[],[],[| |\n| |\n| RH850 |],"{'title': 'Nested Interrupt in RH850', 'url': ..."


In [10]:
# Filter rows where image_paths is not empty
df_images = df[df['image_paths'].apply(lambda x: len(x) > 0)].reset_index(drop=True)

# Display the first few rows of df_images
print("DataFrame with Images (df_images):")
df_images.head()

DataFrame with Images (df_images):


Unnamed: 0,document_id,text,image_paths,pdf_paths,tables,metadata
0,f95dbeb26c41257522a18c8f6463c248,Question:\nI am unable to find rh850 device fi...,[data/categories/rh850_family/rh850_general/f9...,[],[| |\n| |\n| RH850 |],"{'title': 'RH850 find dvf file for CS+', 'url'..."
1,85b897523834211bbbc3b0fe00a067a4,Question:\nI am using RH850 with CS+ IDE. CC-R...,[data/categories/rh850_family/rh850_general/85...,[],[| |\n| |\n| RH850 |],{'title': 'RH850: CC-RH Execution of a routine...
2,aef18c1527e32753e293578beff82f85,Question:\nHow do I dump the RH850 whole IOR r...,[data/categories/rh850_family/rh850_general/ae...,[],[| |\n| |\n| RH850 |],"{'title': 'RH850 IOR dump in CS+', 'url': 'htt..."
3,3cf13711a6bd31149acc5c5f5f248702,Question:\nWe have a programming production li...,[data/categories/rh850_family/rh850_general/3c...,[],[| |\n| |\n| RH850 |],{'title': 'RH850: Programmer change from E1 to...
4,0becf452098654551210915e6bf307b8,Question:\nI am using RH850 device with CS+. I...,[data/categories/rh850_family/rh850_general/0b...,[],[| |\n| |\n| RH850 |],{'title': 'RH850: Prevent variable from updati...


In [11]:
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   document_id  152 non-null    object
 1   text         152 non-null    object
 2   image_paths  152 non-null    object
 3   pdf_paths    152 non-null    object
 4   tables       152 non-null    object
 5   metadata     152 non-null    object
dtypes: object(6)
memory usage: 7.2+ KB


In [19]:
df_images['image_paths'][0]

['data/categories/rh850_family/rh850_general/f95dbeb26c41257522a18c8f6463c248/images/8e30aea5eface8fc259d997da34bfae4.png',
 'data/categories/rh850_family/rh850_general/f95dbeb26c41257522a18c8f6463c248/images/b50f0e66167a52551bbe298e915639da.png']

In [None]:
df_images['image'][0]

In [14]:
import re

def clean_text(text):
    """
    Clean the text while preserving stopwords and sentence structure.
    """
    # Remove duplicate lines
    lines = text.split('\n')
    unique_lines = list(dict.fromkeys(lines))  # Preserves order
    text = '\n'.join(unique_lines)
    
    # Remove extra whitespace (spaces, tabs, newlines)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove special characters (optional, customize based on your needs)
    # Here, we keep alphanumeric, basic punctuation, and common symbols
    text = re.sub(r'[^a-zA-Z0-9\s.,;!?\-()/\'"]', '', text)
    
    return text

# Apply the cleaning function to the 'text' column in df_images
df_images['cleaned_text'] = df_images['text'].apply(clean_text)

# Display the first few rows of the DataFrame with cleaned text
print("DataFrame with Cleaned Text (Preserving Stopwords):")
df_images[['text', 'cleaned_text']].head()

DataFrame with Cleaned Text (Preserving Stopwords):


Unnamed: 0,text,cleaned_text
0,Question:\nI am unable to find rh850 device fi...,Question I am unable to find rh850 device file...
1,Question:\nI am using RH850 with CS+ IDE. CC-R...,Question I am using RH850 with CS IDE. CC-RH's...
2,Question:\nHow do I dump the RH850 whole IOR r...,Question How do I dump the RH850 whole IOR reg...
3,Question:\nWe have a programming production li...,Question We have a programming production line...
4,Question:\nI am using RH850 device with CS+. I...,Question I am using RH850 device with CS. Is i...


In [54]:
df_images.to_csv('Data/processed_data_images.csv', index=False, header=True)

# Use Vision Model for Image Description

In [70]:
import os
import io
import boto3
import pandas as pd
from PIL import Image

# Initialize Bedrock client
bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name=os.getenv("AWS_DEFAULT_REGION"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

def prepare_image(image_path, max_size):
    """Resize and convert an image to bytes."""
    try:
        image = Image.open(image_path)
        image.thumbnail((max_size, max_size))
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return buffer.getvalue()
    except Exception:
        return None

def analyze_image(image_bytes, prompt):
    """Analyze an image using AWS Bedrock."""
    try:
        inference_config = {"maxTokens": 4096, "temperature": 0, "topP": 0.1}
        messages = [{
            "role": "user",
            "content": [
                {"text": prompt},
                {"image": {"format": "png", "source": {"bytes": image_bytes}}}
            ]
        }]
        response = bedrock_runtime.converse(
            modelId=os.getenv("BEDROCK_MODEL_ID"),
            messages=messages,
            inferenceConfig=inference_config
        )
        return response["output"]["message"]["content"][0]["text"]
    except Exception:
        return None

def process_images_in_row(index, image_paths, base_dir):
    """Process all images in a row and return descriptions."""
    print(f"Processing row {index} with {len(image_paths)} images...")
    descriptions = []
    prompt = "Describe the content of this image."
    
    for image_path in image_paths:
        full_image_path = os.path.join(base_dir, image_path)
        print(f"  - Processing image: {full_image_path}")  # Show image being processed
        
        if os.path.exists(full_image_path):
            image_bytes = prepare_image(full_image_path, 1120)
            if image_bytes:
                description = analyze_image(image_bytes, prompt)
                if description:
                    descriptions.append(description)
    
    print(f"  ✅ Completed row {index}\n")
    return descriptions

def process_dataframe(df, base_dir):
    """Process the entire DataFrame."""
    df["image_descriptions"] = None  # Ensure column exists
    for index, row in df.iterrows():
        df.at[index, "image_descriptions"] = process_images_in_row(index, row["image_paths"], base_dir)
    return df

# Define base directory
BASE_DIR = os.path.join(os.getcwd(), "articleCrawl")

# Process DataFrame
df_images_descriptions = process_dataframe(df_images, BASE_DIR)

# Display summary without full descriptions
print("\nProcessing complete! DataFrame shape:", df_images_descriptions.shape)
print(df_images_descriptions[["document_id", "image_paths"]].head())  # Show only metadata

Processing row 0 with 2 images...
  - Processing image: /Users/ramfeuji/Documents/Vettura AI/Vettura-Capstone/articleCrawl/data/categories/rh850_family/rh850_general/f95dbeb26c41257522a18c8f6463c248/images/8e30aea5eface8fc259d997da34bfae4.png
  - Processing image: /Users/ramfeuji/Documents/Vettura AI/Vettura-Capstone/articleCrawl/data/categories/rh850_family/rh850_general/f95dbeb26c41257522a18c8f6463c248/images/b50f0e66167a52551bbe298e915639da.png
  ✅ Completed row 0

Processing row 1 with 2 images...
  - Processing image: /Users/ramfeuji/Documents/Vettura AI/Vettura-Capstone/articleCrawl/data/categories/rh850_family/rh850_general/85b897523834211bbbc3b0fe00a067a4/images/198d45ba348f46df5f05c47eb602ca1e.png
  - Processing image: /Users/ramfeuji/Documents/Vettura AI/Vettura-Capstone/articleCrawl/data/categories/rh850_family/rh850_general/85b897523834211bbbc3b0fe00a067a4/images/b1a996cd92b432d64e4c1a7869df6e50.png
  ✅ Completed row 1

Processing row 2 with 1 images...
  - Processing image

In [71]:
df_images_descriptions.head()

Unnamed: 0,document_id,text,image_paths,pdf_paths,tables,metadata,cleaned_text,image_descriptions
0,f95dbeb26c41257522a18c8f6463c248,Question:\nI am unable to find rh850 device fi...,[data/categories/rh850_family/rh850_general/f9...,[],[| |\n| |\n| RH850 |],"{'title': 'RH850 find dvf file for CS+', 'url'...",Question I am unable to find rh850 device file...,[The image shows a computer window with the ti...
1,85b897523834211bbbc3b0fe00a067a4,Question:\nI am using RH850 with CS+ IDE. CC-R...,[data/categories/rh850_family/rh850_general/85...,[],[| |\n| |\n| RH850 |],{'title': 'RH850: CC-RH Execution of a routine...,Question I am using RH850 with CS IDE. CC-RH's...,[The image shows a screenshot of a computer pr...
2,aef18c1527e32753e293578beff82f85,Question:\nHow do I dump the RH850 whole IOR r...,[data/categories/rh850_family/rh850_general/ae...,[],[| |\n| |\n| RH850 |],"{'title': 'RH850 IOR dump in CS+', 'url': 'htt...",Question How do I dump the RH850 whole IOR reg...,[The image shows a screenshot of a computer sc...
3,3cf13711a6bd31149acc5c5f5f248702,Question:\nWe have a programming production li...,[data/categories/rh850_family/rh850_general/3c...,[],[| |\n| |\n| RH850 |],{'title': 'RH850: Programmer change from E1 to...,Question We have a programming production line...,[The image shows a screenshot of the Renesas F...
4,0becf452098654551210915e6bf307b8,Question:\nI am using RH850 device with CS+. I...,[data/categories/rh850_family/rh850_general/0b...,[],[| |\n| |\n| RH850 |],{'title': 'RH850: Prevent variable from updati...,Question I am using RH850 device with CS. Is i...,[The image shows a screenshot of a computer pr...


In [72]:
df_images_descriptions['image_descriptions'][0]

['The image shows a computer window with the title "Renesas Options" at the top left. The window is divided into two sections: "Property" and "Value". The "Property" section lists various options, including "Configuration", "Emulator", "CPU", "CPU Family", "DeviceFile", "Reset Type", "Stop CPU activities when stopped", "PinMasking", "RH850", "Debug Interface", "LPD Mode Debug Speed", "Oscillator Frequency", "PLL Multiplication Ratio", and "UnlockID". The "Value" section displays the corresponding values for each option, including "Renesas RH850", "RH850 dr7701024.dvi", "NORMAL", "LPD 4-Wire", "4000", "8000", "0", and "RL78".\n\nAt the bottom of the window, there are two buttons: "OK" and "Cancel". The background of the window is light gray, with a darker gray border around the edges. Overall, the image appears to be a screenshot of a computer program or software interface, possibly related to microcontrollers or embedded systems.',
 'The image shows a screenshot of a computer program, 

In [73]:
df_images_descriptions['image_paths'][0]

['data/categories/rh850_family/rh850_general/f95dbeb26c41257522a18c8f6463c248/images/8e30aea5eface8fc259d997da34bfae4.png',
 'data/categories/rh850_family/rh850_general/f95dbeb26c41257522a18c8f6463c248/images/b50f0e66167a52551bbe298e915639da.png']

In [74]:
df_images_descriptions['metadata'][0]

{'title': 'RH850 find dvf file for CS+',
 'url': 'https://en-support.renesas.com/knowledgeBase/21153486',
 'last_updated': None,
 'extracted_at': '2025-03-09T00:29:11.225703'}

In [75]:
df_images_descriptions.to_csv('Data/processed_data_images_descriptions.csv', index=False, header=True)