In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from util.vision_util import process_vision_info
from tqdm import tqdm
import torch
import json
import pandas as pd
import os

# Checkpoint directory produced by training; the processor is loaded from it too.
model_dir = "train_output/20240915191425/"

# Load with automatic dtype and device placement, then cast the weights to float16.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir,
    torch_dtype="auto",
    device_map="auto",
)
model = model.half()

# Left padding is requested here; batches are padded later via padding=True.
processor = AutoProcessor.from_pretrained(model_dir, padding_side="left")

# Read the exported test samples.
with open('output_test.json', 'r') as f:
    data = json.load(f)

messages = []
results = []

for message_obj in data:
    try:
        content = message_obj['messages'][0]['content']
        image_path = content[0]['image']
        # The third space-separated word of the stored prompt is the field to extract.
        entity = content[1]['text'].split(' ')[2]
        text_instruction = (
            f"Extract only the value of {entity} with its unit "
            "from the given image, without any extra details."
        )
    except (IndexError, KeyError, ValueError) as e:
        # Malformed samples are reported and left out of the run.
        print(f"Skipping message due to error: {e}")
        continue

    message = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": text_instruction},
        ],
    }]
    messages.append(message)

# Resume support: image paths already present in the results CSV are remembered.
csv_file = 'result.csv'
if not os.path.exists(csv_file):
    processed_df = pd.DataFrame(columns=['index', 'prediction'])
    processed_images = set()
else:
    processed_df = pd.read_csv(csv_file)
    processed_images = set(processed_df['index'].tolist())

def process_batch(batch_messages):
    """Run the model on one batch of conversations and collect its answers.

    Args:
        batch_messages: List of conversations. Each conversation is a list whose
            first message carries the image entry first in its ``content`` list.

    Returns:
        A list of ``{'index': image_path, 'prediction': text}`` dicts, one per
        conversation, or an empty list when ``process_vision_info`` returns no
        image inputs for the batch.
    """
    image_inputs, video_inputs = process_vision_info(batch_messages)
    if not image_inputs:
        return []

    texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in batch_messages
    ]
    # The model is loaded with device_map="auto", so it may not live on the CPU.
    # Send the inputs to wherever the model is instead of hard-coding "cpu".
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each output row starts with its prompt tokens; keep only what follows them.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)

    batch_results = []
    for idx, message in enumerate(batch_messages):
        image_path = message[0]['content'][0]['image']
        batch_results.append({'index': image_path, 'prediction': output_text[idx]})
    return batch_results

batch_size = 2
all_results = []

# Slice of `messages` handled by this run: messages[x:y].
x = 0
y = 25000

# Drop every message whose image path is already recorded in the results CSV.
unprocessed_messages = list(
    filter(
        lambda msg: msg[0]['content'][0]['image'] not in processed_images,
        messages[x:y],
    )
)

batch_starts = range(0, len(unprocessed_messages), batch_size)
for i in tqdm(batch_starts, desc="Processing Batches"):
    batch_messages = unprocessed_messages[i:i + batch_size]
    batch_results = process_batch(batch_messages)
    all_results.extend(batch_results)

    if not batch_results:
        continue

    # Append after every batch; the header is written only when the file is first created.
    batch_df = pd.DataFrame(batch_results)
    write_header = not os.path.exists(csv_file)
    batch_df.to_csv(csv_file, mode='a', header=write_header, index=False)

df = pd.DataFrame(all_results)


In [None]:
import pandas as pd

# Raw predictions written by the inference cell (change the name to use another file).
csv_file = 'result.csv'
df = pd.read_csv(filepath_or_buffer=csv_file)

# Define the function to strip the path and file extension
def modify_index(index_value, extension='.jpg'):
    """Reduce a '/'-separated path to its file name without the extension.

    Args:
        index_value: Path string such as ``'images/test/12345.jpg'``.
        extension: Suffix to strip from the file name. Only a trailing
            occurrence is removed, so ``'.jpg'`` appearing earlier in the
            name is kept.

    Returns:
        The last path component with the trailing ``extension`` removed; the
        component is returned unchanged when it does not end with ``extension``.
    """
    file_name = index_value.split('/')[-1]
    if extension and file_name.endswith(extension):
        file_name = file_name[:-len(extension)]
    return file_name

# Rewrite the 'index' column so it holds bare identifiers instead of paths.
df['index'] = df['index'].map(modify_index)

# Store the result next to the original under a new name.
output_file = 'modified_file.csv'
df.to_csv(output_file, index=False)

print(f"Modified file saved as {output_file}")


In [None]:
import pandas as pd

# Load the test listing and the predictions keyed by bare identifier.
test_df = pd.read_csv('test.csv')
modified_df = pd.read_csv('modified_file.csv')

# Left-join on 'index' so every test row is kept, with or without a prediction.
merged_df = pd.merge(test_df[['index']], modified_df[['index', 'prediction']], on='index', how='left')

# Rows without a prediction get an empty string. The result is assigned back
# because calling fillna(inplace=True) on a selected column is a chained update:
# it emits a FutureWarning on recent pandas and does not modify merged_df under
# Copy-on-Write.
merged_df['prediction'] = merged_df['prediction'].fillna('')

# Save the final dataframe to a CSV file (with a header row, without the row index).
merged_df.to_csv('final_output.csv', index=False)


In [None]:
import pandas as pd
import re

# Define a function to extract the numeric value and preserve the unit
def extract_and_preserve(value):
    """Normalise a prediction into ``"<float> <unit>"`` form.

    The value is converted to a string and must start with an optionally
    signed number (``12``, ``-5``, ``.5``, ``+3.25``). Everything after the
    number on the same line is treated as the unit.

    Args:
        value: Raw prediction; any type is accepted.

    Returns:
        ``"<float> <unit>"`` when a leading number is found, or just
        ``"<float>"`` when nothing follows the number. When the text does not
        start with a number, ``value`` is returned unchanged.
    """
    # The sign sits outside the alternation so it applies to integers as well
    # as decimals; previously "-5 volt" failed to match at all.
    match = re.match(r"([-+]?(?:\d*\.\d+|\d+))\s*(.*)", str(value))
    if not match:
        return value  # Return the original value if extraction fails

    numeric_value = float(match.group(1))
    unit = match.group(2).strip()
    # Avoid a dangling trailing space when there is no unit text.
    return f"{numeric_value} {unit}" if unit else str(numeric_value)

# Load the CSV file and process it
def process_csv(file_path):
    """Normalise the 'prediction' column of a CSV and save it under a new name.

    The file is read with pandas, every 'prediction' entry is passed through
    ``extract_and_preserve``, and the result is written (without the row
    index) to ``<file_path minus a trailing '.csv'>_processed.csv``.

    Args:
        file_path: Path of the CSV to read; it must contain a 'prediction' column.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Apply the extraction and preservation to the 'prediction' column
    df['prediction'] = df['prediction'].apply(extract_and_preserve)

    # Only a trailing ".csv" is removed. str.replace would rewrite every ".csv"
    # in the path and, for a path without ".csv", leave the name unchanged so
    # the input file would be overwritten.
    suffix = ".csv"
    stem = file_path[:-len(suffix)] if file_path.endswith(suffix) else file_path
    output_file = f"{stem}_processed.csv"
    df.to_csv(output_file, index=False)

    print(f"Processing complete! File saved as {output_file}")

# Run the normalisation on the merged predictions (point this at your own CSV if needed).
csv_file_path = "final_output.csv"
process_csv(file_path=csv_file_path)


In [None]:
import pandas as pd

# Define the entity-unit map and allowed units
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Union of every unit that is valid for at least one entity.
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

# Read the CSV file. final_output_processed.csv is written with a header row,
# so header=0 consumes that line while `names` renames the two columns.
# With header=None the header line would be read as a data row.
df = pd.read_csv('final_output_processed.csv', header=0, names=['id', 'value'])

# Define a function to process units
def process_unit(value, units=None):
    """Keep a value whose text mentions an allowed unit, else force 'inch'.

    Args:
        value: Entry to check; it is converted to a string first.
        units: Iterable of accepted unit names. Defaults to the module-level
            ``allowed_units`` when omitted.

    Returns:
        The string form of ``value`` unchanged when any unit name occurs in it
        as a case-insensitive substring. Otherwise the first
        whitespace-separated token followed by ``' inch'``, or ``'0 inch'``
        when the string has no tokens at all.
    """
    if units is None:
        units = allowed_units

    value_str = str(value)  # Convert the value to string
    lowered = value_str.lower()  # lower-case once instead of once per unit
    if any(unit.lower() in lowered for unit in units):
        return value_str  # Return the original value if unit is valid

    # If unit is not valid, default to 'inch'. An empty or whitespace-only
    # string has no first token, so fall back to '0' instead of raising.
    tokens = value_str.split()
    leading = tokens[0] if tokens else '0'
    return leading + ' inch'

# Run the unit check over every entry of the 'value' column.
df['value'] = df['value'].map(process_unit)

# Write the result with neither a header row nor the row index.
df.to_csv('updated_final.csv', header=False, index=False)

print("Units updated and saved to 'updated_final.csv'")


In [None]:
import pandas as pd
import re

# Unit vocabularies shared by several entities.
_length_units = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_weight_units = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_volume_units = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
    'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
)

# Entity name -> set of units accepted for it (each entity gets its own set object).
entity_unit_map = {
    'width': set(_length_units),
    'depth': set(_length_units),
    'height': set(_length_units),
    'item_weight': set(_weight_units),
    'maximum_weight_recommendation': set(_weight_units),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': set(_volume_units),
}

# Every unit accepted by at least one entity.
allowed_units = set().union(*entity_unit_map.values())

# The file has no header row, so column names are supplied here.
df = pd.read_csv('updated_final.csv', header=None, names=['id', 'value'])

# Define a function to extract and validate the unit
def process_unit(value):
    """Validate the trailing unit of a value, replacing it with 'inch' if unknown.

    The string form of ``value`` is returned untouched when its trailing run of
    letters and whitespace, stripped and lower-cased, is a member of
    ``allowed_units``. Otherwise the leading run of digits, dots, brackets,
    commas and whitespace is kept (stripped) and ``' inch'`` is appended;
    when there is no such leading run the result is ``'0 inch'``.
    """
    value_str = str(value)
    fallback_unit = 'inch'

    # Trailing letters/whitespace are the candidate unit.
    trailing_text = re.search(r'([a-zA-Z\s]+)$', value_str)
    if trailing_text and trailing_text.group(1).strip().lower() in allowed_units:
        return value_str

    # Unknown or missing unit: keep only the numeric prefix, if there is one.
    numeric_prefix = re.match(r'^[\d\.\[\],\s]+', value_str)
    if not numeric_prefix:
        return '0 ' + fallback_unit
    return numeric_prefix.group().strip() + ' ' + fallback_unit

# Validate the unit of every entry in the 'value' column.
df['value'] = df['value'].map(process_unit)

# Overwrite the input file, again without header row or row index.
df.to_csv('updated_final.csv', header=False, index=False)

print("Units updated and saved to 'updated_final.csv'")


In [None]:
import pandas as pd
import re

# Entity name -> units accepted for that entity.
entity_unit_map = dict(
    width={'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    depth={'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    height={'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    item_weight={'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    maximum_weight_recommendation={'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    voltage={'kilovolt', 'millivolt', 'volt'},
    wattage={'kilowatt', 'watt'},
    item_volume={
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
        'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
    },
)

# Flatten the per-entity sets into one set of accepted units.
allowed_units = {unit for units in entity_unit_map.values() for unit in units}

# Headerless input: name the two columns explicitly.
df = pd.read_csv('updated_final.csv', header=None, names=['id', 'value'])

# Define a function to process each entry
def process_entry(value, units=None):
    """Rebuild an entry as ``"<float> <unit>"`` with a validated unit.

    Args:
        value: Entry to normalise; it is converted to a string first.
        units: Collection of accepted unit names. Defaults to the module-level
            ``allowed_units`` when omitted.

    Returns:
        ``"<number> <unit>"`` where ``<number>`` is the leading unsigned number
        as a float (``0.0`` when the string does not start with one) and
        ``<unit>`` is the trailing run of letters/whitespace, stripped and
        lower-cased, when it is in ``units``; otherwise ``'inch'``.
    """
    if units is None:
        units = allowed_units

    value_str = str(value)

    # Match one well-formed number only. The previous character class
    # [\d\.]+ could capture text such as "1.2.3" or ".", which float() rejects
    # with ValueError. Inputs that parsed before still give the same number.
    numerical_match = re.match(r'\d+(?:\.\d*)?|\.\d+', value_str)
    numerical_value = float(numerical_match.group()) if numerical_match else 0.0

    # Extract unit
    unit_match = re.search(r'([a-zA-Z\s]+)$', value_str)
    extracted_unit = unit_match.group().strip().lower() if unit_match else ''

    # Default to 'inch' if unit is not valid or not found
    unit = extracted_unit if extracted_unit in units else 'inch'
    return f"{numerical_value} {unit}"

# Rebuild every entry of the 'value' column in "<number> <unit>" form.
df['value'] = df['value'].map(process_entry)

# Overwrite the input file; no header row and no row index are written.
df.to_csv('updated_final.csv', header=False, index=False)

print("Units updated and saved to 'updated_final.csv'")


In [None]:
import pandas as pd

# Read the CSV files. updated_final.csv is written without a header row, so its
# columns are named here. test.csv does have a header row (an earlier cell
# selects its 'index' column by name), so it is read with that header and only
# the 'index' column is loaded.
updated_df = pd.read_csv('updated_final.csv', header=None, names=['index', 'value'])
test_df = pd.read_csv('test.csv', usecols=['index'])

# Convert 'index' columns to ensure proper matching
updated_df['index'] = updated_df['index'].astype(str)
test_df['index'] = test_df['index'].astype(str)

# Find indexes present in updated_final.csv but not in test.csv
indexes_in_updated_not_in_test = set(updated_df['index']) - set(test_df['index'])

# Keep only the rows whose index also appears in test.csv
filtered_df = updated_df[~updated_df['index'].isin(indexes_in_updated_not_in_test)]

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv('filtered_updated_final.csv', index=False, header=False)

# Report the file that was actually written.
print("Filtered data saved to 'filtered_updated_final.csv'")
