Data extraction to CSV


In [None]:
import pandas as pd

# Define the input and output file paths
input_file_path = '../hindi-visual-genome-train.txt'
output_file_path = '../hindi-visual-genome-train.csv'

# Read the tab-separated input file, keeping only fields 1-4 of each line
# (written out below as X, Y, Width, Height).
data = []
skipped_lines = 0
with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split the line by tab characters
        columns = line.strip().split('\t')
        # Blank or truncated lines do not carry all four fields; skip them
        # instead of aborting the whole run with an IndexError.
        if len(columns) < 5:
            skipped_lines += 1
            continue
        data.append(columns[1:5])

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) with fewer than 5 tab-separated fields")

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['X', 'Y', 'Width', 'Height'])

# Save the DataFrame to a CSV file (without the index column)
df.to_csv(output_file_path, index=False)

print(f"Data has been successfully extracted and saved to {output_file_path}")


In [None]:
# @title Compute bounding-box corners and merge with the extracted CSV
import pandas as pd

# Function to process a single line of the text file
def process_line(line):
    """Parse one tab-separated record and derive its bounding-box corners.

    The line must contain exactly seven tab-separated fields after
    surrounding whitespace is stripped: image id, x, y, width, height,
    English text and Hindi text. The x/y values are used as the box centre
    when computing the corners.

    Returns:
        A list ``[image_id, x, y, width, height, x_min, y_min, x_max, y_max,
        english_text, hindi_text]`` with the numeric entries as floats, or
        ``None`` when the field count is wrong or a numeric field cannot be
        converted (the latter case also prints a message).
    """
    stripped = line.strip()
    fields = stripped.split('\t')
    if len(fields) != 7:
        return None

    image_id = fields[0]
    english_text, hindi_text = fields[5], fields[6]

    try:
        # The four geometry fields sit between the id and the two captions.
        center_x, center_y, width, height = map(float, fields[1:5])
    except ValueError:
        # Skip lines where conversion to float fails
        print(f"Skipping line due to conversion error: {stripped}")
        return None

    # Half extents are shared by the min and max corner on each axis.
    half_width = width / 2
    half_height = height / 2

    return [
        image_id,
        center_x,
        center_y,
        width,
        height,
        center_x - half_width,
        center_y - half_height,
        center_x + half_width,
        center_y + half_height,
        english_text,
        hindi_text,
    ]

# Read the text file and create a DataFrame
# NOTE(review): the '.../' prefix in the paths below looks like a placeholder;
# confirm the real file locations before running.
input_txt_file = '.../hindi-visual-genome-train.txt'

# Columns shared by both inputs; they form the merge key.
key_columns = ['X', 'Y', 'Width', 'Height']

# process_line returns None for lines it rejects; keep only parsed records.
with open(input_txt_file, 'r', encoding='utf-8') as file:
    data_txt = [record for record in map(process_line, file) if record]

# Create a DataFrame from the text file data
columns_txt = ['image_id', 'X', 'Y', 'Width', 'Height', 'Xmin', 'Ymin', 'Xmax','Ymax', 'English Text', 'Hindi Text']
df_txt = pd.DataFrame(data_txt, columns=columns_txt)

# Ensure that the key columns are floats on both sides before merging
# (this also gives them a float dtype when no line was parsed at all).
df_txt[key_columns] = df_txt[key_columns].astype(float)

# Read the existing CSV file
input_csv_file = '.../hindi-visual-genome-train.csv'
df_csv = pd.read_csv(input_csv_file)

# Coerce every key column to float, turning non-numeric cells into NaN.
# Previously only 'Height' was checked, so a bad value in 'X', 'Y' or 'Width'
# aborted the run inside astype(float).
df_csv[key_columns] = (
    df_csv[key_columns].apply(pd.to_numeric, errors='coerce').astype(float)
)

# Drop rows with an unusable key, then drop fully identical rows: each
# repeated CSV row would otherwise repeat every matching text row in the
# merge below. Converting before filtering also avoids assigning columns on
# a filtered slice (SettingWithCopyWarning).
df_csv = df_csv.dropna(subset=key_columns).drop_duplicates()

# Merge the text DataFrame with the CSV DataFrame on 'X', 'Y', 'Width', and 'Height' columns
df_merged = pd.merge(df_csv, df_txt, on=key_columns)

# Save the resulting DataFrame to a new CSV file
output_file = '.../hindi-visual-genome-train1.csv'
df_merged.to_csv(output_file, index=False)

print(f"Processing complete. Check the {output_file} file for results.")

Integer conversion of bounding-box coordinates

In [None]:
import pandas as pd
import os

# Read the CSV into a DataFrame
df = pd.read_csv(".../hindi-visual-genome-train1.csv")

# Truncate the bounding-box corner columns to integers. No absolute value is
# taken, so negative coordinates stay negative. astype(int) is the vectorised
# equivalent of calling int() on every cell and avoids DataFrame.applymap,
# which is deprecated since pandas 2.1.
columns_to_convert = ['Xmin', 'Ymin', 'Xmax', 'Ymax']
df[columns_to_convert] = df[columns_to_convert].astype(int)

# Save the modified DataFrame back to a CSV file
output_file = ".../hindi-visual-genome-train2.csv"
df.to_csv(output_file, index=False)

print(f"The modified CSV file has been saved to {output_file}")


Converting to the required JSON format

In [None]:
import pandas as pd
import base64
import json

# Source CSV and destination JSON locations.
# Update both paths to the actual file locations before running.
input_csv_file = ".../hindi-visual-genome-train2.csv"
output_json_file = ".../hindi-visual-genome-train.json"

# Read the whole CSV into memory as a DataFrame
df = pd.read_csv(filepath_or_buffer=input_csv_file)

# Define a placeholder for the base64 encoded image
encoded_image = "<base64 encode image>"

# Define the function to process each row
def process_row(row):
    """Build the chat-style JSON record for one row.

    Args:
        row: Mapping-like row (for example a pandas Series from
            ``DataFrame.iterrows``) providing the keys ``image_id``,
            ``Ymin``, ``Xmin``, ``Ymax``, ``Xmax`` and ``Hindi Text``.

    Returns:
        A dict with the image id as a string under ``"id"``, a two-entry
        ``"message"`` list (user prompt with the four location tokens, then
        the Hindi text as the assistant reply) and the module-level
        ``encoded_image`` placeholder under ``"images"``.
    """
    image_id = row['image_id']
    y_min = row['Ymin']
    x_min = row['Xmin']
    y_max = row['Ymax']
    x_max = row['Xmax']
    hindi_text = row['Hindi Text']

    # Construct the JSON structure
    # NOTE(review): coordinates are inserted into the <loc...> tokens as-is,
    # with no padding or normalisation; confirm this matches what the
    # consumer of the file expects.
    json_object = {
        "id": str(image_id),
        "message": [
            {
                "content": f"describe the image in hindi <loc{y_min}><loc{x_min}><loc{y_max}><loc{x_max}>",
                "role": "user"
            },
            {
                # Was the undefined name `hindi`, which raised NameError on
                # every call; the reply is the row's Hindi text.
                "content": hindi_text,
                "role": "assistant"
            }
        ],
        "images": encoded_image  # Use the placeholder for base64 encoded image
    }

    return json_object

# Build one JSON object per DataFrame row, in row order
json_data = []
for _, row in df.iterrows():
    json_data.append(process_row(row))

# Write the collected objects to disk as a single JSON array;
# ensure_ascii=False keeps non-ASCII text unescaped in the file.
with open(output_json_file, mode='w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, indent=4, ensure_ascii=False)

print(f"Processing complete. The JSON file has been saved to {output_json_file}")
