<a href="https://colab.research.google.com/github/pralov-malla/Quantized-Qwen2.5-VL-for-image-data-extraction/blob/main/Semantic_Data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries used by this notebook (quiet output via -q).
!pip install -q transformers torch qwen-vl-utils accelerate pandas tqdm bitsandbytes

In [None]:
import pandas as pd
import torch, json

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the IELTS Task 1 dataset CSV from the mounted Drive folder.
df_task1 = pd.read_csv("/content/drive/MyDrive/datasets/IELTS_task1_data.csv")

In [None]:
# Report which CUDA device (and how much memory) this runtime exposes,
# or tell the user how to enable one.
if not torch.cuda.is_available():
    print("No GPU! Go to Runtime > Change runtime type > GPU")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


In [None]:
# Preview the first five rows of the dataset.
df_task1.head(5)

In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

In [None]:
# bitsandbytes settings applied when the model weights are loaded in 4-bit form.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 quantization data type
    bnb_4bit_use_double_quant=True,         # nested (double) quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

In [None]:
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# The processor does not depend on the model weights, so fetch it first.
processor = AutoProcessor.from_pretrained(model_name)

# Load the checkpoint with the 4-bit quantization settings defined earlier;
# device_map="auto" leaves device placement to the library.
# NOTE(review): no torch_dtype is passed here - confirm the dtype chosen for the
# non-quantized modules is compatible with the bfloat16 compute dtype.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name, device_map="auto", quantization_config=quantization_config
)

print("Model loaded successfully with 4-bit quantization!")

In [None]:
# System prompt for the vision model. It defines the extractor role, the output
# rules, the required top-level JSON shape, and one "structure" template per
# visual type (bar chart, line graph, process diagram, table, map, pie chart).
# NOTE(review): the templates below contain `//` comments and "a | b" placeholder
# values, while the output rules demand comment-free valid JSON - confirm the
# model does not copy them into its answers.
IELTS_TASK1_VISION_SYSTEM_PROMPT = """
You are an IELTS Task 1 VISUAL METADATA EXTRACTOR.

Your job
-------
- Look at ONE IELTS Academic Task 1 image (chart / graph / table / map / process / combo).
- Output ONE JSON object describing ALL visuals in the image and their key features.
- This JSON is stored as `meta_data` and used by another model to score essays.
- The scoring model NEVER sees the image, only your JSON → your metadata must be complete.

Output rules
------------
1) Output EXACTLY ONE JSON object. NO extra text, NO explanations, NO markdown, NO comments.
2) JSON must be valid: double quotes for strings/keys, no trailing commas.
3) If information is not visible/unclear, use null (or [] for arrays).
4) Numbers read from axes or bars/lines:
   - If the exact value is printed (e.g. in a cell or label), you MAY set "approximate": false.
   - Otherwise treat it as an estimate: round sensibly and set "approximate": true.
5) Do NOT invent categories, years, entities, or stories that are not in the image.
6) Always describe the WHOLE TASK, not just the first visual:
   - `topic_context` and `global_semantics` must summarise ALL visuals together.
   - Secondary visuals are still essential for scoring.

Overall category
----------------
Set "task_visual_category" to ONE of:
- "bar_chart", "line_graph", "process_diagram", "multiple_graphs", "table", "map", "pie_chart".

If the image has TWO OR MORE distinct visuals (e.g. two line graphs; table + pie; two maps; bar + line):
- Set "task_visual_category": "multiple_graphs".
- Put EACH visual in the "visuals" array with its own "visual_type" and "structure".
- Use "relationships_between_visuals" to describe how they connect.

Required top-level JSON shape
-----------------------------
Always output a JSON object with these keys:

{
  "schema_version": "task1_v2",

  "task_visual_category": "...",

  "topic_context": {
    "title": null,
    "subtitle": null,
    "topic_summary": null,

    "time_dimension": {
      "has_time_dimension": false,
      "time_unit": null,               // e.g. "year", "month", "decade", "other"
      "start": null,
      "end": null,
      "labels": []                     // ordered list such as ["1920","1940","1960"]
    },

    "measurement_description": null,   // e.g. "percentage of households", "hours per week"
    "value_unit": null,                // "percent" | "number" | "index" | "score" | "hours" | "other" | null
    "main_entities": []                // e.g. countries, age groups, appliances, locations
  },

  "global_semantics": {
    "overview": null,                  // one or two sentences for the WHOLE task
    "key_features": [                  // 3–8 key features covering ALL visuals
      { "description": "", "importance": "high" }
    ],
    "extremes": [],                    // highs/lows and biggest changes across the task
    "comparisons": []                  // important comparisons (between groups, years, metrics, or visuals)
  },

  "visuals": [
    {
      "visual_id": "v1",
      "visual_type": "bar_chart | line_graph | pie_chart | table | process_diagram | map",
      "role": "primary | secondary",

      "panel_label": null,             // e.g. "Before", "After", "Canada", "Australia", "Top", "Bottom"
      "title": null,

      "local_overview": {
        "main_message": null,          // one-sentence overview of THIS visual
        "key_features": []             // bullet-style features for THIS visual
      },

      "structure": {}                  // see type-specific specs below
    }
  ],

  "relationships_between_visuals": [
    {
      "relationship_type": "before_after | different_groups | different_metrics | summary_vs_detail | redevelopment | other",
      "description": "",
      "visual_ids": []                 // e.g. ["v1","v2"]
    }
  ],

  "raw_text_elements": [
    {
      "role": "title | axis_label | legend | annotation | note | other",
      "text": ""                       // include at least all titles, axis labels, legend items
    }
  ],

  "extraction_notes": {
    "model_confidence_overall": null,  // optional 0–1
    "warnings": [],                    // e.g. ["right axis labels are blurry"]
    "assumptions": []                  // e.g. ["assumed unit is percent due to % symbol"]
  }
}

IELTS focus for semantics
-------------------------
Your "overview", "key_features", "extremes", "comparisons" and per-visual "local_overview"
must capture what a Band 7–9 candidate should mention:
- overall trends/overview for ALL visuals,
- clear statements of increase/decrease/stability,
- very important differences between groups or time periods,
- extremes (highest, lowest, biggest rise/fall),
- for multiple visuals: the relationship between them (e.g. as X increases, Y decreases).

Do NOT just restate the title; give meaningful information a good essay would describe.

Type-specific "structure"
=========================

1) BAR CHART  (visual_type = "bar_chart")
-----------------------------------------
"structure": {
  "bar_chart_type": "single | grouped | stacked",
  "orientation": "vertical | horizontal",

  "categories": [           // ordered labels on the category axis
    "1990",
    "2000"
  ],

  "series": [
    {
      "label": "USA",
      "data": [
        { "category": "1990", "value": 9.0, "approximate": true }
      ],
      "series_summary": null   // short pattern for this series if helpful
    }
  ],

  "extremes": {
    "highest_values": [ "" ],  // textual descriptions (e.g. "USA in 2010 has the highest value")
    "lowest_values": [ "" ]
  },

  "patterns": [ "" ]           // overall patterns/trends in this bar chart
}

2) LINE GRAPH  (visual_type = "line_graph")
-------------------------------------------
"structure": {
  "x_axis_type": "time | category | numeric",

  "x_labels": [                // ordered x-axis labels
    "1920",
    "1940"
  ],

  "y_unit": "percent | number | index | hours | other | null",

  "series": [
    {
      "label": "Washing machine",
      "points": [
        { "x_label": "1920", "x_numeric": 1920, "y_value": 40.0, "approximate": true }
      ],
      "trend_summary": null    // e.g. "steady increase", "sharp drop then recovery"
    }
  ],

  "extremes": {
    "overall_max_points": [ "" ],
    "overall_min_points": [ "" ]
  },

  "patterns": {
    "overall_trend": [ "" ],   // trends across all lines
    "cross_series_comparisons": [ "" ],
    "crossing_points": [ "" ]  // where lines cross or one overtakes another
  }
}

3) PROCESS DIAGRAM  (visual_type = "process_diagram")
-----------------------------------------------------
"structure": {
  "process_title": null,
  "is_cycle": false,

  "stages": [
    {
      "name": "",               // e.g. "Milk is collected"
      "order_index": 0,
      "is_start": false,
      "is_end": false,
      "description": null       // short description if visible
    }
  ],

  "overall_process_summary": {
    "main_phases": [ "" ],      // e.g. ["Collection", "Processing", "Distribution"]
    "overall_description": null // short summary of the whole process
  }
}

4) TABLE  (visual_type = "table")
---------------------------------
"structure": {
  "table_title": null,

  "row_headers": [ "" ],
  "column_headers": [ "" ],

  "values": [
    {
      "row": "",
      "column": "",
      "value": null,
      "approximate": false
    }
  ],

  "extremes": {
    "highest_cells": [ "" ],    // e.g. "Teens using email has the highest value"
    "lowest_cells": [ "" ]
  },

  "comparisons": [ "" ]         // key comparisons across rows/columns
}

5) MAP  (visual_type = "map")
-----------------------------
"structure": {
  "base_region_description": null,     // e.g. "College campus", "Industrial site",

  "scenarios": [
    {
      "label": "",                     // e.g. "2006", "Now"
      "description": null,
      "features": [
        {
          "label": "",                 // e.g. "Factory", "Car park"
          "type": "building | road | park | car_park | path | water | sports_facility | other",
          "category": "industrial | academic | residential | commercial | recreational | transport | other",
          "status": "existing | planned | removed"
        }
      ]
    }
  ],

  "changes_between_scenarios": [
    {
      "description": "",               // e.g. "Factory replaced by housing estate",
      "change_type": "added | removed | relocated | expanded | reduced | changed_use",
      "involved_labels": []
    }
  ],

  "summary": {
    "main_changes": [ "" ],
    "before_after_contrast": null
  }
}

6) PIE CHART  (visual_type = "pie_chart")
-----------------------------------------
"structure": {
  "context_label": null,               // e.g. "Household expenditure", "Revenue sources"
  "is_donut_chart": false,

  "slices": [
    {
      "label": "",                     // e.g. "Food"
      "category": "expenditure_category | population_group | language | marital_status | other",
      "percentage": null,
      "approximate": true
    }
  ],

  "percentage_sum_check": {
    "total_percentage": null,
    "is_approximately_100": false
  },

  "extremes": {
    "largest_slices": [ "" ],
    "smallest_slices": [ "" ]
  },

  "patterns": [ "" ]                   // e.g. "Most spending goes on housing and food."
}

Multiple visuals and relationships
----------------------------------
If the image has MORE THAN ONE visual:
- "task_visual_category" MUST be "multiple_graphs".
- Include EVERY visual in the "visuals" array.
- `global_semantics.overview`, `key_features`, `extremes`, and `comparisons` MUST refer to ALL visuals, not only the first.
- Use "relationships_between_visuals" with correct "relationship_type":
  - "before_after": same place or group at different times (e.g. maps of 2000 vs 2020).
  - "different_groups": same metric for different groups (e.g. Canada vs Australia charts).
  - "different_metrics": different metrics over same time or groups (e.g. appliance ownership vs hours of housework).
  - "summary_vs_detail": one visual summarises, another gives breakdown (e.g. table + pie charts).
  - "redevelopment": old vs new site, especially maps of developments.

Final check before answering
----------------------------
Before you output, mentally check:
- JSON is syntactically valid and includes ALL required top-level keys.
- "task_visual_category" is correct.
- Every visual in the image appears in "visuals" with the right "visual_type".
- Numbers use "approximate" correctly and are not over-precise.
- "overview", "key_features", "extremes", and "comparisons" cover the whole task and include any important relationship between visuals.
- Then output ONLY the final JSON object.
"""


In [None]:
# Count how many rows share each distinct value of the 'image' column.
df_task1['image'].value_counts()

## Note: There are only 340 unique images

In [None]:
def extract_image_metadata(image_data, *, max_new_tokens=2200):
    """
    Run Qwen2.5-VL on one IELTS Task 1 image and return the generated text.

    The image is sent together with IELTS_TASK1_VISION_SYSTEM_PROMPT, which asks
    the model for a single JSON object. The response is returned exactly as
    decoded: it is NOT parsed or validated here, so callers must be prepared
    for text that is not valid JSON (for example, output cut off at the
    token limit).

    Args:
        image_data: Value placed in the "image" field of the user message and
            resolved by `process_vision_info`.
            NOTE(review): presumably whatever the dataset's `image` column
            holds (path, URL, or encoded image) - confirm against the data.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 2200, the value previously hard-coded, which leaves
            room for the detailed JSON the prompt requests.

    Returns:
        str: The decoded model response, without the prompt tokens and without
        special tokens.
    """
    messages = [
        {"role": "system", "content": IELTS_TASK1_VISION_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_data},
                {
                    "type": "text",
                    "text": "Analyze this IELTS Task 1 image and provide the complete JSON metadata as specified.",
                },
            ],
        },
    ]

    # Render the chat template to text and collect the vision inputs separately;
    # the processor combines both into model-ready tensors.
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion for each sequence; drop the prompt part.
    completion_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        completion_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # A single prompt was submitted, so there is exactly one decoded response.
    return decoded[0]

In [None]:
# Smoke-test the extractor on one 'Multiple Graphs' example.
# .iloc[1] selects the second row of that subset, so the message says so.
print("Testing extraction on the second 'Multiple Graphs' image...")
test_result = extract_image_metadata(
    df_task1[df_task1['topic'] == 'Multiple Graphs']['image'].iloc[1])

In [None]:
# List the distinct values found in the 'topic' column.
df_task1['topic'].unique()

In [None]:
# Display the 'image' value at position 1 among the 'Multiple Graphs' rows
# (the same expression that is passed to the test extraction).
df_task1[df_task1['topic'] == 'Multiple Graphs']['image'].iloc[1]

In [None]:
# Print the text returned by the test extraction.
print(test_result)