### **Creating a Dataset for T5 Fine-Tuning**

In [99]:
import pandas as pd
import random

Charts
🔹 1. Bar & Categorical Charts

Bar Chart

Grouped Bar Chart

Stacked Bar Chart

Horizontal Bar Chart

🔹 2. Line & Area Charts

Line Chart

Multi-Line Chart

Step Line Chart

Area Chart

Stacked Area Chart

🔹 3. Distribution & Summary Charts

Histogram

Box Plot

Violin Plot

Pie Chart

Donut Chart

🔹 4. Business & KPI Charts

Treemap

Waterfall Chart

Funnel Chart

Gauge Chart

KPI Card / Metric

Combo Chart (Bar + Line on Dual Axis)

🔹 5. Correlation & Relationships

Scatter Plot

Bubble Chart

Correlation Heatmap

🔹 6. Geographic & Map Visuals

Geo Scatter Map

Choropleth Map

## Chart Text

### 1. Bar & Categorical Charts

In [100]:

# === X and Y options ===
# Candidate x-axis field names. The generation loops draw one entry per sample
# with random.choice(x_options), so a name listed twice would be sampled twice
# as often as the others; every name therefore appears exactly once.
# Spaced and underscored spellings ("mission type" / "mission_type") are kept
# as separate entries -- presumably deliberate variety; confirm.
x_options = [
    "product", "region", "month", "category", "department", "city", "country",
    "mission", "mission_type", "mission type", "program", "area", "zone", "district",
    "subject", "class", "course", "project", "team", "gender", "age group",
    "income group", "education level", "state", "continent", "year", "quarter",
    "platform", "source", "campaign", "device", "customer type", "service",
    "incident type", "alert type", "status", "type", "location", "sector",
    "industry", "browser", "os", "brand", "store", "channel", "model", "variant",
    "mode", "segment", "specialization"
]

# Candidate y-axis (measure) field names; the generation code picks one per
# sample with random.choice(y_options). Names mix spaces and underscores
# (e.g. "success_rate" vs "failure rate") -- presumably intentional; confirm.
y_options = [
    "sales", "revenue", "profit", "amount", "budget", "expenses", "score", "rating",
    "duration", "time", "cost", "income", "turnover", "performance", "success_rate",
    "failure rate", "completion rate", "accuracy", "efficiency", "satisfaction",
    "response time", "clicks", "views", "impressions", "downloads", "votes",
    "likes", "dislikes", "usage", "frequency", "count", "population", "cases",
    "alerts", "errors", "issues", "incidents", "losses", "assets", "balance",
    "temperature", "humidity", "rainfall", "pollution", "concentration",
    "speed", "delay", "growth", "decline", "reach"
]

# === Filters ===
# Free-text filter clauses. When a sample is chosen to carry a filter, one
# phrase is appended verbatim to the input text and copied verbatim into the
# target's "filter:" field, so these strings are part of the training labels.
# NOTE(review): the last entry contains an en dash (U+2013), not an ASCII
# hyphen -- confirm that is intended.
filter_phrases = [
    "in 2023", "in the year 2022", "after 2021", "before 2020", ">= 2022",
    "<= 2021", "equal to 2023", "not equal to 2020", "where region is Asia",
    "where region = Asia", "only for India", "if year is 2023", "mission type is Mars",
    "when gender is Female", "for category A", "where program = Apollo", "sales > 500",
    "sales less than 1000", "amount >= 100", "amount < 300", "for first quarter",
    "if month is June", "campaign equals BlackFriday", "source is YouTube",
    "platform = Android", "for the department Finance", "device is Mobile",
    "continent = Europe", "status is Active", "budget greater than 1000","where age group = 30–40"
]

# === Bar chart templates ===
# Phrasings for a plain bar-chart request. Each template is filled with
# template.format(x=x, y=y) and paired with the target
# "chart: bar, x: <x>, y: <y>", so every template must contain both {x} and
# {y}; a template that omits a placeholder would yield a target naming a field
# that never appears in the input text.
bar_templates = [
    "Show bar chart for {y} by {x}",
    "Plot {y} grouped by {x}",
    "Draw bar chart of {x} vs {y}",
    "Visualize {y} for each {x}",
    "Give bar chart showing {x} and their {y}",
    "Create a bar graph for {y} across {x}",
    "Bar chart of total {y} per {x}",
    "How does {y} vary by {x}?",
    "Give a graph of {y} against {x}",
    "Display bar graph of {x} by {y}",
    "Bar chart for {x} and {y}",
    "I want bar chart of {y} by {x}",
    "Graph showing {x} vs {y}",
    "Chart: {y} grouped by {x}",
    "Can you make a bar chart using {x} and {y}?",
    "bar of {x} and {y}",
    "bar chart for each {x} with value of {y}",
    "Draw a bar chart: {x}, {y}",
    "Make chart showing {x} with {y}",
    "Compare {y} by {x} in bar",
    "Show me bar chart of {x} by {y}",
    "Chart {y} against {x} using bars",
    "Visual of {x} and {y} in bars",
    "Bar representation of {y} across {x}",
    "bargraph: {x} vs {y}",
    "display {y} by different {x} values",
    "total {y} grouped over {x} in bar",
    "{x} wise bar chart for {y}",
    "Bar plot showing {x} with {y}",
    "Graph {x} against {y} (bar)"
]

# === Noise function ===
def introduce_noise(text):
    """Return ``text`` with one randomly chosen kind of corruption applied.

    One of three noise kinds is drawn uniformly:

    * ``"spelling"`` -- swap two adjacent characters at a random position
      (only when ``text`` is longer than 5 characters; otherwise ``text`` is
      returned unchanged).
    * ``"case"`` -- lower-case the whole string or upper-case it, with equal
      probability.
    * ``"grammar"`` -- split on whitespace and, when there are more than four
      words, drop one word that is neither the first nor the last; the words
      are re-joined with single spaces either way.
    """
    kind = random.choice(["spelling", "case", "grammar"])

    if kind == "case":
        return text.lower() if random.random() < 0.5 else text.upper()

    if kind == "grammar":
        tokens = text.split()
        if len(tokens) > 4:
            # Index range 1..len-2 keeps the first and last word in place.
            del tokens[random.randint(1, len(tokens) - 2)]
        return " ".join(tokens)

    # Remaining kind is "spelling": short strings are left untouched.
    if len(text) <= 5:
        return text
    pos = random.randint(0, len(text) - 2)
    chars = list(text)
    chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
    return "".join(chars)

# === Generate samples ===
# Seed the global RNG so that Restart & Run All regenerates the same dataset;
# every later random.choice / random.random / random.randint call in the
# notebook draws from this one stream.
SEED = 42
random.seed(SEED)

samples = []
samples_per_template = 100
filter_probability = 0.4  # 40% filtered
noise_probability = 0.15  # 15% noisy

for template in bar_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        input_text = template.format(x=x, y=y)
        target_text = f"chart: bar, x: {x}, y: {y}"

        # A filter clause is added to the input and mirrored in the target.
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text += f" {filter_text}"
            target_text += f", filter: {filter_text}"

        # Noise corrupts only the input; the target stays clean.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append({"input": input_text, "target": target_text})

# === Collect into a DataFrame (held in memory; nothing is written to disk) ===
df = pd.DataFrame(samples)


In [101]:
df.shape

(3000, 2)

### 2. Grouped Bar Chart

In [102]:
# Candidate field names for the "group:" slot of a chart spec; one is drawn
# per sample with random.choice(group_options).
# NOTE(review): "income_group" is underscored here while other lists in this
# notebook spell it "income group" -- presumably intentional; confirm.
group_options = [
    "region", "gender", "category", "department", "age group", "year",
    "quarter", "month", "city", "country", "state", "zone", "district",
    "team", "campaign", "platform", "source", "program", "project type",
    "device", "browser", "status", "education level", "income_group",
    "continent", "mission type", "alert type", "customer type",
    "type", "store"
]

# Phrasings for a grouped-bar request. Each template carries the {x}, {y} and
# {group} placeholders and is filled with template.format(x=x, y=y, group=group).
grouped_templates = [
    "Show grouped bar chart of {y} by {x} and {group}",
    "Grouped bars for {y} by {x} and {group}",
    "Compare {y} across {x} grouped by {group}",
    "Grouped chart for {y} by {x} with groups {group}",
    "Bar chart for {x} vs {y} grouped on {group}",
    "Chart {y} by {x} and group with {group}",
    "I want grouped bar of {y} using {x} grouped by {group}",
    "Grouped bar: {x}, {y}, {group}",
    "Visualize {y} for each {x} with grouping by {group}",
    "Grouped bars showing {x} and {group} for {y}",
    "Plot {y} grouped by {x} and then {group}",
    "Make grouped bar comparing {x} by {group} with {y}",
    "Give a grouped bar chart of {x} and {group} using {y}",
    "Bar chart with grouping on {group} showing {y} by {x}",
    "Can you display grouped bar: {y}, {x}, {group}?",
    "Grouped chart: show {y} for {x} grouped by {group}",
    "Graph of {y} across {x}, split by {group}",
    "Create grouped bar showing {x}, {group}, and values of {y}",
    "I want bar chart with {x} on x-axis, {y} as value, grouped by {group}",
    "Grouped visualization for {y} by {x} & {group}",
    "Chart of {y} over {x} separated by {group}",
    "How does {y} differ for {x} with groups from {group}?",
    "Use grouped bars to compare {y} by {x} split by {group}",
    "Bar comparison of {x} with {group} for {y}",
    "Render grouped bar of {x}, {group}, and their {y}",
    "Make a bar graph comparing {y} across {x}, grouped by {group}",
    "Bar analysis of {y} over {x} per {group}",
    "Breakdown of {y} by {x} grouped by {group}",
    "Draw grouped bar of {x} with {y} and split by {group}",
    "Grouped bar graph: {x}, {group}, value = {y}"
]


# Generation settings for the grouped-bar samples.
filter_probability = 0.4
samples = []

# Instantiate every template 100 times with independently drawn fields.
for template in grouped_templates:
    for _ in range(100):
        x, y = random.choice(x_options), random.choice(y_options)
        group = random.choice(group_options)

        prompt = template.format(x=x, y=y, group=group)
        label = f"chart: grouped_bar, x: {x}, y: {y}, group: {group}"

        # Optionally append a filter clause to both the prompt and the label.
        if random.random() < filter_probability:
            clause = random.choice(filter_phrases)
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})

# Collect into a DataFrame (kept in memory; this cell writes no file).
df_grouped = pd.DataFrame(samples)

print(f" Saved {len(df_grouped)} grouped bar chart samples.")

 Saved 3000 grouped bar chart samples.


In [103]:
# Append the grouped-bar samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_grouped a second time.
df = pd.concat([df, df_grouped], ignore_index=True)


In [104]:
df.shape

(6000, 2)

### 3. Stacked Bar Chart

In [105]:
# Candidate field names for the "stack:" slot of a stacked chart spec.
stack_options = [
    "gender", "category", "region", "department", "year", "quarter", "month",
    "platform", "device", "status", "program", "source", "campaign", "country",
    "age group", "education level", "continent", "mission type", "product type",
    "channel", "zone", "segment", "browser", "os", "state", "project phase",
    "variant", "model", "income group", "alert type"
]

# Phrasings for a stacked-bar request. Each template carries the {x}, {y} and
# {stack} placeholders and is filled with template.format(x=x, y=y, stack=stack).
stacked_bar_templates = [
    "Show stacked bar chart for {y} by {x} stacked by {stack}",
    "Create a stacked bar graph with {x} and {y} layered by {stack}",
    "Plot {y} against {x}, stack by {stack}",
    "Visualize {y} by {x} and stack using {stack}",
    "Stack {y} on {x} using {stack} levels",
    "Bar chart with stacked {y} values grouped by {x} and {stack}",
    "Display stacked bars of {y} per {x}, split by {stack}",
    "Compare {y} by {x}, stacked with {stack}",
    "Stack {stack} within bars showing {y} for {x}",
    "Use {stack} to stack bars of {y} over {x}",
    "Can you give stacked bar of {y} across {x} with {stack} as layers?",
    "Give a stacked bar showing {x} vs {y}, layered by {stack}",
    "Stacked bar plot for {y} over {x} using {stack}",
    "Make a bar chart for {x}, stacked {y} by {stack}",
    "Group {y} on {x} with stacking via {stack}",
    "Stacked visualization for {x} with {y}, use {stack}",
    "Layer {y} values on top of {x} by {stack}",
    "Plot stacked bar with {x} categories, {y} values and stack by {stack}",
    "Stack the {stack} categories in bar for {y} across {x}",
    "Use {stack} as grouping variable to stack {y} bars by {x}",
    "Stacked bar showing {y} for each {x}, grouped by {stack}",
    "Layered bar chart: {x}, {y}, and stack by {stack}",
    "How does {y} vary by {x} with stacked {stack}?",
    "Create a multi-color stacked bar for {x}, {y}, and {stack}",
    "Stacked plot showing {x} categories, {y} amounts, and {stack}",
    "Make bars with {x} on axis, stack {y} by {stack}",
    "Show grouped stacked chart for {x} and {y} with stack {stack}",
    "Build stacked bars to represent {y} by {x}, layered by {stack}",
    "Create bar chart using {x} vs {y} and add {stack} for stacked levels",
    "I want a stacked bar chart of {y} by {x} using {stack}"
]

# Accumulator and settings for the stacked-bar samples.
stacked_samples = []

filter_probability = 0.4

for template in stacked_bar_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        # The stack field comes from stack_options (the pool defined for
        # stacked charts), not from the grouped-bar pool group_options.
        stack = random.choice(stack_options)
        input_text = template.format(x=x, y=y, stack=stack)
        target_text = f"chart: stacked_bar, x: {x}, y: {y}, stack: {stack}"

        # A filter clause is added to the input and mirrored in the target.
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"{target_text}, filter: {filter_text}"

        # Noise corrupts only the input; the target stays clean.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        stacked_samples.append({"input": input_text, "target": target_text})

df_stacked = pd.DataFrame(stacked_samples)


In [106]:
# Append the stacked-bar samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_stacked a second time.
df = pd.concat([df, df_stacked], ignore_index=True)


In [107]:
df.shape

(9000, 2)

### 4 Horizontal Bar Chart

In [108]:
# Phrasings for a horizontal-bar request. Each template carries the {x} and
# {y} placeholders and is filled with template.format(x=x, y=y).
horizontal_bar_templates = [
    "Show horizontal bar chart of {y} by {x}",
    "Draw a horizontal bar plot for {x} vs {y}",
    "Create horizontal bar graph for {y} grouped by {x}",
    "Give horizontal bars showing {x} and their {y}",
    "Visualize {y} by {x} in horizontal bars",
    "Horizontal bars for each {x} with value {y}",
    "Display a horizontal bar chart using {x} and {y}",
    "Horizontal bar graph of total {y} per {x}",
    "Plot {y} on horizontal axis with categories {x}",
    "Use horizontal bars to represent {y} by {x}",
    "Graph horizontally: {x} vs {y}",
    "Bar chart with horizontal orientation: {x} and {y}",
    "Make horizontal bar chart for {y} across {x}",
    "Visual: horizontal bar of {x} and {y}",
    "Bar chart (horizontal) for {x} and {y}",
    "Show me horizontal bar for {y} vs {x}",
    "Chart showing horizontal bars of {y} grouped by {x}",
    "Compare {x} and {y} using horizontal bars",
    "Plot {y} values using horizontal bars for {x}",
    "Create a horizontal chart for {x} and {y}",
    "Horizontal bar plot with x: {x}, y: {y}",
    "Graph {y} along horizontal axis with categories {x}",
    "Build a horizontal bar visual with {x} and {y}",
    "Chart with horizontal bars showing {x} vs {y}",
    "Horizontally visualize {y} by {x}",
    "Horizontal grouped bars for {x} and {y}",
    "Horizontal visualization of {y} across {x}",
    "Render horizontal bar chart for {y} by {x}",
    "Generate horizontal bars for {x} and {y}",
    "Horizontal plot of {x} with values of {y}"
]

# Fresh accumulator and filter rate for the horizontal-bar samples.
samples = []

filter_probability = 0.4

# Instantiate every template 100 times with independently drawn fields.
for template in horizontal_bar_templates:
    for _ in range(100):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        label = f"chart: horizontal_bar, x: {x}, y: {y}"

        # Draw the filter decision first, then the clause only when needed.
        clause = random.choice(filter_phrases) if random.random() < filter_probability else None
        if clause is not None:
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})

# Collect into a DataFrame (kept in memory; this cell writes no file).
df_horizontal_bar = pd.DataFrame(samples)


In [109]:
# Append the horizontal-bar samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_horizontal_bar a second time.
df = pd.concat([df, df_horizontal_bar], ignore_index=True)


In [110]:
df.shape

(12000, 2)

### 5 Line Chart

In [111]:
# Phrasings for a single-series line-chart request. Each template carries the
# {x} and {y} placeholders and is filled with template.format(x=x, y=y).
line_chart_templates = [
    "Show line chart for {y} over {x}",
    "Draw a line graph with {x} and {y}",
    "Plot line chart showing {y} across {x}",
    "Visualize {y} against {x} as a line chart",
    "Line chart of {y} by {x}",
    "Line graph representing {y} by {x}",
    "Give line chart for {x} and {y}",
    "Line plot: {x} vs {y}",
    "Chart {y} across {x} with a line graph",
    "Draw a line showing {x} and corresponding {y}",
    "Make a line chart for {x} and {y}",
    "Create line chart to visualize {y} over {x}",
    "Plot the trend of {y} over {x}",
    "Show trend line for {y} by {x}",
    "Display a line graph of {y} by {x}",
    "Graph {y} as line against {x}",
    "Chart line for {y} with respect to {x}",
    "Line diagram showing {x} and {y}",
    "Plot {y} values along {x} using lines",
    "Generate line graph showing {y} across {x}",
    "I want line chart of {y} vs {x}",
    "Trend of {y} across {x} in a line plot",
    "Display {x} and {y} in a line chart",
    "Plotting {y} with respect to {x} as line",
    "Line plot of total {y} over {x}",
    "Give line visualization of {y} over {x}",
    "Plot {y} by each {x} in a line chart",
    "Line chart to track {y} over {x}",
    "Show time-series of {y} vs {x}",
    "Line graph tracking {x} and {y}"
]

# Accumulator and filter rate for the line-chart samples.
line_chart_samples = []

filter_probability = 0.4

# Instantiate every template 100 times with independently drawn fields.
for template in line_chart_templates:
    for _ in range(100):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        label = f"chart: line, x: {x}, y: {y}"

        # Optionally append a filter clause to both the prompt and the label.
        if random.random() < filter_probability:
            clause = random.choice(filter_phrases)
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        line_chart_samples.append({"input": prompt, "target": label})

df_line = pd.DataFrame(line_chart_samples)


In [112]:
# Append the line-chart samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_line a second time.
df = pd.concat([df, df_line], ignore_index=True)


In [113]:
df.shape

(15000, 2)

### 6 Multi-Line Chart

In [114]:
# Candidate field names for splitting a line chart into several series.
# Presumably the pool for the multi-line chart's "group:" slot (the list sits
# under the Multi-Line Chart heading) -- confirm against the generation code.
line_group_options = [
    "region", "product", "gender", "platform", "category", "campaign", "store",
    "device", "channel", "department", "program", "source", "status", "age group",
    "education level", "country", "browser", "brand", "type", "model", "project"
]


In [115]:
# Phrasings for a multi-line chart request. Each template is filled with
# template.format(x=x, y=y, group=group) and paired with a target that always
# contains "group: <group>", so every template must mention {group} as well as
# {x} and {y}; otherwise the label names a field absent from the input text.
multiline_templates = [
    "Show line chart for {y} over {x} grouped by {group}",
    "Plot {y} across {x} for each {group}",
    "Draw multiple lines of {y} by {x} split by {group}",
    "Compare {group} in a line chart over {x} showing {y}",
    "Create multi-line chart for {y} by {x} per {group}",
    "Visualize {y} over {x} for different {group}",
    "Line chart of {y} vs {x}, grouped by {group}",
    "Multi-line chart for {y} across {x} by {group}",
    "I want line graph showing {x} and {y} for {group}",
    "Compare {y} trends over {x} among {group}",
    "Graph showing {x} and {y} for each {group}",
    "Time-series of {y} by {x} and group by {group}",
    "Give me multi-line of {y} across {x} grouped with {group}",
    "Multiple lines of {y} per {group} along {x}",
    "Line plot of {y} for every {group} over {x}",
    "Trends of {y} per {group} over {x}",
    "Draw {y} on {x} with each line representing {group}",
    "Visual representation of {y} across {x} per {group}",
    "Line graph of {y} against {x} for {group}",
    "Group-based line chart of {y} over {x} by {group}",
    "Display {y} by {x}, one line per {group}",
    "Show {y} trend by {x} for all {group}",
    "Multi-line trend for {y} over {x} grouped on {group}",
    "Compare {y} by {x} split among {group}",
    "Chart: {y} vs {x} lines grouped by {group}",
    "Plot of {y} across {x} with categories of {group}",
    "Generate line plot showing {group} trends of {y} on {x}",
    "Show me different lines for {y} over {x} for {group}",
    "Line chart comparison of {group} on {x} for {y}",
    "Draw line graph {y} over {x} with {group} as line separator"
]
# Fresh accumulator and settings for the multi-line samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in multiline_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        # The series field comes from line_group_options, the pool defined for
        # this chart type, rather than the grouped-bar pool group_options.
        group = random.choice(line_group_options)

        input_text = template.format(x=x, y=y, group=group)
        target_text = f"chart: multi_line, x: {x}, y: {y}, group: {group}"

        # Add filter in ~40% cases, mirrored in both input and target.
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"{target_text}, filter: {filter_text}"

        # Noise corrupts only the input; the target stays clean.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append({"input": input_text, "target": target_text})


df_multiline = pd.DataFrame(samples)



In [116]:
# Append the multi-line samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_multiline a second time.
df = pd.concat([df, df_multiline], ignore_index=True)


In [117]:
df.shape

(18000, 2)

### 7 Step Line Chart

In [118]:

# Phrasings for a step-line chart request. Each template carries the {x} and
# {y} placeholders and is filled with template.format(x=x, y=y).
step_templates = [
    "Show step line chart of {y} over {x}",
    "Plot {y} using step chart across {x}",
    "Create a step plot for {x} and {y}",
    "Draw step-wise trend of {y} over {x}",
    "Step chart for {x} vs {y}",
    "Chart showing steps for {y} by {x}",
    "Use step line to show {y} across {x}",
    "Step graph for {y} with respect to {x}",
    "Graph of {x} and {y} in step format",
    "Step-based trend line for {y} on {x}",
    "Compare {y} changes over {x} using steps",
    "Stepwise progression of {y} over {x}",
    "Plot step function of {y} vs {x}",
    "How {y} progresses step-by-step by {x}",
    "Step diagram showing {x} and {y}",
    "Visualize {x} and {y} in step line chart",
    "Step line showing changes in {y} by {x}",
    "Draw steps for {y} values over {x}",
    "Show time-based step chart for {y} and {x}",
    "Step pattern for {y} across {x}",
    "Give a step chart: {x} on x-axis and {y} on y-axis",
    "Chart for {y} vs {x} in step format",
    "Step transitions of {y} across {x}",
    "Plot steps in {y} by varying {x}",
    "Time-wise step plot of {y} by {x}",
    "Step trend: {y} vs {x}",
    "Chart of {y} by {x} using step lines",
    "Step line display for {y} over {x}",
    "Draw timeline of {y} with steps over {x}",
    "Create step series for {y} and {x}"
]


# Fresh accumulator and settings for the step-line samples.
samples = []
samples_per_template = 100
filter_probability = 0.4
group_probability = 0.3

for template in step_templates:
    for _ in range(samples_per_template):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        # Target fields are collected in order and joined once at the end.
        fields = ["chart: step_line", f"x: {x}", f"y: {y}"]

        # Optional grouping: " by <group>" in the prompt, "group:" in the target.
        if random.random() < group_probability:
            group = random.choice(group_options)
            prompt = f"{prompt} by {group}"
            fields.append(f"group: {group}")

        # Optional filter clause, mirrored in both prompt and target.
        if random.random() < filter_probability:
            clause = random.choice(filter_phrases)
            prompt = f"{prompt} {clause}"
            fields.append(f"filter: {clause}")

        # Optionally corrupt the prompt only; the target stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": ", ".join(fields)})


df_step = pd.DataFrame(samples)



In [119]:
# Append the step-line samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_step a second time.
df = pd.concat([df, df_step], ignore_index=True)


In [120]:
df.shape

(21000, 2)

### 8 Area Chart

In [121]:

# Phrasings for an area-chart request. Each template carries the {x} and {y}
# placeholders and is filled with template.format(x=x, y=y).
area_templates = [
    "Show area chart of {y} over {x}",
    "Plot {y} as area grouped by {x}",
    "Create an area chart showing {x} and {y}",
    "Visualize {y} across {x} in an area chart",
    "Area plot for {y} vs {x}",
    "Area chart showing change in {y} by {x}",
    "Give area visualization of {y} across {x}",
    "Display {x} with area for {y}",
    "Draw area chart using {x} and {y}",
    "Chart of {y} over {x} with area filled"
]


# Fresh accumulator and settings for the area-chart samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in area_templates:
    for _ in range(samples_per_template):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        label = f"chart: area, x: {x}, y: {y}"

        # Draw the filter decision first, then the clause only when needed.
        clause = random.choice(filter_phrases) if random.random() < filter_probability else None
        if clause is not None:
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})


df_area = pd.DataFrame(samples)



In [122]:
# Append the area-chart samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_area a second time.
df = pd.concat([df, df_area], ignore_index=True)

In [123]:
df.shape

(22000, 2)

### 9 Stacked Area Chart

In [124]:

# Phrasings for a stacked-area request. Each template carries the {x}, {y} and
# {stack} placeholders and is filled with template.format(x=x, y=y, stack=stack).
stacked_area_templates = [
    "Show stacked area chart of {y} over {x} stacked by {stack}",
    "Create stacked area plot using {x}, {y}, and {stack}",
    "Draw an area chart with {y} over {x}, split by {stack}",
    "Plot {y} by {x} using {stack} as stack field",
    "Visualize stacked area chart: x={x}, y={y}, stack={stack}",
    "Stacked area for {y} vs {x} grouped by {stack}",
    "Chart {y} over {x} by stacking {stack}",
    "Display {stack} split in area chart of {y} by {x}",
    "Area chart using {x}, with {y} stacked per {stack}",
    "Breakdown {y} across {x} using stacked area chart with {stack}"
]


# Fresh accumulator and settings for the stacked-area samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in stacked_area_templates:
    for _ in range(samples_per_template):
        x, y = random.choice(x_options), random.choice(y_options)
        stack = random.choice(stack_options)

        prompt = template.format(x=x, y=y, stack=stack)
        label = f"chart: stacked_area, x: {x}, y: {y}, stack: {stack}"

        # Optionally append a filter clause to both the prompt and the label.
        if random.random() < filter_probability:
            clause = random.choice(filter_phrases)
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})


df_stacked_area = pd.DataFrame(samples)


In [125]:
# Append the stacked-area samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_stacked_area a second time.
df = pd.concat([df, df_stacked_area], ignore_index=True)

In [126]:
df.shape

(23000, 2)

### 10 Histogram

In [127]:

# Phrasings for a histogram request. A histogram has a single field, so each
# template carries only the {y} placeholder and is filled with template.format(y=y).
histogram_templates = [
    "Show histogram of {y}",
    "Plot histogram for {y}",
    "Create histogram of {y} values",
    "Visualize the distribution of {y}",
    "Give histogram showing distribution of {y}",
    "Draw a histogram chart for {y}",
    "Histogram of {y}",
    "Chart the frequency distribution of {y}",
    "Bar-style histogram for {y}",
    "How is {y} distributed?",
    "Show how {y} is spread in histogram",
    "Frequency histogram of {y}",
    "Histogram displaying {y}",
    "Distribution plot of {y}",
    "Show bars for distribution of {y}"
]

# Fresh accumulator and settings for the histogram samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in histogram_templates:
    for _ in range(samples_per_template):
        # Only a measure is drawn; the histogram spec has no x field.
        y = random.choice(y_options)
        prompt = template.format(y=y)
        label = f"chart: histogram, y: {y}"

        # Draw the filter decision first, then the clause only when needed.
        clause = random.choice(filter_phrases) if random.random() < filter_probability else None
        if clause is not None:
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})


df_histogram = pd.DataFrame(samples)


In [128]:
# Append the histogram samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_histogram a second time.
df = pd.concat([df, df_histogram], ignore_index=True)

In [129]:
df.shape

(24500, 2)

### 11 Box Plot

In [130]:


# Phrasings for a box-plot request. Each template carries the {x} and {y}
# placeholders and is filled with template.format(x=x, y=y).
box_templates = [
    "Show box plot of {y} by {x}",
    "Create boxplot of {y} across {x}",
    "Visualize distribution of {y} for each {x}",
    "Draw a box chart for {x} with respect to {y}",
    "Box plot comparing {y} for different {x}",
    "Give a boxplot showing {y} grouped by {x}",
    "Plot {y} grouped by {x} as box plot",
    "Boxplot for {x} and {y}",
    "How does {y} vary with {x} (box plot)?",
    "Display {y} distribution by {x} in box form",
    "Box chart: {x} vs {y}",
    "Box plot of {x} vs {y}",
    "boxplot of {x} and {y}",
    "Chart boxplot {y} grouped by {x}",
    "Plot box style chart for {y} by {x}"
]


# Fresh accumulator and settings for the box-plot samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in box_templates:
    for _ in range(samples_per_template):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        label = f"chart: box, x: {x}, y: {y}"

        # Optionally append a filter clause to both the prompt and the label.
        if random.random() < filter_probability:
            clause = random.choice(filter_phrases)
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})


df_box = pd.DataFrame(samples)


In [131]:
# Append the box-plot samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_box a second time.
df = pd.concat([df, df_box], ignore_index=True)

In [132]:
df.shape

(26000, 2)

### 12 Violin Plot

In [133]:

# Phrasings for a violin-plot request. Each template is filled with
# template.format(x=x, y=y) and paired with the target
# "chart: violin, x: <x>, y: <y>", so every template must contain both {x} and
# {y}; otherwise the label names a field absent from the input text.
violin_templates = [
    "Show violin plot of {y} by {x}",
    "Plot a violin chart for {x} and {y}",
    "Create violin plot grouped by {x} for {y}",
    "Visualize {y} distribution by {x} using a violin plot",
    "Draw a violin graph showing {y} across {x}",
    "Violin plot of {y} over {x}",
    "Chart {y} across different {x} using violin",
    "Display {x} vs {y} in violin plot",
    "Give violin distribution of {y} by {x}",
    "Violin plot: {x} categories and {y} spread",
    "Show {x}-wise {y} using violin chart",
    "Violin style plot of {x} against {y}",
    "Box and density plot (violin) of {y} across {x}",
    "Compare {x} on {y} using violin plots",
    "Draw violin shape distribution of {y} over {x}"
]

# Fresh accumulator and settings for the violin-plot samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in violin_templates:
    for _ in range(samples_per_template):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        label = f"chart: violin, x: {x}, y: {y}"

        # Draw the filter decision first, then the clause only when needed.
        clause = random.choice(filter_phrases) if random.random() < filter_probability else None
        if clause is not None:
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})

df_violin = pd.DataFrame(samples)


In [134]:
# Append the violin-plot samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_violin a second time.
df = pd.concat([df, df_violin], ignore_index=True)

In [135]:
df.shape

(27500, 2)

### 13 Pie Chart

In [136]:

# Phrasings for a pie-chart request. Each template carries the {x} and {y}
# placeholders and is filled with template.format(x=x, y=y).
pie_templates = [
    "Show pie chart of {y} by {x}",
    "Create a pie chart for {x} with {y} values",
    "Pie chart showing {x} and their share of {y}",
    "Visualize {y} distribution across {x} using pie chart",
    "Draw pie chart of {x} based on {y}",
    "Make a pie of {x} using {y}",
    "Pie graph for {x} grouped by {y}",
    "I want a pie chart for {y} and {x}",
    "Give a pie showing proportion of {y} for each {x}",
    "Display pie chart of {x} with {y}",
    "Pie chart of total {y} by {x}",
    "Chart type pie, x = {x}, y = {y}",
    "Pie visualization for {x} and {y}",
    "Represent {y} split by {x} as pie",
    "Use pie chart to show {y} across {x}"
]

# Fresh accumulator and settings for the pie-chart samples.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in pie_templates:
    for _ in range(samples_per_template):
        x, y = random.choice(x_options), random.choice(y_options)
        prompt = template.format(x=x, y=y)
        label = f"chart: pie, x: {x}, y: {y}"

        # Optionally append a filter clause to both the prompt and the label.
        if random.random() < filter_probability:
            clause = random.choice(filter_phrases)
            prompt = f"{prompt} {clause}"
            label = f"{label}, filter: {clause}"

        # Optionally corrupt the prompt only; the label stays clean.
        if random.random() < noise_probability:
            prompt = introduce_noise(prompt)

        samples.append({"input": prompt, "target": label})


df_pie = pd.DataFrame(samples)


In [137]:
# Append the pie-chart samples to the running dataset with a fresh 0..n-1 index.
# NOTE: not idempotent -- re-running this cell appends df_pie a second time.
df = pd.concat([df, df_pie], ignore_index=True)

In [138]:
df.shape

(29000, 2)

### 14 Donut Chart

In [139]:

# Phrasings for a donut-chart request. Each template carries the {x} and {y}
# placeholders and is filled with template.format(x=x, y=y).
donut_templates = [
    "Show donut chart of {y} by {x}",
    "Donut chart showing {x} with values of {y}",
    "Visualize {y} for each {x} in a donut chart",
    "Create donut chart using {x} and {y}",
    "Donut graph showing proportion of {y} per {x}",
    "Display donut chart for {y} grouped by {x}",
    "Donut representation of {x} based on {y}",
    "Draw a donut with {x} and {y}",
    "Plot donut chart using {x} and their {y}",
    "Generate donut chart with {x} vs {y}",
    "I want donut chart of {y} by {x}",
    "Donut chart with x = {x}, y = {y}",
    "Make donut chart for showing {x} and {y}",
    "How {y} is split by {x} using donut",
    "Chart {x} vs {y} in donut format"
]


# Fresh accumulator and settings for the donut-chart samples. The settings are
# stated here, as in the sibling chart cells, so this cell does not depend on
# values left behind by whichever cell happened to run before it.
samples = []
samples_per_template = 100
filter_probability = 0.4

for template in donut_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        input_text = template.format(x=x, y=y)
        target_text = f"chart: donut, x: {x}, y: {y}"

        # A filter clause is added to the input and mirrored in the target.
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"{target_text}, filter: {filter_text}"

        # Noise corrupts only the input; the target stays clean.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append({"input": input_text, "target": target_text})


df_donut = pd.DataFrame(samples)



In [140]:
# Append the donut samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_donut], ignore_index=True)

In [141]:
df.shape

(30500, 2)

### 15 Treemap

In [142]:

# Prompt phrasings for treemaps; {x} is the category field, {y} the measure.
treemap_templates = [
    "Show treemap of {y} by {x}",
    "Create a treemap for {x} and {y}",
    "Treemap showing {x} based on {y}",
    "Visualize {y} across {x} using treemap",
    "Plot treemap with {x} as category and {y} as value",
    "Generate treemap of {x} and {y}",
    "Display treemap where {x} is categorized by {y}",
    "Draw treemap of {x} vs {y}",
    "Treemap visualization for {x} and their {y}",
    "Build treemap chart using {x} for {y}",
    "Chart {x} by {y} in a treemap",
    "How does {y} spread over {x} in treemap",
    "Treemap using {x} and corresponding {y}",
    "Treemap view of {x} by {y}",
    "Treemap: visualize {y} grouped by {x}"
]

samples = []

for template in treemap_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        input_text = template.format(x=x, y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: treemap, x: {x}, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: treemap, x: {x}, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_treemap = pd.DataFrame(samples)


In [143]:
# Append the treemap samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_treemap], ignore_index=True)

In [144]:
df.shape

(32000, 2)

### 16 Waterfall Chart

In [145]:

# Prompt phrasings for waterfall charts; {x} is the step field, {y} the measure.
waterfall_templates = [
    "Show waterfall chart for {y} by {x}",
    "Create a waterfall chart of {y} for each {x}",
    "Waterfall graph showing {y} over {x}",
    "Visualize {y} changes across {x} using waterfall",
    "Plot waterfall with {x} as steps and {y} as values",
    "How does {y} change over {x} in waterfall style?",
    "Build a waterfall breakdown of {y} by {x}",
    "Waterfall view for {y} with step {x}",
    "Generate waterfall chart using {x} and {y}",
    "Draw {x} vs {y} in waterfall chart",
    "Compare {y} through {x} stages using waterfall",
    "I want a waterfall of {y} for different {x}",
    "Give waterfall chart of {y} grouped by {x}",
    "Waterfall format for {y} divided by {x}",
    "Chart {y} by {x} in waterfall format"
]

samples = []

for template in waterfall_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        input_text = template.format(x=x, y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: waterfall, x: {x}, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: waterfall, x: {x}, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_waterfall = pd.DataFrame(samples)


In [146]:
# Append the waterfall samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_waterfall], ignore_index=True)

In [147]:
df.shape

(33500, 2)

### 17 Funnel Chart

In [148]:

# Prompt phrasings for funnel charts; {x} is the stage field, {y} the measure.
funnel_templates = [
    "Show funnel chart for {y} by {x}",
    "Visualize {y} across {x} in funnel format",
    "Funnel view showing {x} stages with {y}",
    "Plot funnel chart of {x} vs {y}",
    "Create funnel chart using {x} and {y}",
    "Funnel of {y} distributed by {x}",
    "Give a funnel representation of {y} by {x}",
    "Chart {y} by {x} in funnel style",
    "Draw a funnel for {x} showing {y}",
    "Funnel visualization for {y} through {x}",
    "Compare {y} across {x} steps in a funnel",
    "Funnel for {x} showing drop in {y}",
    "Display funnel chart with {y} along {x}",
    "I want a funnel chart for {y} with {x}",
    "Breakdown of {y} through {x} in funnel view"
]

samples = []

for template in funnel_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        input_text = template.format(x=x, y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: funnel, x: {x}, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: funnel, x: {x}, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_funnel = pd.DataFrame(samples)



In [149]:
# Append the funnel samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_funnel], ignore_index=True)

In [150]:
df.shape

(35000, 2)

### 18 Gauge Chart

In [151]:

# Measures sampled for gauge prompts (a gauge takes a single {y}, no {x}).
gauge_y_options = [
    "satisfaction", "efficiency", "accuracy", "success rate", "score", "rating",
    "completion rate", "performance", "utilization", "conversion rate",
    "customer satisfaction", "coverage", "growth", "availability", "usage rate",
    "progress", "load", "response rate", "reach", "impact"
]

# Prompt phrasings for gauge / dial charts.
gauge_templates = [
    "Show gauge chart for {y}",
    "Gauge visualization of {y}",
    "Display {y} on a gauge",
    "Visualize {y} with a gauge chart",
    "Plot {y} in dial chart",
    "Represent {y} using gauge",
    "Give gauge representation of {y}",
    "Draw a gauge showing {y}",
    "Gauge format chart for {y}",
    "I want a gauge chart for {y}",
    "Chart {y} using a dial",
    "KPI gauge for {y}",
    "Indicator dial chart for {y}",
    "Use gauge to represent {y}",
    "Dial style visualization of {y}"
]

samples = []

for template in gauge_templates:
    for _ in range(samples_per_template):
        y = random.choice(gauge_y_options)
        input_text = template.format(y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: gauge, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: gauge, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_gauge = pd.DataFrame(samples)


In [152]:
# Append the gauge samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_gauge], ignore_index=True)

In [153]:
df.shape

(36500, 2)

### 19 KPI Card / Metric

In [154]:

# Measures sampled for KPI-card prompts (a KPI card takes a single {y}, no {x}).
kpi_y_options = [
    "revenue", "sales", "profit", "budget", "expenses", "score", "rating",
    "satisfaction", "efficiency", "accuracy", "conversion rate", "performance",
    "success rate", "completion rate", "usage", "frequency", "views", "likes",
    "clicks", "downloads", "loss", "growth", "reach", "impact", "turnover"
]

# Prompt phrasings for KPI / metric cards.
kpi_templates = [
    "Show KPI card for {y}",
    "Display key metric of {y}",
    "Give a KPI card showing {y}",
    "Metric card for {y}",
    "I want a KPI for {y}",
    "Visualize {y} in metric format",
    "KPI card to show {y}",
    "Card view of {y}",
    "Highlight {y} in a KPI",
    "Show value of {y} in KPI format",
    "Indicator card for {y}",
    "Display current {y}",
    "Current value of {y} as card",
    "Simple KPI showing {y}",
    "What is the KPI for {y}?"
]

samples = []

for template in kpi_templates:
    for _ in range(samples_per_template):
        y = random.choice(kpi_y_options)
        input_text = template.format(y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: kpi, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: kpi, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_kpi = pd.DataFrame(samples)


In [155]:
# Append the KPI samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_kpi], ignore_index=True)

In [156]:
df.shape

(38000, 2)

### 20 Combo Chart (Bar + Line on Dual Axis)

In [157]:


# Prompt phrasings for bar+line combo charts; {y1} and {y2} are the two measures.
combo_templates = [
    "Show combo chart of {y1} and {y2} by {x}",
    "Draw bar and line chart of {y1} and {y2} grouped by {x}",
    "Combo chart: {y1} (bar) and {y2} (line) across {x}",
    "Visualize {y1} and {y2} over {x} using bar and line",
    "Create a combo bar-line chart for {y1} and {y2} vs {x}",
    "Plot {y1} and {y2} using bars and lines grouped by {x}",
    "Mixed chart showing {x} vs {y1} and {y2}",
    "Bar chart for {y1} with line for {y2} across {x}",
    "Make a dual-axis chart of {y1} and {y2} by {x}",
    "Chart: {y1} and {y2} on same x-axis ({x})"
]

samples = []

for template in combo_templates:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        # random.sample draws without replacement, so y1 and y2 come from different list positions.
        y1, y2 = random.sample(y_options, 2)
        input_text = template.format(x=x, y1=y1, y2=y2)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: combo, x: {x}, y1: {y1}, y2: {y2}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: combo, x: {x}, y1: {y1}, y2: {y2}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_combo = pd.DataFrame(samples)


In [158]:
# Append the combo samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_combo], ignore_index=True)

In [159]:
df.shape

(39000, 2)

### 21 Scatter Plot

In [160]:

# Field names from which both scatter axes are drawn.
numerical_y = [
    "sales", "revenue", "profit", "amount", "budget", "expenses", "score", "rating",
    "performance", "satisfaction", "accuracy", "growth", "downloads", "clicks", "views",
    "loss", "usage", "income", "conversion rate", "completion rate", "temperature",
    "pollution", "delay", "speed", "balance", "cases", "incidents", "response time"
]

# Prompt phrasings for scatter plots.
scatter_templates = [
    "Show scatter plot of {x} vs {y}",
    "Draw scatter chart with {x} and {y}",
    "Scatter plot of {y} against {x}",
    "Visualize {x} versus {y} in scatter",
    "Make a scatter graph comparing {x} and {y}",
    "Chart {y} vs {x} as a scatter plot",
    "Plot {x} on x-axis and {y} on y-axis in scatter",
    "I want a scatter plot of {x} and {y}",
    "Graph showing scatter between {x} and {y}",
    "Scatter diagram for {x} and {y}"
]

samples = []

for template in scatter_templates:
    for _ in range(samples_per_template):
        # Sampling without replacement keeps the two axes distinct.
        x, y = random.sample(numerical_y, 2)
        input_text = template.format(x=x, y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: scatter, x: {x}, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: scatter, x: {x}, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_scatter = pd.DataFrame(samples)



In [161]:
# Append the scatter samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_scatter], ignore_index=True)

In [162]:
df.shape

(40000, 2)

### 22 Bubble Chart

In [163]:

# Field names from which x, y and bubble size are drawn.
numerical_cols = [
    "sales", "revenue", "profit", "amount", "budget", "expenses", "score", "rating",
    "performance", "satisfaction", "accuracy", "growth", "downloads", "clicks", "views",
    "loss", "usage", "income", "conversion rate", "completion rate", "temperature",
    "pollution", "delay", "speed", "balance", "cases", "incidents", "response time"
]

# Prompt phrasings for bubble charts; {size} is the field driving bubble size.
bubble_templates = [
    "Show bubble chart of {x} vs {y}, size by {size}",
    "Bubble chart with {x} and {y}, bubble size from {size}",
    "Draw a bubble plot for {x} against {y}, scaled by {size}",
    "Visualize {x} and {y} with bubble sizes representing {size}",
    "Make chart for {x} vs {y}, bubble size is {size}",
    "Plot bubble chart: {x}, {y}, size = {size}",
    "Graph with {x} on x-axis, {y} on y-axis, and {size} as size",
    "Compare {x} to {y} with {size} as bubble size",
    "Bubble plot where {size} shows size and {x} vs {y}",
    "Bubble graph for {x} against {y} with size by {size}"
]

samples = []

for template in bubble_templates:
    for _ in range(samples_per_template):
        # Sampling without replacement gives three distinct fields.
        x, y, size = random.sample(numerical_cols, 3)
        input_text = template.format(x=x, y=y, size=size)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: bubble, x: {x}, y: {y}, size: {size}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: bubble, x: {x}, y: {y}, size: {size}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_bubble = pd.DataFrame(samples)


In [164]:
# Append the bubble samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_bubble], ignore_index=True)

In [165]:
df.shape

(41000, 2)

### 23 Correlation Heatmap

In [166]:

# Prompt phrasings for correlation heatmaps; these carry no placeholders.
heatmap_templates = [
    "Show a correlation heatmap",
    "Plot a heatmap of variable relationships",
    "Visualize relationships among variables in heatmap",
    "Generate heatmap of correlations in data",
    "Display correlation matrix in heatmap form",
    "Correlation heatmap of numerical features",
    "Matrix chart showing variable correlations",
    "Show correlation between all numeric fields",
    "Give a heatmap of all variable interdependence",
    "Create a variable relationship matrix",
    "Display data relationships as a heatmap",
    "Show variable correlation using heatmap chart",
    "Make correlation heatmap",
    "Chart to visualize correlation between all features",
    "Draw correlation map using heatmap"
]

samples = []

for template in heatmap_templates:
    for _ in range(samples_per_template):
        # No fields to fill in: variation comes only from the filter and noise steps.
        input_text = template

        target_text = "chart: correlation_heatmap"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: correlation_heatmap, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_heatmap = pd.DataFrame(samples)


In [167]:
# Append the heatmap samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_heatmap], ignore_index=True)

In [168]:
df.shape

(42500, 2)

### 24 Geo Scatter Map

In [169]:

# Location fields sampled as {x} for geo scatter prompts.
geo_x_options = [
    "country", "state", "region", "city", "zone", "district", "province", "continent", "area"
]

# Measures sampled as {y} for geo scatter prompts.
geo_y_options = [
    "sales", "revenue", "cases", "amount", "budget", "views", "clicks", "population",
    "alerts", "downloads", "losses", "expenses", "income", "frequency"
]

# Prompt phrasings for geo scatter maps.
geo_templates = [
    "Show geo scatter map of {y} by {x}",
    "Map showing {y} across {x}",
    "Plot {y} for each {x} on map",
    "Geo scatter: {y} vs {x}",
    "Scatter map visualizing {y} over {x}",
    "Visualize map of {x} with value {y}",
    "Display {y} geographically by {x}",
    "Geographic map of {x} showing {y}",
    "Plot {y} distribution across {x} on map",
    "Create map view of {y} by {x}",
    "Location map for {y} by {x}",
    "Map: {x} with bubble size as {y}",
    "Geo scatter map representing {y} by {x}",
    "World map showing {y} by {x}",
    "Scatter on map showing {x} and {y}"
]

samples = []

for template in geo_templates:
    for _ in range(samples_per_template):
        x = random.choice(geo_x_options)
        y = random.choice(geo_y_options)
        input_text = template.format(x=x, y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: geo_scatter, x: {x}, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: geo_scatter, x: {x}, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_geo = pd.DataFrame(samples)



In [170]:
# Append the geo scatter samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_geo], ignore_index=True)

In [171]:
df.shape

(44000, 2)

### 25 Choropleth Map

In [172]:

# Region fields sampled as {x} for choropleth prompts.
choropleth_x_options = [
    "country", "state", "region", "district", "zone", "province", "area", "continent"
]

# Measures sampled as {y} for choropleth prompts.
choropleth_y_options = [
    "population", "sales", "revenue", "cases", "budget", "income", "amount", "losses",
    "downloads", "views", "frequency", "turnover", "alerts", "expenses"
]

# Prompt phrasings for choropleth maps.
choropleth_templates = [
    "Show choropleth map of {y} by {x}",
    "Map showing {y} across {x} (choropleth)",
    "Choropleth of {x} colored by {y}",
    "Display choropleth showing {y} for each {x}",
    "Map of {x} where color represents {y}",
    "Visualize {y} in {x} using choropleth",
    "Color regions of {x} based on {y}",
    "Plot choropleth for {x} and {y}",
    "Heatmap on map of {x} using {y}",
    "Map: {x} shaded by {y}",
    "Region-wise map of {y} by {x}",
    "Show region color map for {x} based on {y}",
    "Map displaying {y} intensity over {x}",
    "Draw a choropleth with {x} as regions and {y} as values",
    "Create choropleth of {x} with values from {y}"
]

samples = []

for template in choropleth_templates:
    for _ in range(samples_per_template):
        x = random.choice(choropleth_x_options)
        y = random.choice(choropleth_y_options)
        input_text = template.format(x=x, y=y)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: choropleth, x: {x}, y: {y}"
        if random.random() < filter_probability:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: choropleth, x: {x}, y: {y}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_choropleth = pd.DataFrame(samples)


In [173]:
# Append the choropleth samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_choropleth], ignore_index=True)

In [174]:
df.shape

(45500, 2)

## Aggregation Functions

### bar chart

In [175]:

# Aggregation words inserted into prompts and copied verbatim into the `agg` field.
aggregation_options = ["sum", "total", "average", "mean", "maximum", "minimum", "count"]

# Bar-chart phrasings that name the aggregation explicitly.
bar_templates_with_agg = [
    "Show bar chart of {agg} {y} by {x}",
    "Plot {agg} of {y} grouped by {x}",
    "Bar graph of {x} vs {agg} {y}",
    "Visualize {agg} {y} across {x}",
    "Bar chart showing {agg} {y} for each {x}",
    "Display bar of {y} by {x} with {agg}",
    "Bar chart grouped by {x} showing {agg} {y}"
]

samples_per_template = 100
samples = []

for template in bar_templates_with_agg:
    for _ in range(samples_per_template):
        x = random.choice(x_options)
        y = random.choice(y_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target_text = f"chart: bar, x: {x}, y: {y}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target_text = f"chart: bar, x: {x}, y: {y}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append(dict(input=input_text, target=target_text))

df_ag_bar = pd.DataFrame(samples)



In [176]:
# Append the aggregated bar samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_ag_bar], ignore_index=True)

In [177]:
df.shape

(46200, 2)

### grouped bar chart

In [178]:
# Grouped-bar phrasings: {x} is the axis category, {group} the series split.
grouped_bar_templates = [
    "Show grouped bar chart of {agg} {y} by {x} grouped by {group}",
    "Compare {agg} of {y} for each {x} and group by {group}",
    "Grouped bar chart: {agg} {y} over {x}, grouped by {group}",
    "Bar chart comparing {group} with {x} on {agg} {y}",
    "Plot {agg} {y} across {x} and separate by {group}",
]
# Fields usable as the grouping dimension; later cells read this list too.
group_options = ["region", "department", "gender", "platform", "campaign", "status", "program"]
grouped_samples = []

for template in grouped_bar_templates:
    for _ in range(100):  # 100 fills per template
        x = random.choice(x_options)
        y = random.choice(y_options)
        group = random.choice(group_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, group=group, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target = f"chart: grouped_bar, x: {x}, y: {y}, group: {group}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: grouped_bar, x: {x}, y: {y}, group: {group}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        grouped_samples.append(dict(input=input_text, target=target))

      


### stacked bar chart

In [179]:
# Stacked-bar phrasings: {x} is the axis category, {stack} the stacking dimension.
stacked_bar_templates = [
    "Show stacked bar chart for {agg} {y} by {x} stacked by {stack}",
    "Create a stacked bar with {x} and {agg} {y}, split by {stack}",
    "Stacked bar showing {agg} {y} across {x} and {stack}",
    "Display stacked bars of {agg} {y} per {x}, grouped by {stack}",
    "Plot {agg} of {y} by {x} stacked with {stack}"
]
# Alias, not a copy: stack_options is the same list object as group_options.
stack_options = group_options
stacked_samples = []

for template in stacked_bar_templates:
    for _ in range(100):  # 100 fills per template
        x = random.choice(x_options)
        y = random.choice(y_options)
        stack = random.choice(stack_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, stack=stack, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target = f"chart: stacked_bar, x: {x}, y: {y}, stack: {stack}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: stacked_bar, x: {x}, y: {y}, stack: {stack}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        stacked_samples.append(dict(input=input_text, target=target))


### Horizontal bar chart

In [180]:
# Horizontal-bar phrasings that name the aggregation explicitly.
horizontal_bar_templates = [
    "Show horizontal bar chart for {agg} {y} by {x}",
    "Horizontal bar of {agg} {y} grouped over {x}",
    "Plot {agg} of {y} using horizontal bars by {x}",
    "Display horizontal chart of {y} (aggregated as {agg}) by {x}",
    "Horizontal bar: compare {x} with {agg} of {y}"
]
horizontal_samples = []

for template in horizontal_bar_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target = f"chart: horizontal_bar, x: {x}, y: {y}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: horizontal_bar, x: {x}, y: {y}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        horizontal_samples.append(dict(input=input_text, target=target))


In [181]:
# Merge the grouped, stacked and horizontal bar samples (in that order) into one frame.
all_samples = [*grouped_samples, *stacked_samples, *horizontal_samples]

df_more_bars = pd.DataFrame(all_samples)


In [182]:
# Append the grouped/stacked/horizontal bar samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_more_bars], ignore_index=True)

In [183]:
df.shape

(47700, 2)

### Line & Area Charts

Line Chart

Multi-Line Chart

Step Line Chart

Area Chart

Stacked Area Chart

In [184]:
# Line-chart phrasings that name the aggregation explicitly.
line_templates = [
    "Show line chart of {agg} {y} over {x}",
    "Line graph for {agg} {y} by {x}",
    "Plot {agg} {y} along {x} using lines",
    "Draw a line chart showing {x} vs {agg} {y}",
    "Visualize {agg} {y} over time by {x}",
]

line_samples = []

for template in line_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target = f"chart: line, x: {x}, y: {y}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: line, x: {x}, y: {y}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        line_samples.append(dict(input=input_text, target=target))


In [185]:
# Multi-line phrasings: {x} is the axis field, {group} selects one line per value.
# Every template must mention {agg}: the target always carries an `agg` field, and a
# label the prompt never states cannot be learned from the input.
multi_line_templates = [
    "Multi-line chart of {agg} {y} by {x}, grouped by {group}",
    "Show lines for each {group} with {agg} {y} over {x}",
    "Plot multiple lines of {agg} {y} for each {group} along {x}",
    "Compare {agg} {y} by {group} across {x}",
    "Grouped line chart: {x}, {agg} {y} and {group}"
]

# Guard against a future template that drops the aggregation placeholder.
assert all("{agg}" in t for t in multi_line_templates), "every multi-line template must contain {agg}"

multi_line_samples = []

for template in multi_line_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        group = random.choice(group_options)
        agg = random.choice(aggregation_options)

        input_text = template.format(x=x, y=y, group=group, agg=agg)

        # A filter phrase is appended to 40% of prompts and echoed in the target.
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: multi_line, x: {x}, y: {y}, group: {group}, agg: {agg}, filter: {filter_text}"
        else:
            target = f"chart: multi_line, x: {x}, y: {y}, group: {group}, agg: {agg}"

        # Only the prompt is passed through introduce_noise; the target is left as is.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        multi_line_samples.append({"input": input_text, "target": target})


In [186]:
# Step-line phrasings that name the aggregation explicitly.
step_templates = [
    "Draw step line chart for {agg} {y} over {x}",
    "Step plot for {agg} {y} vs {x}",
    "Create a step chart showing {x} and {agg} {y}",
    "Step line showing change in {agg} {y} across {x}",
    "Plot {agg} of {y} by {x} with steps"
]

step_samples = []

for template in step_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target = f"chart: step_line, x: {x}, y: {y}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: step_line, x: {x}, y: {y}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        step_samples.append(dict(input=input_text, target=target))


In [187]:
# Area-chart phrasings that name the aggregation explicitly.
area_templates = [
    "Area chart of {agg} {y} by {x}",
    "Show area graph for {agg} {y} over {x}",
    "Visualize {agg} of {y} using area chart along {x}",
    "Create area chart with {x} and {agg} {y}",
    "Draw {x} vs {agg} {y} in area format"
]

area_samples = []

for template in area_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        agg = random.choice(aggregation_options)
        input_text = template.format(x=x, y=y, agg=agg)

        # Unfiltered label first; replaced below when a filter phrase is attached.
        target = f"chart: area, x: {x}, y: {y}, agg: {agg}"
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: area, x: {x}, y: {y}, agg: {agg}, filter: {filter_text}"

        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        area_samples.append(dict(input=input_text, target=target))


In [188]:
# Stacked-area phrasings: {x} is the axis field, {group} the stacking dimension.
# Every template must mention {agg}: the target always carries an `agg` field, and a
# label the prompt never states cannot be learned from the input.
stacked_area_templates = [
    "Stacked area chart for {agg} {y} by {x} and {group}",
    "Create a stacked area graph using {x}, {agg} {y}, and stack by {group}",
    "Show stacked areas of {agg} {y} over {x}, split by {group}",
    "Area chart with stacking by {group} showing {agg} {y} per {x}",
    "Plot {agg} of {y} across {x} stacked by {group}"
]

# Guard against a future template that drops the aggregation placeholder.
assert all("{agg}" in t for t in stacked_area_templates), "every stacked-area template must contain {agg}"

stacked_area_samples = []

for template in stacked_area_templates:
    for _ in range(100):
        x = random.choice(x_options)
        y = random.choice(y_options)
        group = random.choice(group_options)
        agg = random.choice(aggregation_options)

        input_text = template.format(x=x, y=y, group=group, agg=agg)

        # A filter phrase is appended to 40% of prompts and echoed in the target.
        if random.random() < 0.4:
            filter_text = random.choice(filter_phrases)
            input_text = f"{input_text} {filter_text}"
            target = f"chart: stacked_area, x: {x}, y: {y}, group: {group}, agg: {agg}, filter: {filter_text}"
        else:
            target = f"chart: stacked_area, x: {x}, y: {y}, group: {group}, agg: {agg}"

        # Only the prompt is passed through introduce_noise; the target is left as is.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        stacked_area_samples.append({"input": input_text, "target": target})


In [189]:
# Merge the five line/area sample lists, keeping the order in which they were built.
all_line_area_samples = [
    *line_samples,
    *multi_line_samples,
    *step_samples,
    *area_samples,
    *stacked_area_samples,
]

df_line_area = pd.DataFrame(all_line_area_samples)


In [190]:
# Append the line/area samples to the cumulative dataset (not idempotent: re-running appends them again).
df = pd.concat([df, df_line_area], ignore_index=True)

In [191]:
df.shape

(50200, 2)

### Business & KPI Charts

Treemap

Waterfall Chart

Funnel Chart

Gauge Chart

KPI Card / Metric

Combo Chart (Bar + Line on Dual Axis)

In [192]:

# Aggregation-aware samples for the business / KPI chart family.
# The sampled aggregation word is placed directly in front of each measure in the
# prompt, so the `agg` field of the target can always be derived from the input.
business_charts = ["treemap", "waterfall", "funnel", "gauge", "kpi", "combo"]

agg_options = ["sum", "average", "mean", "max", "min", "count"]

# Measures used for the line series (y2) of combo charts.
combo_line_options = [
    "growth", "conversion", "success rate", "clicks"
]


treemap_templates = [
    "Treemap of {y} by {x}",
    "Show treemap for {x} grouped by {y}",
    "Create treemap with {x} and {y}",
]

waterfall_templates = [
    "Waterfall chart showing {y} by {x}",
    "Display a waterfall chart for {x} changes in {y}",
    "Plot waterfall chart: {x} and {y}"
]

funnel_templates = [
    "Funnel chart showing {y} for each {x}",
    "Create funnel chart of {x} stages with {y}",
    "Show conversion funnel: {x} with values {y}"
]

gauge_templates = [
    "Gauge chart showing {y}",
    "Visualize {y} as a gauge",
    "Gauge for tracking {y}"
]

kpi_templates = [
    # No aggregation word is hard-coded here: the sampled one is injected in front
    # of {y}, and a fixed "total" would contradict e.g. "average".
    "Show overall {y}",
    "Display KPI card for {y}",
    "Metric card showing {y}"
]

combo_templates = [
    "Combo chart of {y1} and {y2} over {x}",
    "Bar and line chart: {y1} as bar and {y2} as line grouped by {x}",
    "Display {y1} and {y2} using combo chart by {x}"
]

# Pair every template with the chart name that goes into its target.
chart_templates = []

chart_templates += [("treemap", t) for t in treemap_templates]
chart_templates += [("waterfall", t) for t in waterfall_templates]
chart_templates += [("funnel", t) for t in funnel_templates]
chart_templates += [("gauge", t) for t in gauge_templates]
chart_templates += [("kpi", t) for t in kpi_templates]
chart_templates += [("combo", t) for t in combo_templates]


samples = []

for chart, template in chart_templates:
    for _ in range(100):  # 100 samples per template
        x = random.choice(x_options)
        y = random.choice(y_options)
        agg = random.choice(agg_options)

        if chart in ("gauge", "kpi"):
            # Single-measure charts: no x field in prompt or target.
            input_text = template.format(y=f"{agg} {y}")
            target_text = f"chart: {chart}, y: {y}, agg: {agg}"

        elif chart == "combo":
            y1 = random.choice(y_options)
            # Exclude y1 so the bar and line series are never the same measure.
            y2 = random.choice([m for m in combo_line_options if m != y1])
            input_text = template.format(y1=f"{agg} {y1}", y2=f"{agg} {y2}", x=x)
            target_text = f"chart: combo, x: {x}, y1: {y1}, y2: {y2}, agg: {agg}"

        else:
            input_text = template.format(x=x, y=f"{agg} {y}")
            target_text = f"chart: {chart}, x: {x}, y: {y}, agg: {agg}"

        # Only the prompt is passed through introduce_noise; the target is left as is.
        if random.random() < noise_probability:
            input_text = introduce_noise(input_text)

        samples.append({"input": input_text, "target": target_text})


df_ag = pd.DataFrame(samples)


In [193]:
# Append the business/KPI samples to the running dataset and renumber the index.
# NOTE: df is both input and output here, so re-running this cell appends the
# same rows a second time.
df = pd.concat(objs=[df, df_ag], axis=0, ignore_index=True)

In [194]:
# Sanity check: size of the running dataset after appending the business/KPI samples.
df.shape

(52000, 2)

### Bubble chart

In [196]:

# Bubble-specific vocabularies. They use their own names on purpose: assigning
# to x_options / y_options here would replace the notebook-wide lists that the
# other generator cells sample from, so re-running one of those cells after
# this one would silently draw from the bubble vocabulary instead.
bubble_x_options = ["product", "region", "discount band", "category", "department"]
bubble_y_options = ["profit", "revenue", "growth", "conversion"]
bubble_size_options = ["sales", "clicks", "downloads", "amount"]

bubble_agg_methods = ["sum", "avg", "mean", "total"]

# Every prompt mentions x, y, the size metric and the aggregation.
bubble_templates = [
    "Show a bubble chart of {y} vs {x}, where bubble size is {size} (use {agg})",
    "Bubble chart: x={x}, y={y}, size={size} aggregated by {agg}",
    "Plot {y} by {x} with bubble sized on {size} using {agg}",
    "Display {y} against {x}, bubble size = {agg} of {size}",
    "I want a bubble chart with {x} and {y}, bubble shows {agg} {size}"
]

samples = []
for _ in range(100):
    x = random.choice(bubble_x_options)
    y = random.choice(bubble_y_options)
    size = random.choice(bubble_size_options)
    agg = random.choice(bubble_agg_methods)
    template = random.choice(bubble_templates)

    input_text = template.format(x=x, y=y, size=size, agg=agg)
    # The target echoes the drawn values verbatim (agg is not normalised).
    target_text = f"chart: bubble, x: {x}, y: {y}, size: {size}, agg: {agg}"

    samples.append({"input": input_text, "target": target_text})

df_ag_bubble = pd.DataFrame(samples)


In [197]:
# Append the bubble-chart samples to the running dataset and renumber the index.
# NOTE: df is both input and output here, so re-running this cell appends the
# same rows a second time.
df = pd.concat(objs=[df, df_ag_bubble], axis=0, ignore_index=True)

In [198]:
# Sanity check: size of the running dataset after appending the bubble-chart samples.
df.shape

(52100, 2)

### Maps (Geo Scatter & Choropleth)

In [199]:

# Vocabularies for the map prompts.
locations = ["country", "state", "city", "region", "district", "zone"]
metrics = ["sales", "revenue", "profit", "population", "income", "cases", "clicks", "downloads", "pollution"]
agg_methods = ["sum", "total", "avg", "mean"]

geo_templates = [
    "Show Geo Scatter Map with {agg} of {metric} by {location}",
    "Display a geo map with bubble size showing {agg} {metric} across {location}",
    "Map of {location} with bubble sized by {metric} using {agg}",
    "Geo scatter plot: location = {location}, size = {metric} aggregated as {agg}"
]

choro_templates = [
    "Show Choropleth map with {location} colored by {agg} of {metric}",
    "Color regions on map by {agg} {metric} across {location}",
    "Choropleth of {location} with gradient from {metric} ({agg})",
    "Map showing intensity of {metric} ({agg}) by {location}"
]

# One entry per map type: (prompt templates, target format). The geo scatter
# target carries the metric under "size", the choropleth target under "color".
map_chart_specs = [
    (geo_templates, "chart: geo_scatter, location: {location}, size: {metric}, agg: {agg}"),
    (choro_templates, "chart: choropleth, location: {location}, color: {metric}, agg: {agg}"),
]

samples = []

# 300 samples per map type, geo scatter first, then choropleth.
for prompt_templates, target_format in map_chart_specs:
    for _ in range(300):
        # Draw order is location, metric, agg, template.
        location = random.choice(locations)
        metric = random.choice(metrics)
        agg = random.choice(agg_methods)
        template = random.choice(prompt_templates)

        samples.append({
            "input": template.format(location=location, metric=metric, agg=agg),
            "target": target_format.format(location=location, metric=metric, agg=agg),
        })


df_map = pd.DataFrame(samples)



In [200]:
# Append the map samples to the running dataset and renumber the index.
# NOTE: df is both input and output here, so re-running this cell appends the
# same rows a second time.
df = pd.concat(objs=[df, df_map], axis=0, ignore_index=True)

In [201]:
# Sanity check: final size of the dataset after appending the map samples.
df.shape

(52700, 2)

In [202]:

# Shuffle all rows with a fixed seed (repeatable for a given df), then
# renumber the index from 0.
shuffled_rows = df.sample(frac=1, random_state=42)
df_shuffled = shuffled_rows.reset_index(drop=True)

# Write the final training set to disk without the index column.
df_shuffled.to_csv("t5_final_dataset.csv", index=False)


## Dataset for Filter Extraction

In [24]:
# First filter dataset.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
filter_phrases_csv = r"C:\Users\sanik\Downloads\filter_phrases_extended_1200.csv"
df1 = pd.read_csv(filter_phrases_csv)

In [25]:
# Preview the first rows to check which columns were read.
df1.head()

Unnamed: 0,input,column,operator,value
0,platform == Android,platform,==,Android
1,sales != 789,sales,!=,789
2,budget == 127,budget,==,127
3,amount between 424 and 464,amount,between,424-464
4,age < 808,age,<,808


In [26]:
# Second filter dataset.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
filter_dataset_csv = r"C:\Users\sanik\Downloads\filter_dataset_6000_no_unknown.csv"
df2 = pd.read_csv(filter_dataset_csv)

In [27]:
# Preview the first rows to check which columns were read.
df2.head()

Unnamed: 0,input,column,operation,value
0,if mission equals Africa,mission,=,Africa
1,equal to 1500,year,=,1500
2,only for area more than iOS,area,>,iOS
3,zone greater than Tablet,zone,>,Tablet
4,variant greater than Female,variant,>,Female


In [28]:
# df2 names this column "operation" while df1 uses "operator"; use one name in
# both frames before they are stacked.
df2 = df2.rename(columns={"operation": "operator"})

In [29]:
# Stack both filter datasets, shuffle the rows and renumber the index.
# The shuffle is seeded so the exported file is the same on every run; the
# seed matches the one used when shuffling the chart dataset.
combined_df = (
    pd.concat([df1, df2], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [30]:


# Model input: the raw phrase behind a fixed task prefix.
combined_df["input_text"] = "Extract filters: " + combined_df["input"]

# Model target: "column: <c>; operator: <o>; value: <v>", built piece by piece.
column_part = "column: " + combined_df["column"]
operator_part = "; operator: " + combined_df["operator"]
value_part = "; value: " + combined_df["value"]
combined_df["target_text"] = column_part + operator_part + value_part

# Export only the two text columns, without the index.
tuning_columns = ["input_text", "target_text"]
combined_df.loc[:, tuning_columns].to_csv("filter_dataset_for_tuning.csv", index=False)
