# Generated Dataset Inspector

This notebook lets you inspect the first few entries of the generated dataset, which is stored as a Parquet file.


In [20]:
import pandas as pd
import os
import sys

In [14]:
def inspect_parquet(file_path, num_rows=5):
	"""
	Read a Parquet file and print a short summary of its contents.

	Prints the frame's shape, its column names, the leading rows and the
	column dtypes. Any exception raised while reading or summarising is
	caught and reported on stdout instead of being propagated.

	Args:
		file_path (str): Path to the Parquet file.
		num_rows (int): Number of leading rows to display.

	Returns:
		pandas.DataFrame | None: The loaded frame, or None when an error
		was caught and reported.
	"""
	try:
		# Read the Parquet file
		df = pd.read_parquet(file_path)

		# Slice first so the heading reports how many rows are really shown;
		# the file may hold fewer than num_rows entries.
		preview = df.head(num_rows)

		# Print basic information
		print(f'Dataset shape: {df.shape}')
		print(f'Columns: {df.columns.tolist()}')
		print(f'\nFirst {len(preview)} rows:')
		print(preview)

		# Print data types
		print('\nData types:')
		print(df.dtypes)

		return df

	except Exception as e:
		# Deliberately best-effort: report the problem and signal failure
		# to the caller with an explicit None.
		print(f'Error reading Parquet file: {e}')
		return None

In [15]:
# Root directory of the scrape run (machine-specific absolute path; adjust when running elsewhere).
run_dir_path = '/matx/u/simonguo/triton_scrape_stack'
# Location of the Parquet dataset, relative to the run directory.
dataset_path = 'datasets/dataset.parquet'
full_dataset_path = os.path.join(run_dir_path, dataset_path)

# Fail early with a readable message if the dataset is not where we expect it.
assert os.path.exists(full_dataset_path), f'Dataset file does not exist: {full_dataset_path}'

In [16]:
# Number of leading rows to preview when inspecting the dataset.
num_rows = 5

In [17]:
df = inspect_parquet(full_dataset_path, num_rows)
# inspect_parquet reports read errors on stdout and hands back None instead of raising;
# stop here so later cells do not fail with an unrelated-looking AttributeError.
assert df is not None, f'Failed to load dataset: {full_dataset_path}'

Dataset shape: (4, 5)
Columns: ['entry_point', 'original_triton_code', 'python_code', 'triton_code', 'uuid']

First 5 rows:
               entry_point                               original_triton_code  \
0                Cartesian  # AOT ID: ['2_inference']\nfrom ctypes import ...   
1            CombineSlices  # AOT ID: ['3_inference']\nfrom ctypes import ...   
2  GlobalWeightedAvgPool2d  # AOT ID: ['3_forward']\nfrom ctypes import c_...   
3                    Model  # AOT ID: ['1_forward']\nfrom ctypes import c_...   

                                         python_code  \
0  import torch\nfrom torch import nn\nimport tor...   
1  import torch\nfrom torch import nn\nimport tor...   
2  import torch\nfrom torch import nn\n\n\nclass ...   
3  from torch.nn import Module\nimport torch\nimp...   

                                         triton_code  uuid  
0  import torch\nimport triton\nimport triton.lan...     0  
1  import torch\nimport triton\nimport triton.lan...     1  
2  imp

In [18]:
# Bare expression: lets the notebook render the frame with its rich table display.
df

Unnamed: 0,entry_point,original_triton_code,python_code,triton_code,uuid
0,Cartesian,# AOT ID: ['2_inference']\nfrom ctypes import ...,import torch\nfrom torch import nn\nimport tor...,import torch\nimport triton\nimport triton.lan...,0
1,CombineSlices,# AOT ID: ['3_inference']\nfrom ctypes import ...,import torch\nfrom torch import nn\nimport tor...,import torch\nimport triton\nimport triton.lan...,1
2,GlobalWeightedAvgPool2d,# AOT ID: ['3_forward']\nfrom ctypes import c_...,import torch\nfrom torch import nn\n\n\nclass ...,import torch\nfrom torch._inductor.select_algo...,2
3,Model,# AOT ID: ['1_forward']\nfrom ctypes import c_...,from torch.nn import Module\nimport torch\nimp...,import torch\nimport triton\nimport triton.lan...,3


## Verify that program pairs actually work

In [33]:
# How many dataset entries to draw at random for the correctness check.
num_samples = 2

In [34]:
# run_and_check is imported from the run directory, so that directory must be on sys.path.
# Guard the append so re-running this cell does not pile up duplicate entries.
if run_dir_path not in sys.path:
	sys.path.append(run_dir_path)
from run_and_check import evaluate_ref_and_kernel_correctness


# Never request more rows than exist: DataFrame.sample raises ValueError when n
# exceeds the number of rows (sampling is without replacement by default).
# The fixed random_state keeps the selection reproducible across runs.
sampled_indices = df.sample(n=min(num_samples, len(df)), random_state=42).index.tolist()
print(f'Sampled indices: {sampled_indices}')

Sampled indices: [1, 3]


In [38]:
# Run the correctness check on every sampled entry and collect one record per entry.
results = []
for idx in sampled_indices:
	# sampled_indices holds index *labels* (taken from df.sample(...).index), so
	# rows must be looked up by label. Positional .iloc only agrees with the
	# labels while the frame keeps its default 0..n-1 index.
	entry = df.loc[idx]
	entry_point = entry['entry_point']
	print(f'\nEvaluating entry {idx}: {entry_point}')

	# Prefer the 'reference_code' / 'generated_code' columns when present and
	# fall back to the 'python_code' / 'triton_code' columns otherwise.
	ref_code = entry.get('reference_code', entry.get('python_code'))
	kernel_code = entry.get('generated_code', entry.get('triton_code'))

	print(f'Elavating Sample {idx} which is {entry_point} for Correctness')

	try:
		# Evaluate correctness
		result = evaluate_ref_and_kernel_correctness(
			ref_arch_src=ref_code,
			kernel_src=kernel_code,
			entry_point=entry_point,
			num_trials=3,  # You can adjust this as needed
		)
		# NOTE: 'success' only records that the evaluation ran without raising;
		# the verdict returned by the checker is stored under 'result'.
		results.append({
			'index': idx,
			'entry_point': entry_point,
			'result': result,
			'success': True,
		})
	except Exception as e:
		# Keep going with the remaining samples; record the failure instead.
		print(f'Error during evaluation: {e}')
		results.append({
			'index': idx,
			'entry_point': entry_point,
			'error': str(e),
			'success': False,
		})


Evaluating entry 1: CombineSlices
Elavating Sample 1 which is CombineSlices

Evaluating entry 3: Model
Elavating Sample 3 which is Model


In [40]:
# Tabulate the per-entry records and report the share that ran without raising.
results_df = pd.DataFrame(results)
print('\nEvaluation Results Summary:')
if results_df.empty:
	# With no records there is no 'success' column and nothing to divide by.
	print('No entries were evaluated.')
else:
	print(results_df)
	# Mean of a boolean column is the fraction of True values.
	success_percentage = results_df['success'].mean() * 100
	print(f'\nSuccess Rate: {success_percentage:.2f}%')


Evaluation Results Summary:
   index    entry_point  result  success
0      1  CombineSlices    True     True
1      3          Model    True     True

Success Rate: 100.00%
