Imports

In [3]:
import pandas as pd
import os
from dotenv import load_dotenv
import requests

In [4]:
# Load sample dataset (Boston housing dataset)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 
           'RM', 'AGE', 'DIS', 'RAD', 'TAX', 
           'PTRATIO', 'B', 'LSTAT', 'MEDV']

df = pd.read_csv(url, header=None, names=columns)

In [5]:
# Add artificial missing value (for testing purposes)
df.loc[0, 'RM'] = None

print(df.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538    NaN  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


Loading API Key from ENV File

In [None]:
load_dotenv()
token = os.getenv("HF_TOKEN")

API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
headers = {"Authorization": f"Bearer {token}"}

Function for prompt to send to LLM

In [7]:
def create_prompt(row):
    prompt = f"""
    Analyze the following data row and identify any data quality issues, such as missing or inconsistent values. 
    Clearly state each issue and its location.

    Data row:
    {row.to_dict()}

    Issues:
    """
    return prompt


In [8]:
def query_hf(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

Receive the errors from LLM

In [26]:
def check_quality(row):
    prompt = create_prompt(row)
    output = query_hf({
        "inputs": prompt,
        "parameters": {"max_new_tokens": 100, "temperature": 0.2}
    })

    generated_text = output[0]['generated_text']
    issues = generated_text.split('Issues:')[-1].strip()
    return issues

In [27]:
# Test with first row
row = df.iloc[0]
issues = check_quality(row)
print("Detected issues:", issues)

Detected issues: 1. Missing value in 'RM' column. Location: 'RM' column.
    2. 'RM' column is not present in the header row, which could indicate a potential naming error or missing column.
    3. 'MEDV' value is rounded to two decimal places, which could indicate potential data rounding errors or inconsistencies.
    4. 'MEDV' value is an outlier, as it is significantly lower than the other values in


In [28]:
import time

for idx, row in df.head(10).iterrows():
    print(f"Row {idx}:")
    print(check_quality(row))
    print('-' * 40)
    time.sleep(1)  # to avoid rate limits


Row 0:
1. Missing value in 'RM' column. Location: 'RM' column.
    2. 'RM' column is not present in the header row, which could indicate a potential naming error or missing column.
    3. 'MEDV' value is rounded to two decimal places, which could indicate potential data rounding errors or inconsistencies.
    4. 'MEDV' value is an outlier, as it is significantly lower than the other values in
----------------------------------------
Row 1:
1. Missing value: 'ZN' is missing a value. This is located at the second key-value pair in the dictionary.
    2. Inconsistent value: 'NOX' has a value of 0.469, which is not a valid numerical value for NOX levels. This is located at the sixth key-value pair in the dictionary.
    3. Inconsistent value: 'LSTAT' has a value of 9.1
----------------------------------------
Row 2:
1. Missing value: 'ZN' is missing a value. Its location is in the dictionary key 'ZN'.
    2. Inconsistent value: 'CHAS' should be either 0 or 1, but it is 0.0. Its location is

In [12]:
results = []

for idx, row in df.head(10).iterrows():
    issues = check_quality(row)
    results.append({'index': idx, 'issues': issues})
    time.sleep(1)

results_df = pd.DataFrame(results)
results_df.to_csv("llm_data_quality_results.csv", index=False)


In [13]:
from sklearn.ensemble import IsolationForest

clf = IsolationForest(contamination=0.1)
df_numeric = df.select_dtypes(include=['float64', 'int'])
df['anomaly'] = clf.fit_predict(df_numeric)


print(df)


        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31     0  0.538    NaN  65.2  4.0900    1  296.0   
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273.0   
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273.0   
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273.0   
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273.0   
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273.0   

     PTRATIO       B  LSTAT  MEDV  anomaly  
0       15.3  396.90   4.98  24.0        1  
1       1

In [14]:
print(df[df['anomaly'] == -1])


         CRIM    ZN  INDUS  CHAS     NOX     RM    AGE      DIS  RAD    TAX  \
54    0.01360  75.0   4.00     0  0.4100  5.888   47.6   7.3197    3  469.0   
55    0.01311  90.0   1.22     0  0.4030  7.249   21.9   8.6966    5  226.0   
142   3.32105   0.0  19.58     1  0.8710  5.403  100.0   1.3216    5  403.0   
144   2.77974   0.0  19.58     0  0.8710  4.903   97.8   1.3459    5  403.0   
145   2.37934   0.0  19.58     0  0.8710  6.130  100.0   1.4191    5  403.0   
152   1.12658   0.0  19.58     1  0.8710  5.012   88.0   1.6102    5  403.0   
154   1.41385   0.0  19.58     1  0.8710  6.129   96.0   1.7494    5  403.0   
155   3.53501   0.0  19.58     1  0.8710  6.152   82.6   1.7455    5  403.0   
156   2.44668   0.0  19.58     0  0.8710  5.272   94.0   1.7364    5  403.0   
162   1.83377   0.0  19.58     1  0.6050  7.802   98.2   2.0407    5  403.0   
163   1.51902   0.0  19.58     1  0.6050  8.375   93.9   2.1620    5  403.0   
195   0.01381  80.0   0.46     0  0.4220  7.875   32

In [15]:
from datasets import load_dataset

# Load JSONL as Hugging Face Dataset
dataset = load_dataset("json", data_files="llm_data_quality_training_set.jsonl", split="train")

# Optional: Split into train and test
dataset = dataset.train_test_split(test_size=0.1)


In [16]:
def format_zephyr(example):
    return {
        "text": f"<|system|>\nYou are a helpful data quality assistant.\n<|user|>\n{example['prompt']}<|assistant|>\n{example['response']}"
    }

dataset = dataset.map(format_zephyr)


Map: 100%|██████████| 94/94 [00:00<00:00, 7252.84 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 1131.98 examples/s]


In [17]:
from transformers import AutoTokenizer

model_id = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized = dataset.map(tokenize, batched=True)


Map: 100%|██████████| 94/94 [00:00<00:00, 2456.65 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 3527.32 examples/s]


In [18]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"
)


Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.15it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [19]:
import transformers
print(transformers.__version__)


4.51.0


In [20]:
from transformers import TrainingArguments
print(TrainingArguments.__module__)


transformers.training_args


In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./zephyr-quality-checker",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    eval_strategy="epoch",   # ✅ spelling must be exact
    save_strategy="epoch",         # ✅ this too
    logging_steps=10,
    save_total_limit=2
)


In [23]:
from transformers import Trainer, DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

  trainer = Trainer(
You shouldn't move a model that is dispatched using accelerate hooks.


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [None]:
trainer.train()


In [None]:
input_text = "<|system|>\nYou are a helpful data quality assistant.\n<|user|>\nAnalyze the following row: {'RM': None, 'TAX': 300, 'RAD': 24}<|assistant|>\n"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
trainer.save_model("fine-tuned-zephyr")
tokenizer.save_pretrained("fine-tuned-zephyr")
