In [27]:
import os
import time

import pandas as pd
import requests
from dotenv import load_dotenv

In [28]:
# Sample data: the Boston housing CSV. The file is read with header=None,
# so the column labels are supplied explicitly.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

df = pd.read_csv(url, names=columns, header=None)

In [29]:
# Blank out one known cell on purpose so the quality check has a real
# missing value to find (testing aid only).
missing_row, missing_col = 0, 'RM'
df.loc[missing_row, missing_col] = None

# Preview the first five rows to confirm the gap is in place.
print(df.head(5))

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538    NaN  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [31]:
# Pull HF_TOKEN from the process environment (load_dotenv() is called first so
# a local .env file can provide it).
load_dotenv()
token = os.getenv("HF_TOKEN")
if not token:
    # Without this guard the header below would literally read "Bearer None"
    # and the mistake would only surface later as an opaque API error.
    raise RuntimeError(
        "HF_TOKEN is not set; define it in the environment or in a .env file "
        "before running the cells below."
    )

# Hosted inference endpoint for the model and the auth header sent with
# every request.
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
headers = {"Authorization": f"Bearer {token}"}

# Alternative kept for reference (disabled): build a text-generation pipeline
# for the same model instead of calling the endpoint above.
# pipe = pipeline("text-generation",
#                 model="HuggingFaceH4/zephyr-7b-beta",
#                 token=token)

In [32]:
def create_prompt(row):
    """Build the data-quality prompt for a single row.

    Args:
        row: Object exposing ``to_dict()`` (e.g. a pandas Series); its dict
            representation is embedded verbatim in the prompt.

    Returns:
        The prompt text. It ends with an ``Issues:`` marker line followed by
        an indented empty line, so the model's answer continues from there.
    """
    row_values = row.to_dict()
    # Each entry is one line of the prompt; the leading indentation and the
    # trailing space on the first instruction line are part of the text.
    prompt_lines = [
        "",
        "    Analyze the following data row and identify any data quality issues, such as missing or inconsistent values. ",
        "    Clearly state each issue and its location.",
        "",
        "    Data row:",
        f"    {row_values}",
        "",
        "    Issues:",
        "    ",
    ]
    return "\n".join(prompt_lines)


In [33]:
def query_hf(payload, timeout=60):
    """POST ``payload`` to ``API_URL`` and return the decoded JSON response.

    Args:
        payload: JSON-serialisable request body, e.g.
            ``{"inputs": "...", "parameters": {...}}``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook kernel
            indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. The
            message includes the status code and the start of the response
            body so the cause is visible.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if not response.ok:
        # Fail here instead of handing an error payload to the caller, which
        # would otherwise break later while indexing into the result.
        raise requests.HTTPError(
            f"Hugging Face API returned HTTP {response.status_code}: "
            f"{response.text[:500]}",
            response=response,
        )
    return response.json()

In [35]:
def check_quality(row, max_new_tokens=100, temperature=0.2):
    """Ask the model to list data quality issues for one row.

    Args:
        row: The data row to inspect; passed to ``create_prompt``.
        max_new_tokens: Upper bound on generated tokens. Raise it if answers
            come back cut off mid-sentence.
        temperature: Sampling temperature sent to the model.

    Returns:
        The model's answer with the echoed prompt removed and surrounding
        whitespace stripped.

    Raises:
        RuntimeError: If the API returns an error payload or a response that
            does not contain ``generated_text``.
    """
    prompt = create_prompt(row)
    output = query_hf({
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
    })

    # Failures may arrive as a dict such as {"error": "..."} rather than the
    # usual list of generations; report that message instead of a KeyError.
    if isinstance(output, dict) and "error" in output:
        raise RuntimeError(f"Hugging Face API error: {output['error']}")
    try:
        generated_text = output[0]['generated_text']
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Unexpected response from Hugging Face API: {output!r}"
        ) from exc

    if generated_text.startswith(prompt):
        # The response echoes the prompt; drop exactly that prefix so an
        # "Issues:" written by the model itself does not truncate the answer.
        completion = generated_text[len(prompt):]
    else:
        # Prompt not echoed verbatim: keep everything after the first marker
        # (or the whole text when the marker is absent).
        completion = generated_text.split('Issues:', 1)[-1]
    return completion.strip()

In [37]:
# Test with first row
row = df.iloc[0]
issues = check_quality(row)
print("Detected issues:", issues)

Detected issues: 1. Missing value for 'RM' (located at index 'RM')
    2. Inconsistent value for 'RM' as it should be a numerical value, but is currently 'nan' (located at index 'RM')
    3. Inconsistent value for 'DIS' as it should be a numerical value, but is currently '4.09' (located at index 'DIS')
    4. Inconsistent value for 'MEDV


In [38]:
# Print the model's answer for each of the first ten rows, one request per row.
# `time` is imported in the notebook's top import cell.
for idx, row in df.head(10).iterrows():
    print(f"Row {idx}:")
    print(check_quality(row))
    print('-' * 40)
    time.sleep(1)  # to avoid rate limits


Row 0:
1. Missing value for 'RM' (located at index 'RM')
    2. Inconsistent value for 'RM' as it should be a numerical value, but is currently 'nan' (located at index 'RM')
    3. Inconsistent value for 'DIS' as it should be a numerical value, but is currently '4.09' (located at index 'DIS')
    4. Inconsistent value for 'MEDV
----------------------------------------
Row 1:
1. Missing value: 'ZN' is missing a value. It should be a numerical value, but instead, it is 0.0. This could be due to a mistake in data entry or a missing value in the original dataset.
    2. Inconsistent value: 'CHAS' is also missing a value, but in this case, it is represented by a 0.0 instead of a null value. This could be a mistake in data cleaning or form
----------------------------------------
Row 2:
1. Missing value: 'ZN' is missing a value. This is located in the second column of the dictionary.
    2. Inconsistent value: 'CHAS' should be either 0 or 1, but it is currently 0.0. This is located in the th

In [39]:
# Query the first ten rows and keep each answer so it can be saved.
results = []

for idx, row in df.iloc[:10].iterrows():
    issues = check_quality(row)
    results.append(dict(index=idx, issues=issues))
    time.sleep(1)  # pause between requests

# One record per row -> two-column table, written without the pandas index.
results_df = pd.DataFrame(results)
results_df.to_csv("llm_data_quality_results.csv", index=False)
