In [19]:
import os
import time

import pandas as pd
import requests
from dotenv import load_dotenv
from sklearn.ensemble import IsolationForest

In [20]:
# Boston housing sample data. The file is read with header=None, so the
# column labels are supplied here instead of being taken from the CSV.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

df = pd.read_csv(url, names=columns, header=None)

In [21]:
# Testing aid: deliberately blank out a single cell so the quality checks
# that follow have a known missing value to detect.
df.loc[0, 'RM'] = None

# Preview the first five rows to confirm the gap is in place.
print(df.head(5))

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538    NaN  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [22]:
# Read the Hugging Face access token from the environment; load_dotenv()
# first merges in any variables defined in a local .env file.
load_dotenv()
token = os.getenv("HF_TOKEN")
if not token:
    # Fail fast: without this guard the header below would literally read
    # "Bearer None" and the problem would only surface later as an opaque
    # error inside the first API call.
    raise RuntimeError(
        "HF_TOKEN is not set; export it or add it to a .env file before running this notebook."
    )

# Hosted Inference API endpoint for the text-generation model, plus the
# auth header sent with every request.
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
headers = {"Authorization": f"Bearer {token}"}

In [23]:
def create_prompt(row):
    """Build the data-quality prompt for a single record.

    Args:
        row: Object exposing ``to_dict()`` (e.g. a pandas Series for one row).

    Returns:
        The instruction text with the row's dict representation embedded,
        ending in an ``Issues:`` marker for the model to continue from.
    """
    row_values = row.to_dict()
    return f"""
    Analyze the following data row and identify any data quality issues, such as missing or inconsistent values. 
    Clearly state each issue and its location.

    Data row:
    {row_values}

    Issues:
    """


In [24]:
def query_hf(payload, timeout=60):
    """POST ``payload`` to the configured Inference API endpoint and return its JSON.

    Args:
        payload: JSON-serialisable request body, e.g.
            ``{"inputs": "...", "parameters": {...}}``.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled connection would
            block the notebook indefinitely.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if not response.ok:
        # Keep the response body in the message: it usually carries the
        # API's own explanation of what went wrong.
        raise requests.HTTPError(
            f"Hugging Face API returned {response.status_code}: {response.text}",
            response=response,
        )
    return response.json()

In [25]:
def check_quality(row, max_new_tokens=100, temperature=0.2):
    """Ask the language model to list data-quality issues for one row.

    Args:
        row: Record to inspect; passed to ``create_prompt``.
        max_new_tokens: Upper bound on generated tokens. Long answers are cut
            off at this limit, so raise it if the output ends mid-sentence.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The model's answer with the echoed prompt removed and surrounding
        whitespace stripped.

    Raises:
        RuntimeError: If the API response is an error object or does not have
            the expected ``[{"generated_text": ...}]`` shape.
    """
    prompt = create_prompt(row)
    output = query_hf({
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature}
    })

    # A JSON object with an "error" key (instead of a list of generations)
    # signals a failed request; surface that message rather than a KeyError.
    if isinstance(output, dict) and "error" in output:
        raise RuntimeError(f"Hugging Face API error: {output['error']}")
    try:
        generated_text = output[0]['generated_text']
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(f"Unexpected response from Hugging Face API: {output!r}") from exc

    if generated_text.startswith(prompt):
        # The prompt is echoed back verbatim: drop exactly that prefix so an
        # answer that itself contains "Issues:" is not truncated.
        issues = generated_text[len(prompt):]
    else:
        # Fallback when the prompt is not echoed byte-for-byte.
        issues = generated_text.split('Issues:')[-1]
    return issues.strip()

In [26]:
# Test with first row
row = df.iloc[0]
issues = check_quality(row)
print("Detected issues:", issues)

Detected issues: 1. Missing value for 'RM' (located at index 'RM')
    2. Inconsistent value for 'RM' in previous rows (this data row is part of a larger dataset, and 'RM' values for nearby rows are not missing)
    3. Inconsistent value for 'MEDV' (24.0 is significantly lower than the expected median value for this area)
    4. Inconsistent value for 'LSTAT' (4


In [27]:
# Run the quality check on the first 10 rows and print each answer.
# (`time` comes from the notebook's import cell, so later cells that call
# time.sleep no longer depend on this cell having been executed.)
for idx, row in df.head(10).iterrows():
    print(f"Row {idx}:")
    print(check_quality(row))
    print('-' * 40)
    time.sleep(1)  # to avoid rate limits


Row 0:
1. Missing value for 'RM' (located at index 'RM')
    2. Inconsistent value for 'RM' in previous rows (this data row is part of a larger dataset, and 'RM' values for nearby rows are not missing)
    3. Inconsistent value for 'MEDV' (24.0 is significantly lower than the expected median value for this area)
    4. Inconsistent value for 'LSTAT' (4
----------------------------------------
Row 1:
1. Missing value: 'ZN' is missing a value. This is located in the second column of the dictionary.
    2. Inconsistent value: 'CHAS' has a value of 0.0, but this value is not consistent with the other data points in the dataset. 'CHAS' is a binary feature that should have a value of either 0 or 1. This is located in the third column of the dictionary.
    3. Miss
----------------------------------------
Row 2:
1. Missing value: 'ZN' is missing a value. This is located in the second column of the dictionary.
    2. Inconsistent value: 'CHAS' should be either 0 or 1, but it is currently 0.0. 

In [28]:
# Gather the model's verdict for each of the first 10 rows, pausing one
# second between API calls, then persist everything as a CSV file.
results = []

for idx, row in df.iloc[:10].iterrows():
    issues = check_quality(row)
    results.append(dict(index=idx, issues=issues))
    time.sleep(1)

results_df = pd.DataFrame(results)
results_df.to_csv("llm_data_quality_results.csv", index=False)


In [29]:
# Unsupervised outlier detection over the numeric columns.
# Features: every numeric column except a previously written 'anomaly' label,
# so re-running this cell never feeds old predictions back into the model.
df_numeric = df.drop(columns=['anomaly'], errors='ignore').select_dtypes(include='number')
# Median-impute gaps (row 0 carries an injected NaN in 'RM'): IsolationForest
# only accepts NaN input in some scikit-learn releases and raises otherwise.
df_numeric = df_numeric.fillna(df_numeric.median())

# A fixed seed makes the set of flagged rows reproducible across runs.
clf = IsolationForest(contamination=0.1, random_state=42)
df['anomaly'] = clf.fit_predict(df_numeric)  # 1 = inlier, -1 = outlier


print(df)


        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31     0  0.538    NaN  65.2  4.0900    1  296.0   
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273.0   
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273.0   
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273.0   
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273.0   
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273.0   

     PTRATIO       B  LSTAT  MEDV  anomaly  
0       15.3  396.90   4.98  24.0        1  
1       1

In [30]:
# Show only the rows the isolation forest labelled as outliers (-1).
print(df.loc[df['anomaly'].eq(-1)])


         CRIM    ZN  INDUS  CHAS     NOX     RM    AGE      DIS  RAD    TAX  \
141   1.62864   0.0  21.89     0  0.6240  5.019  100.0   1.4394    4  437.0   
142   3.32105   0.0  19.58     1  0.8710  5.403  100.0   1.3216    5  403.0   
143   4.09740   0.0  19.58     0  0.8710  5.468  100.0   1.4118    5  403.0   
144   2.77974   0.0  19.58     0  0.8710  4.903   97.8   1.3459    5  403.0   
145   2.37934   0.0  19.58     0  0.8710  6.130  100.0   1.4191    5  403.0   
146   2.15505   0.0  19.58     0  0.8710  5.628  100.0   1.5166    5  403.0   
147   2.36862   0.0  19.58     0  0.8710  4.926   95.7   1.4608    5  403.0   
148   2.33099   0.0  19.58     0  0.8710  5.186   93.8   1.5296    5  403.0   
152   1.12658   0.0  19.58     1  0.8710  5.012   88.0   1.6102    5  403.0   
154   1.41385   0.0  19.58     1  0.8710  6.129   96.0   1.7494    5  403.0   
155   3.53501   0.0  19.58     1  0.8710  6.152   82.6   1.7455    5  403.0   
156   2.44668   0.0  19.58     0  0.8710  5.272   94