In [1]:
!pip install kaggle
!kaggle --version

Kaggle API 1.7.4.5


In [3]:
!kaggle datasets download -d csmalarkodi/liar-fake-news-dataset

Dataset URL: https://www.kaggle.com/datasets/csmalarkodi/liar-fake-news-dataset
License(s): unknown
Downloading liar-fake-news-dataset.zip to /home/alejandro/Desktop/UNI/ERASMUS/BigData/TruthLens
  0%|                                               | 0.00/0.98M [00:00<?, ?B/s]
100%|███████████████████████████████████████| 0.98M/0.98M [00:00<00:00, 883MB/s]


In [4]:
!unzip -q liar-fake-news-dataset.zip
!ls -1
# you should see: 
#   README.md
#   test.tsv
#   train.tsv
#   valid.tsv


data
liar-fake-news-dataset.zip
README
test.tsv
train.tsv
Untitled.ipynb
valid.tsv


In [6]:
import pandas as pd

# adjust sep='\t' for TSV
train = pd.read_csv("train.tsv", sep="\t", header=None)
valid = pd.read_csv("valid.tsv", sep="\t", header=None)
test  = pd.read_csv("test.tsv", sep="\t", header=None)

# assign column names (per LIAR README)
cols = ['id','label','statement','subject','speaker','job_title',
        'state_info','party_affiliation','barely_true_counts',
        'false_counts','half_true_counts','mostly_true_counts',
        'pants_on_fire_counts','context']
train.columns = cols
valid.columns = cols
test.columns  = cols

# Peek
train[['statement','label']].head()


Unnamed: 0,statement,label
0,Says the Annies List political group supports ...,false
1,When did the decline of coal start? It started...,half-true
2,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true
3,Health care reform legislation is likely to ma...,false
4,The economic turnaround started at the end of ...,half-true


In [7]:
# Quedarnos solo con la afirmación y la etiqueta
df = pd.concat([train, valid, test], ignore_index=True)[['statement','label']]

# Elimina filas donde statement sea NaN o vacío
df = df[df['statement'].notna() & (df['statement'].str.strip() != "")]
df.shape  # debe dar algo cercano a 12000 filas


(12791, 2)

In [12]:
def map_label(x):
    return 'real' if x in ['true','mostly-true','half-true'] else 'fake'

df['mapped'] = df['label'].apply(map_label)
df[['statement','label','mapped']].head(5)


Unnamed: 0,statement,label,mapped
0,Says the Annies List political group supports ...,false,fake
1,When did the decline of coal start? It started...,half-true,real
2,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true,real
3,Health care reform legislation is likely to ma...,false,fake
4,The economic turnaround started at the end of ...,half-true,real


In [14]:
df_sample = df.sample(200, random_state=42).reset_index(drop=True)
df_sample.to_csv("sample_liar.csv", index=False)
df_sample.head(5)


Unnamed: 0,statement,label,mapped
0,"In the event of a U.S. strike on Syria, the Ru...",barely-true,fake
1,To give the proposed economic stimulus plan so...,true,real
2,Its warmer on Mars than it is in parts of the ...,barely-true,fake
3,The health insurance plan that (members of Con...,true,real
4,He's promising four more years of an administr...,false,fake


In [41]:
from sklearn.metrics import accuracy_score
# Prepara todos los textos de la muestra
texts = df_sample["statement"].tolist()
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)

# Ejecuta el pipeline en un solo batch
results = classifier(
    sequences=texts,
    candidate_labels=["Real","Fake"],
    multi_label=False,
    batch_size=32            # prueba distintos tamaños: 8, 16, 32…
)

# Extrae etiquetas y scores
labels = [r["labels"][0].lower() for r in results]
scores = [r["scores"][0]          for r in results]

df_sample["predicted"] = labels
df_sample["score"]     = scores

df_sample.head(10)[["statement","mapped","predicted","score"]]


y_true = df_sample["mapped"]
y_pred = df_sample["predicted"]

acc = accuracy_score(y_true, y_pred)
print(f"Accuracy: {acc:.2%}")
print("Etiquetas reales (mapped):", df_sample["mapped"].unique())
print("Etiquetas predictas   :", df_sample["predicted"].unique())



Accuracy: 59.00%
Etiquetas reales (mapped): ['fake' 'real']
Etiquetas predictas   : ['real' 'fake']


In [47]:
from sklearn.metrics import confusion_matrix
#Matriz muestra que el llm decalara falsas  afirmaciones verdaderas|              
#               | Predicho “real” | Predicho “fake” |
#| ------------ | --------------- | --------------- |
#| **Real (V)** | 112             | 2               |
#| **Fake (F)** | 80              | 6               |

cm = confusion_matrix(df_sample["mapped"], df_sample["predicted"], labels=["real","fake"])
print(cm)
df_sample[df_sample["mapped"] != df_sample["predicted"]][
  ["statement","mapped","predicted","score"]
].head(10)

[[112   2]
 [ 80   6]]


Unnamed: 0,statement,mapped,predicted,score
0,"In the event of a U.S. strike on Syria, the Ru...",fake,real,0.982898
2,Its warmer on Mars than it is in parts of the ...,fake,real,0.515759
4,He's promising four more years of an administr...,fake,real,0.783482
6,Theres no plan from Senate Democrats or the Wh...,fake,real,0.67344
8,About 47 percent of able-bodied people in the ...,fake,real,0.925287
12,Says Lamar Smith sponsored the bill that funde...,fake,real,0.864764
13,Hispanic unemployment has been ticking down fr...,fake,real,0.956958
14,"Romney left Massachusetts ""with a $245-million...",fake,real,0.87813
16,"Ive created over 40,000 jobs.",fake,real,0.923287
19,Wisconsin election officials to accept Mickey ...,fake,real,0.618875


In [49]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = df_sample["mapped"]
y_pred = df_sample["predicted"]

print("Precision:", precision_score(y_true, y_pred, pos_label="real"))
print("Recall   :", recall_score(y_true, y_pred, pos_label="real"))
print("F1-score :", f1_score(y_true, y_pred, pos_label="real"))

Precision: 0.5833333333333334
Recall   : 0.9824561403508771
F1-score : 0.7320261437908496


In [51]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names=["real","fake"]))

              precision    recall  f1-score   support

        real       0.75      0.07      0.13        86
        fake       0.58      0.98      0.73       114

    accuracy                           0.59       200
   macro avg       0.67      0.53      0.43       200
weighted avg       0.66      0.59      0.47       200



In [53]:
import gradio as gr

def fact_check(text):
    result = classifier(
        sequences=text,
        candidate_labels=["Real","Fake"],
        multi_label=False
    )
    label     = result["labels"][0].lower()
    confidence= result["scores"][0]
    return label, confidence


demo = gr.Interface(
    fn=fact_check,
    inputs=gr.Textbox(lines=2, placeholder="Escribe tu afirmación…"),
    outputs=[
      gr.Label(num_top_classes=2, label="Prediction"),
      gr.Textbox(label="Confidence")
    ],
    title="TruthLens Fact-Checker",
    description="Introduce una afirmación y recibe veracidad y nivel de confianza"
)

demo.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/gradio/blocks.py", line 2191, in process_api
    data = await self.postprocess_data(block_fn, result["prediction"], state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alejandro/anaconda3/lib/python3.11/site-packages/gradio/blocks.py", line 1918, in postprocess_data
    self.validate_outputs(block_fn, predictions)  # type: ignore
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alejandro/anaconda3/lib/python3