In [1]:
import csv

import numpy as np
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from textblob import TextBlob
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tab-separated dataset splits, given as paths relative to the working directory
train_path, valid_path, test_path = (
    "data/train.tsv",
    "data/valid.tsv",
    "data/test.tsv",
)

In [3]:
# Verdict string -> binary target.
# Only "true" and "mostly-true" are treated as positive (1); every other
# verdict, including "half-true", is treated as negative (0).
LABEL_MAPPING = {
    "true": 1,
    "mostly-true": 1,
    "half-true": 0,
    "barely-true": 0,
    "false": 0,
    "pants-fire": 0,
}

In [4]:
# Preprocessing shared by the train / valid / test splits
def preprocess_data(file_path):
    """Load one headerless, tab-separated split and return it as a cleaned DataFrame.

    Steps, in order:
      1. Read the file and name its 14 columns.
      2. Map the textual ``label`` column to 0/1 through ``LABEL_MAPPING``.
      3. Replace remaining missing cells with the string ``"unknown"``.
      4. Add a ``sentiment`` column holding the TextBlob polarity of ``statement``.

    Args:
        file_path: Path to a TSV file with no header row and exactly 14 columns.

    Returns:
        pandas.DataFrame with the 14 named columns plus ``sentiment``;
        ``label`` is an int64 column of 0/1.

    Raises:
        ValueError: If any row's label is missing or is not a key of
            ``LABEL_MAPPING``, or if the file does not have 14 columns.
    """
    # Statements contain literal double quotes. With pandas' default quote
    # handling an unbalanced quote swallows the following tabs/newlines and
    # silently merges rows, so quote characters are read as plain text instead.
    # NOTE(review): the file layout looks like the LIAR dataset; re-check the
    # row counts of each split against the published sizes after re-running.
    df = pd.read_csv(file_path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    df.columns = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                  "context1", "context2", "context3", "context4", "context5", "source"]

    # Convert labels to binary (1=True, 0=False). Validate before the blanket
    # fill below: otherwise an unmapped label would become NaN and then the
    # string "unknown", leaving a mixed int/str target column.
    mapped_labels = df["label"].map(LABEL_MAPPING)
    unmapped = mapped_labels.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{int(unmapped.sum())} row(s) in {file_path!r} have labels missing from "
            f"LABEL_MAPPING: {unexpected}"
        )
    df["label"] = mapped_labels.astype("int64")

    # Handle missing values (replace empty cells with "unknown").
    # NOTE(review): this also applies to any numeric column that has gaps,
    # which turns that column into object dtype -- confirm downstream code
    # tolerates that, or fill numeric columns separately.
    df = df.fillna("unknown")

    # Add sentiment score (extra feature): TextBlob polarity of the statement text
    df["sentiment"] = df["statement"].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

    return df


In [5]:
# Apply the same preprocessing to every split, in train / valid / test order
train_df, valid_df, test_df = map(preprocess_data, (train_path, valid_path, test_path))

# Report each split's (rows, columns), then preview the first training rows
print(f"Train Shape: {train_df.shape}, Valid Shape: {valid_df.shape}, Test Shape: {test_df.shape}")
print(train_df.head())

Train Shape: (10240, 15), Valid Shape: (1284, 15), Test Shape: (1267, 15)
           id  label                                          statement  \
0   2635.json      0  Says the Annies List political group supports ...   
1  10540.json      0  When did the decline of coal start? It started...   
2    324.json      1  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json      0  Health care reform legislation is likely to ma...   
4   9028.json      0  The economic turnaround started at the end of ...   

                              subject         speaker                   job  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting               unknown   
4                        economy,jobs   charlie-crist               unknown   

