In [1]:
# Import required libraries
import pandas as pd
import json
from pathlib import Path

# Load the FEVER dataset manually since the evidence field has mixed types.
# The file is read as JSON Lines: one JSON object per line, parsed with the
# json module and collected into a list of records.
data_path = Path('../data/fever.jsonl')

with open(data_path, 'r', encoding='utf-8') as f:
    # Skip blank / whitespace-only lines (e.g. a stray empty line at the end
    # of the file); json.loads raises JSONDecodeError on an empty document.
    fever_data = [json.loads(line) for line in f if line.strip()]

# Convert to DataFrame (built once from the full record list)
df = pd.DataFrame(fever_data)

# Print basic dataset info
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nLabel distribution:")
print(df['label'].value_counts())

# Display a few examples
print("\nSample rows:")
print(df[['claim', 'label', 'verifiable']].head())

Dataset shape: (145449, 5)

Columns: ['id', 'verifiable', 'label', 'claim', 'evidence']

Label distribution:
label
SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: count, dtype: int64

Sample rows:
                                               claim            label  \
0  Nikolaj Coster-Waldau worked with the Fox Broa...         SUPPORTS   
1                 Roman Atwood is a content creator.         SUPPORTS   
2  History of art includes architecture, dance, s...         SUPPORTS   
3                  Adrienne Bailon is an accountant.          REFUTES   
4       System of a Down briefly disbanded in limbo.  NOT ENOUGH INFO   

       verifiable  
0      VERIFIABLE  
1      VERIFIABLE  
2      VERIFIABLE  
3      VERIFIABLE  
4  NOT VERIFIABLE  


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation (80/20 split).
# Stratifying on the label column keeps the class proportions the same in
# both partitions, and the fixed random_state makes the split repeatable.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
train_df, val_df = train_test_split(df, **split_options)

# Row counts for the full frame and each partition.
print(f"Total rows: {df.shape[0]}")
print(f"Training rows: {train_df.shape[0]}")
print(f"Validation rows: {val_df.shape[0]}")

# Normalised label frequencies per partition, to confirm the stratification.
split_reports = (
    ("\nTraining label distribution:", train_df),
    ("\nValidation label distribution:", val_df),
)
for header, split in split_reports:
    print(header)
    print(split['label'].value_counts(normalize=True))

Total rows: 145449
Training rows: 116359
Validation rows: 29090

Training label distribution:
label
SUPPORTS           0.550263
NOT ENOUGH INFO    0.245026
REFUTES            0.204711
Name: proportion, dtype: float64

Validation label distribution:
label
SUPPORTS           0.550258
NOT ENOUGH INFO    0.245033
REFUTES            0.204710
Name: proportion, dtype: float64


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

# Week 1 baseline: TF-IDF features over the claim text fed into a linear SVM.
# Reads train_df / val_df and leaves `model`, `y_pred`, `accuracy` and
# `macro_f1` defined for any later cells.

# --- 1. Data Preparation ---
print("Preparing data...")
X_train = train_df['claim']
y_train = train_df['label']

X_val = val_df['claim']
y_val = val_df['label']

print(f"Training on {len(X_train)} examples.")
print(f"Validating on {len(X_val)} examples.")


# --- 2. Model Pipeline ---
# Step 1: The text-to-number converter
# (max_features=5000 caps the vocabulary size;
#  stop_words='english' ignores common words like 'the', 'is', 'a')
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Step 2: The classifier "brain"
# With dual=True the solver shuffles the data using a pseudo-random generator,
# so random_state is fixed (same seed as the train/validation split) to make
# the reported scores reproducible from run to run.
svm_classifier = LinearSVC(dual=True, max_iter=1000, random_state=42)

# Create the pipeline that does Step 1 then Step 2
model = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', svm_classifier),
])

print("\nModel pipeline created.")


# --- 3. Train the Model ---
# Fitting the pipeline learns the TF-IDF vocabulary and the SVM weights from
# the training claims only; the validation claims are never seen here.
print("\nTraining the baseline model... (This may take 1-2 minutes)")
model.fit(X_train, y_train)
print("Training complete!")


# --- 4. Evaluate the Model ---
print("\nMaking predictions on the validation data...")
y_pred = model.predict(X_val)

# Calculate our scores
# Macro-F1 averages the per-class F1 scores with equal weight, so it is
# reported next to accuracy because the three labels are imbalanced.
accuracy = accuracy_score(y_val, y_pred)
macro_f1 = f1_score(y_val, y_pred, average='macro')

print("\n--- Week 1 Baseline Results ---")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Macro-F1 Score: {macro_f1:.4f}")
print("---------------------------------")

Preparing data...
Training on 116359 examples.
Validating on 29090 examples.

Model pipeline created.

Training the baseline model... (This may take 1-2 minutes)
Training complete!

Making predictions on the validation data...
Training complete!

Making predictions on the validation data...

--- Week 1 Baseline Results ---
Accuracy: 59.88%
Macro-F1 Score: 0.4583
---------------------------------

--- Week 1 Baseline Results ---
Accuracy: 59.88%
Macro-F1 Score: 0.4583
---------------------------------
