# Raw Data

In [None]:
import pandas as pd
# imports for base SVM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

#imports for cleaning data
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# imports for advanced SVM (transforming features from categorical to numerical)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.decomposition import TruncatedSVD

In [None]:
# rename columns for ease
column_names = [
    'id',                # Column 1
    'label',             # Column 2
    'statement',         # Column 3
    'subject',           # Column 4
    'speaker',           # Column 5
    'job_title',         # Column 6
    'state_info',        # Column 7
    'party_affiliation', # Column 8
    'barely_true_counts',# Column 9
    'false_counts',      # Column 10
    'half_true_counts',  # Column 11
    'mostly_true_counts',# Column 12
    'pants_on_fire_counts',# Column 13
    'context'            # Column 14
]

# load datasets
train_df = pd.read_csv('train.tsv', sep='\t', header=None, names=column_names)
valid_df = pd.read_csv('valid.tsv', sep='\t', header=None, names=column_names)
test_df = pd.read_csv('test.tsv', sep='\t', header=None, names=column_names)

# extract x and y for each set, keep only our statement and label (target)
# will add more columns later
X_train = train_df['statement']
y_train = train_df['label']

X_valid = valid_df['statement']
y_valid = valid_df['label']

X_test = test_df['statement']
y_test = test_df['label']

print(f"Training Set: {X_train.shape[0]} samples")
print(f"Validation Set: {X_valid.shape[0]} samples")
print(f"Test Set: {X_test.shape[0]} samples")

print("\nFirst Sample:")
print(f"Label: {y_train.iloc[0]}")
print(f"Statement: {X_train.iloc[0]}")

Data Loaded Successfully!
Training Set: 10240 samples
Validation Set: 1284 samples
Test Set: 1267 samples

First Sample:
Label: false
Statement: Says the Annies List political group supports third-trimester abortions on demand.


In [None]:
# use TF-IDF to reduce common words like "of" and "the"
tfidf = TfidfVectorizer()

# fit and transform data using sklearn
X_train_vectors = tfidf.fit_transform(X_train)
X_test_vectors = tfidf.transform(X_test)

# keeps random state the same, remove this if want "more" random
clf = LinearSVC(random_state=31)

# train and test model
clf.fit(X_train_vectors, y_train)
predictions = clf.predict(X_test_vectors)

# print results
print("\nResults")
print(f"Accuracy: {accuracy_score(y_test, predictions):.2%}")
print("\nClassification Report:")
print(classification_report(y_test, predictions))


Results
Accuracy: 24.23%

Classification Report:
              precision    recall  f1-score   support

 barely-true       0.24      0.23      0.23       212
       false       0.29      0.31      0.30       249
   half-true       0.23      0.23      0.23       265
 mostly-true       0.20      0.21      0.21       241
  pants-fire       0.21      0.16      0.18        92
        true       0.25      0.27      0.26       208

    accuracy                           0.24      1267
   macro avg       0.24      0.23      0.24      1267
weighted avg       0.24      0.24      0.24      1267



1. Overall Accuray - 24.23%

Out of every 100 example news articles, the model predicted the correct label for about 24. Random guessing would be about 16%, so the baseline SVM model is doing decent.

The biggest challenge with having 6 results if the difference between labels like "half-true" and "mostly-true", which could just be the difference of opinions between people. Not surprsing a text-only model struggles with this.



# Cleaned Data

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def clean_text(text):
    # convert all text to lowercase
    text = str(text).lower()

    # replace numbers with 'num'
    text = re.sub(r'\d+', 'num', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize
    words = text.split()

    # Remove Stopwords & Lemmatize

    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(cleaned_words)

In [None]:
X_train_clean = X_train.apply(clean_text)
X_test_clean = X_test.apply(clean_text)

# run pipeline
tfidf_clean = TfidfVectorizer(min_df=3)

# vectorizing data
X_train_vectors_clean = tfidf_clean.fit_transform(X_train_clean)
X_test_vectors_clean = tfidf_clean.transform(X_test_clean)

# training
clf_clean = LinearSVC(random_state=42, max_iter=10000) # Increased max_iter for convergence
clf_clean.fit(X_train_vectors_clean, y_train)

# predicting
predictions_clean = clf_clean.predict(X_test_vectors_clean)

In [None]:
print("\nCleaned results")
print(f"Accuracy: {accuracy_score(y_test, predictions_clean):.2%}")
print("\nClassification Report:")
print(classification_report(y_test, predictions_clean))


Cleaned results
Accuracy: 21.07%

Classification Report:
              precision    recall  f1-score   support

 barely-true       0.18      0.18      0.18       212
       false       0.22      0.22      0.22       249
   half-true       0.24      0.23      0.23       265
 mostly-true       0.19      0.20      0.20       241
  pants-fire       0.19      0.16      0.18        92
        true       0.22      0.23      0.22       208

    accuracy                           0.21      1267
   macro avg       0.21      0.21      0.21      1267
weighted avg       0.21      0.21      0.21      1267



# Advanced SVM (more categories)

In [None]:
# define features we want to use
text_feature = 'statement'
categorical_features = ['subject', 'speaker', 'job_title', 'state_info', 'party_affiliation']
numerical_features = ['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']

# use TF-IDF
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        min_df=3,
        stop_words='english',
        ngram_range=(1, 2),    #
        sublinear_tf=True,
        max_df=0.9             # drop most common tokens
    )),
    ('svd', TruncatedSVD(n_components=500, random_state=31))
])

# convert categorical data using one hot encoding
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# numerical transform, standardized
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# create full pipeline (preprocessing -> training)
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('cat', cat_transformer, categorical_features),
        ('num', num_transformer, numerical_features)
    ])

model_advanced = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(C=0.1,random_state=31, max_iter=100000))
])

# train and predict
X_train_full = train_df.drop('label', axis=1)
X_test_full = test_df.drop('label', axis=1)

# cast both as float
X_train_full[numerical_features] = X_train_full[numerical_features].astype(float)
X_test_full[numerical_features] = X_test_full[numerical_features].astype(float)

model_advanced.fit(X_train_full, y_train)
predictions_adv = model_advanced.predict(X_test_full)

In [2]:
print("\nResults")
print(f"Accuracy: {accuracy_score(y_test, predictions_adv):.2%}")
print("\nClassification Report:")
print(classification_report(y_test, predictions_adv))


--- ADVANCED PIPELINE RESULTS (Text + Metadata) ---


NameError: name 'accuracy_score' is not defined