# Binary SVM

In [3]:
# imports
import csv

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD

In [4]:
# define column names (the .tsv files have no header row)
column_names = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
    'state_info', 'party_affiliation', 'barely_true_counts',
    'false_counts', 'half_true_counts', 'mostly_true_counts',
    'pants_on_fire_counts', 'context'
]


def load_split(path):
    """Read one headerless, tab-separated split into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the .tsv file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns named by `column_names`.
    """
    # Free-text fields can contain stray double-quote characters. With pandas'
    # default quoting an unbalanced quote makes the parser merge the following
    # lines into a single field, silently dropping rows. QUOTE_NONE treats
    # quotes as ordinary characters, so every line stays one record.
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=column_names,
        quoting=csv.QUOTE_NONE,
    )


# load datasets
train_df = load_split('train.tsv')
test_df = load_split('test.tsv')
valid_df = load_split('valid.tsv')

def group_rare_categories(df, column, threshold=10):
    """Return `df[column]` with infrequent values collapsed into 'other'.

    A value counts as infrequent when it occurs fewer than `threshold` times
    in the column. The input frame is left untouched; a new Series is returned.
    """
    frequency = df[column].value_counts()
    is_rare = frequency.lt(threshold)
    rare_values = frequency.index[is_rare.to_numpy()]
    return df[column].replace(rare_values, 'other')

# Collapse infrequent speakers into a single 'other' bucket.
# The set of "common" speakers is learned from the training split only and
# then applied unchanged to every split, so all three frames share the same
# speaker vocabulary and no information from test/valid leaks into it.
MIN_SPEAKER_STATEMENTS = 5  # keep speakers with at least this many training statements

speaker_counts = train_df['speaker'].value_counts()
common_speakers = speaker_counts[speaker_counts >= MIN_SPEAKER_STATEMENTS].index

for split_df in (train_df, test_df, valid_df):
    # Missing speakers are not in `common_speakers` (value_counts drops NaN),
    # so they also end up as 'other'.
    is_common = split_df['speaker'].isin(common_speakers)
    split_df['speaker'] = split_df['speaker'].where(is_common, 'other')


# collapse the six-way labels into two classes: real / fake
def map_to_binary(label):
    """Return 'real' for 'true', 'mostly-true' or 'half-true'; 'fake' otherwise.

    Every value outside those three labels, whatever it is, maps to 'fake'.
    """
    truthful_labels = ('true', 'mostly-true', 'half-true')
    return 'real' if label in truthful_labels else 'fake'

In [5]:
# binary targets derived from the original labels
y_train_binary = train_df['label'].map(map_to_binary)
y_test_binary = test_df['label'].map(map_to_binary)

# feature groups, split by how each column is encoded in the pipeline
text_feature = 'statement'
categorical_features = ['subject', 'speaker', 'party_affiliation']
numerical_features = [
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]

# feature frames: everything except the label, with the count columns as floats
float_counts = {column: float for column in numerical_features}
X_train = train_df.drop(columns='label').astype(float_counts)
X_test = test_df.drop(columns='label').astype(float_counts)

In [6]:
# text branch: TF-IDF over unigrams and bigrams
tfidf_options = dict(
    min_df=5,                # try 1,2,3 etc in grid
    max_df=0.9,              # drop very common words
    ngram_range=(1, 2),
    stop_words='english',
    sublinear_tf=True,
)
text_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf_options)),
])

# categorical branch: fill gaps with 'unknown', then one-hot encode
# (categories unseen during fit are ignored at predict time)
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# numerical branch: fill gaps with 0, then standardise
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# route each feature group to its branch
feature_branches = [
    ('text', text_transformer, text_feature),
    ('cat', cat_transformer, categorical_features),
    ('num', num_transformer, numerical_features),
]
preprocessor = ColumnTransformer(feature_branches)

# preprocessing followed by a linear SVM
svm_classifier = LinearSVC(C=0.01, random_state=31, max_iter=10000)
model_binary = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', svm_classifier),
])

# fit on the training split, then label the test split
predictions_binary = model_binary.fit(X_train, y_train_binary).predict(X_test)

In [7]:
# score the TF-IDF model on the test split, then show the numbers
binary_accuracy = accuracy_score(y_test_binary, predictions_binary)
binary_report = classification_report(y_test_binary, predictions_binary)

print("\nResults")
print(f"Accuracy: {binary_accuracy:.2%}")
print("\nClassification Report")
print(binary_report)


Results
Accuracy: 64.40%

Classification Report
              precision    recall  f1-score   support

        fake       0.67      0.36      0.47       553
        real       0.64      0.86      0.73       714

    accuracy                           0.64      1267
   macro avg       0.65      0.61      0.60      1267
weighted avg       0.65      0.64      0.62      1267



In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

# Text branch for this experiment: raw token counts instead of TF-IDF
text_transformer_cv = Pipeline(steps=[
    ('count_vec', CountVectorizer(stop_words='english'))
])

# Preprocessor reusing the existing categorical and numerical transformers
preprocessor_cv = ColumnTransformer(
    transformers=[
        ('text', text_transformer_cv, text_feature),
        ('cat', cat_transformer, categorical_features),
        ('num', num_transformer, numerical_features)
    ])

# Pipeline: preprocess -> reduce dimensionality with truncated SVD -> linear SVM.
# The n_components and C given here are only starting values; the grid search
# below overrides both.
model_cv_svd = Pipeline(steps=[
    ('preprocessor', preprocessor_cv),
    ('svd', TruncatedSVD(n_components=200, random_state=31)),
    ('classifier', LinearSVC(C=0.01, random_state=31, max_iter=10000))
])

# Parameter grid (2 x 4 x 2 = 16 candidates).
# The C list previously contained 0.001 twice (written as `0.001` and `.001`),
# so a quarter of the fits were exact duplicates. The last entry is assumed to
# have been meant as 0.01, the value the TF-IDF model above uses.
param_grid = {
    'svd__n_components': [350, 400],
    'classifier__C': [0.001, 0.005, 0.0075, 0.01],
    'preprocessor__text__count_vec__ngram_range': [(1, 1), (1, 2)]
}

# 3-fold cross-validated search over the grid, using all available cores
grid_search = GridSearchCV(model_cv_svd, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train_binary)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [9]:
# Summarise what the grid search selected
print("Best Parameters:", grid_search.best_params_)
print(f"Best CV Score: {grid_search.best_score_:.4f}")

# Label the test split with the best estimator found by the search
best_model = grid_search.best_estimator_
predictions_best = best_model.predict(X_test)

# Score those predictions, then show the numbers
best_accuracy = accuracy_score(y_test_binary, predictions_best)
best_report = classification_report(y_test_binary, predictions_best)

print("\nTest Set Evaluation")
print(f"Accuracy: {best_accuracy:.2%}")
print("\nClassification Report")
print(best_report)

Best Parameters: {'classifier__C': 0.005, 'preprocessor__text__count_vec__ngram_range': (1, 1), 'svd__n_components': 400}
Best CV Score: 0.6325

Test Set Evaluation
Accuracy: 64.40%

Classification Report
              precision    recall  f1-score   support

        fake       0.64      0.42      0.51       553
        real       0.65      0.82      0.72       714

    accuracy                           0.64      1267
   macro avg       0.64      0.62      0.61      1267
weighted avg       0.64      0.64      0.63      1267

