In [3]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# load and setup
column_names = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
    'state_info', 'party_affiliation', 'barely_true_counts',
    'false_counts', 'half_true_counts', 'mostly_true_counts',
    'pants_on_fire_counts', 'context'
]

train_df = pd.read_csv('train.tsv', sep='\t', header=None, names=column_names)
test_df = pd.read_csv('test.tsv', sep='\t', header=None, names=column_names)
valid_df = pd.read_csv('valid.tsv', sep='\t', header=None, names=column_names)

# combine train and valid sets
full_train_df = pd.concat([train_df, valid_df], ignore_index=True)

# feature engineering

# use ratios of history instead of raw counts
def create_ratios(df):
    history_cols = ['barely_true_counts', 'false_counts', 'half_true_counts',
                    'mostly_true_counts', 'pants_on_fire_counts']

    # Ensure numeric
    for col in history_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Total statements
    df['total_history'] = df[history_cols].sum(axis=1)
    df['total_history'] = df['total_history'].replace(0, 1) # Avoid div/0

    # Calculate Ratios
    df['lie_ratio'] = (df['false_counts'] + df['pants_on_fire_counts']) / df['total_history']
    df['truth_ratio'] = (df['mostly_true_counts'] + df['half_true_counts']) / df['total_history']
    return df

full_train_df = create_ratios(full_train_df)
test_df = create_ratios(test_df)

# B. use TextBlob to anaylyze sentiment of articles
def get_sentiment(text):
    return TextBlob(str(text)).sentiment.polarity

def get_subjectivity(text):
    return TextBlob(str(text)).sentiment.subjectivity

full_train_df['sentiment'] = full_train_df['statement'].apply(get_sentiment)
full_train_df['subjectivity'] = full_train_df['statement'].apply(get_subjectivity)

test_df['sentiment'] = test_df['statement'].apply(get_sentiment)
test_df['subjectivity'] = test_df['statement'].apply(get_subjectivity)

# general preprocessing
def combine_text_cols(df):
    return (df['statement'].fillna('') + ' ' +
            df['context'].fillna('') + ' ' +
            df['job_title'].fillna(''))

full_train_df['full_text'] = combine_text_cols(full_train_df)
test_df['full_text'] = combine_text_cols(test_df)

# make all categories equal in side
def group_rare(df, col, top_items):
    return df[col].apply(lambda x: x if x in top_items else 'other')

for col in ['party_affiliation', 'subject']:
    # identify top 10 in the combined training set
    top_items = full_train_df[col].value_counts().nlargest(10).index

    # Apply to both
    full_train_df[col] = group_rare(full_train_df, col, top_items)
    test_df[col] = group_rare(test_df, col, top_items)

# map 6 targets to 2
def map_to_binary(label):
    if label in ['true', 'mostly-true', 'half-true']:
        return 1
    else:
        return 0

# Create X and y
y_train = full_train_df['label'].apply(map_to_binary)
y_test = test_df['label'].apply(map_to_binary)

X_train = full_train_df.drop('label', axis=1)
X_test = test_df.drop('label', axis=1)

# full pipeline construction
text_feature = 'full_text'
categorical_features = ['party_affiliation', 'subject']
numerical_features = [
    'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts',
    'total_history', 'lie_ratio', 'truth_ratio',
    'sentiment', 'subjectivity'
]

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        min_df=3,
        max_df=0.9,
        ngram_range=(1, 2),
        stop_words='english',
        sublinear_tf=True
    ))
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('cat', cat_transformer, categorical_features),
        ('num', num_transformer, numerical_features)
    ])

# hybrid SVM and XGBoost model
clf_linear = LogisticRegression(C=1.0, solver='liblinear', max_iter=5000)

clf_xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

ensemble_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('voting', VotingClassifier(
        estimators=[
            ('lr', clf_linear),
            ('xgb', clf_xgb)
        ],
        voting='soft'
    ))
])

# train/predict
print("Training Ensemble Model on Full Data (Train + Valid)...")
ensemble_model.fit(X_train, y_train)

print("Predicting on Test Set...")
predictions = ensemble_model.predict(X_test)

# results
print("\nResults")
print(f"Accuracy: {accuracy_score(y_test, predictions):.2%}")
print("\nClassification Report")
print(classification_report(y_test, predictions, target_names=['Fake', 'Real']))

Loading datasets...
Combining Train and Validation sets...
Engineering features...
Training Ensemble Model on Full Data (Train + Valid)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Predicting on Test Set...

Results
Accuracy: 74.82%

Classification Report
              precision    recall  f1-score   support

        Fake       0.74      0.65      0.69       553
        Real       0.75      0.82      0.79       714

    accuracy                           0.75      1267
   macro avg       0.75      0.74      0.74      1267
weighted avg       0.75      0.75      0.75      1267

