In [40]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict

# Column headers for the tab-separated data files, listed in file order.
# They are passed positionally to the reader, so the order here matters.
column_names = [
    # statement identity, rating and text
    'id', 'label', 'statement', 'subject',
    # speaker metadata
    'speaker', 'job_title', 'state_info', 'party_affiliation',
    # per-rating count columns
    # (presumably the speaker's rating history -- confirm against the dataset docs)
    'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts',
    # final column
    'context',
]

# Load the train / validation / test splits; column names are supplied
# explicitly through `names=`.
# NOTE(review): default CSV quoting is in effect. If any statement contains an
# unbalanced double quote, pandas may merge several rows into one -- verify the
# row counts against the dataset's documented sizes (quoting=csv.QUOTE_NONE
# would disable quote handling).
train_df, valid_df, test_df = (
    pd.read_csv(f'data/raw/{split}.tsv', sep='\t', names=column_names)
    for split in ('train', 'valid', 'test')
)

print(f"Training set shape: {train_df.shape}")


Training set shape: (10240, 14)


In [41]:
def create_binary_labels(df):
    """Return a copy of ``df`` restricted to rows with a binary-mappable label.

    ``'true'`` and ``'mostly-true'`` map to 1; ``'false'``, ``'pants-fire'``
    and ``'barely-true'`` map to 0. Rows whose ``label`` is anything else
    (including missing) are dropped. Missing ``statement`` values are replaced
    with the empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``label`` and ``statement`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame (original index preserved) with an added integer
        ``label_binary`` column as the last column.
    """
    label_map = {
        'true': 1,
        'mostly-true': 1,
        'false': 0,
        'pants-fire': 0,
        'barely-true': 0,
    }

    # Labels missing from the map come back as NaN and mark rows to discard.
    mapped = df['label'].map(label_map)
    keep = mapped.notna()

    # Explicit copy: the caller's frame stays untouched and the assignments
    # below operate on an independent object.
    df_copy = df.loc[keep].copy()
    df_copy['label_binary'] = mapped[keep].astype(int)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained form is deprecated and has no effect
    # under pandas Copy-on-Write, which would leave NaN statements behind.
    df_copy['statement'] = df_copy['statement'].fillna('')
    return df_copy

# Derive the binary-label version of every split.
train_binary_df, valid_binary_df, test_binary_df = map(
    create_binary_labels, (train_df, valid_df, test_df)
)

# Report how many rows survived the label filtering in each split.
print("-- Data shapes after converting to binary labels --")
for split_name, split_df in (
    ("Training", train_binary_df),
    ("Validation", valid_binary_df),
    ("Test", test_binary_df),
):
    print(f"{split_name} set shape: {split_df.shape}")

# Class balance of the training split.
print("\n-- Label distribution in training set --")
train_label_counts = train_binary_df['label_binary'].value_counts()
print(train_label_counts)

-- Data shapes after converting to binary labels --
Training set shape: (8126, 15)
Validation set shape: (1036, 15)
Test set shape: (1002, 15)

-- Label distribution in training set --
label_binary
0    4488
1    3638
Name: count, dtype: int64


In [42]:
# # define the feature and target variables
# x_train = train_binary_df['statement']
# y_train = train_binary_df['label_binary']

# x_valid = valid_binary_df['statement']
# y_valid = valid_binary_df['label_binary']

# #create pipeline
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
#     ('clf', LogisticRegression(solver='liblinear', random_state=42))
# ])

# #train the model
# print("\n-- Training the model --")
# pipeline.fit(x_train, y_train)
# print("Model training completed.")

# #make predictions on validation set
# y_pred = pipeline.predict(x_valid)

# #print classification report
# print("\n-- Classification Report on Validation Set --")
# print(classification_report(y_valid, y_pred, target_names=['False', 'True']))

# Features (statement text) and binary target taken from the full training split.
x, y = train_binary_df['statement'], train_binary_df['label_binary']

# Model: TF-IDF over unigrams and bigrams with English stop words removed,
# followed by a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), stop_words='english')),
    ('clf', LogisticRegression(random_state=42, solver='liblinear')),
])

# Stratified 5-fold splitter; shuffling is seeded so the folds are the same
# every time this splitter is used.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Metrics collected during cross-validation. Precision / recall / F1 are
# computed once per class by fixing `pos_label` (0 = False, 1 = True).
# Each key becomes `test_<key>` in the cross_validate result, so the keys
# follow one `<metric>_<class>` pattern with no stray punctuation.
scoring = {
    'accuracy': 'accuracy',
    'precision_false': make_scorer(precision_score, pos_label=0),
    'recall_false': make_scorer(recall_score, pos_label=0),
    'f1_false': make_scorer(f1_score, pos_label=0),
    'precision_true': make_scorer(precision_score, pos_label=1),
    'recall_true': make_scorer(recall_score, pos_label=1),
    'f1_true': make_scorer(f1_score, pos_label=1)
}

# Run cross-validation, collecting every metric defined in `scoring`.
print("Performing 5-fold cross-validation...")
cv_results = cross_validate(
    pipeline, x, y,
    cv=cv, scoring=scoring, return_train_score=False,
)
print("Cross-validation completed.")

# Summarise each metric as mean +/- standard deviation across folds.
print("\n--- Cross-Validation Results (Mean +/- Std Dev) ---")
for metric_name, scores in cv_results.items():
    if not metric_name.startswith('test_'):
        continue  # keep only the test-score entries
    print(f"{metric_name[5:]}: {scores.mean():.3f} +/- {scores.std():.3f}")

# The figures above are per-fold averages. For a single report over all
# training rows, gather the out-of-fold predictions with cross_val_predict and
# score them in one pass.
print("\n--- Aggregated Classfication Report from Cross-Validation ---")
y_pred_cv = cross_val_predict(pipeline, x, y, cv=cv)
print(classification_report(y, y_pred_cv, target_names=['False','True']))


Performing 5-fold cross-validation...
Cross-validation completed.

--- Cross-Validation Results (Mean +/- Std Dev) ---
accuracy: 0.625 +/- 0.007
precision_false:: 0.638 +/- 0.006
recall_false: 0.742 +/- 0.011
f1_false: 0.686 +/- 0.006
precision_true: 0.602 +/- 0.010
recall_true: 0.481 +/- 0.014
f1_true: 0.535 +/- 0.010

--- Aggregated Classfication Report from Cross-Validation ---
              precision    recall  f1-score   support

       False       0.64      0.74      0.69      4488
        True       0.60      0.48      0.53      3638

    accuracy                           0.63      8126
   macro avg       0.62      0.61      0.61      8126
weighted avg       0.62      0.63      0.62      8126



In [43]:
# Improvement 2: Multinomial Naive Bayes classifier
print("---- Training and Evaluation Naive Bayes Model ---")

# Naive Bayes pipeline: same TF-IDF settings as the logistic-regression
# pipeline, with MultinomialNB as the classifier.
nb_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), stop_words='english')),
    ('clf', MultinomialNB()),
])

# The data (x, y), the splitter (cv) and the metric dictionary (scoring) are
# reused from the previous cells.
print("\nPerforming 5-fold cross-validation for Naive Bayes...")
nb_cv_results = cross_validate(
    nb_pipeline, x, y,
    cv=cv, scoring=scoring, return_train_score=False,
)
print("Cross-validation for Naive Bayes completed.")

# Summarise each metric as mean +/- standard deviation across folds.
print("\n--- Naive Bayes Cross-Validation Results (Mean +/- Std Dev) ---")
for metric_name, scores in nb_cv_results.items():
    if not metric_name.startswith('test_'):
        continue  # keep only the test-score entries
    print(f"{metric_name[5:]}: {scores.mean():.3f} +/- {scores.std():.3f}")

# Single report over all training rows, built from out-of-fold predictions.
print("\n--- Naive Bayes Aggregated Classfication Report from Cross-Validation ---")
y_pred_nb_cv = cross_val_predict(nb_pipeline, x, y, cv=cv)
print(classification_report(y, y_pred_nb_cv, target_names=['False','True']))

---- Training and Evaluation Naive Bayes Model ---

Performing 5-fold cross-validation for Naive Bayes...
Cross-validation for Naive Bayes completed.

--- Naive Bayes Cross-Validation Results (Mean +/- Std Dev) ---
accuracy: 0.614 +/- 0.008
precision_false:: 0.608 +/- 0.005
recall_false: 0.845 +/- 0.011
f1_false: 0.707 +/- 0.007
precision_true: 0.632 +/- 0.019
recall_true: 0.328 +/- 0.008
f1_true: 0.432 +/- 0.010

--- Naive Bayes Aggregated Classfication Report from Cross-Validation ---
              precision    recall  f1-score   support

       False       0.61      0.85      0.71      4488
        True       0.63      0.33      0.43      3638

    accuracy                           0.61      8126
   macro avg       0.62      0.59      0.57      8126
weighted avg       0.62      0.61      0.58      8126



In [44]:
# --- Improvement 3: feature analysis for logistic regression ---
print("--- Analyzing Most Signficant Features for Logistic Regression ---")

# Fit the pipeline on the full training data (x and y come from the earlier
# cells) so there is one trained model to inspect.
print("\nRetraining the pipeline on the full training data...")
pipeline.fit(x,y)
print("Retraining completed.")

# Pull the fitted vectorizer and classifier out of the pipeline by step name.
vectorizer, classifier = (
    pipeline.named_steps[step_name] for step_name in ('tfidf', 'clf')
)

# Vocabulary entries (words and bigrams) paired with the first row of the
# classifier's coefficient matrix: one coefficient per feature.
feature_names = vectorizer.get_feature_names_out()
coefficients = classifier.coef_[0]
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# Ascending sort: the most negative coefficients (associated with class 0,
# False) come first and the most positive (class 1, True) come last.
sorted_coef_df = coef_df.sort_values(by='coefficient', ascending=True)

# Show the 15 strongest features at each end of the sorted table.
for class_name, top_rows in (
    ("False", sorted_coef_df.head(15)),
    ("True", sorted_coef_df.tail(15)),
):
    print(f"\n--- Top 15 Features Indicative of '{class_name}' Class ---")
    print(top_rows)


--- Analyzing Most Signficant Features for Logistic Regression ---

Retraining the pipeline on the full training data...
Retraining completed.

--- Top 15 Features Indicative of 'False' Class ---
            feature  coefficient
39676     obamacare    -2.383089
39371         obama    -2.068235
51558          says    -1.674549
43706          plan    -1.568515
64783     wisconsin    -1.507090
48750           rep    -1.487791
36334      medicare    -1.409577
63504        walker    -1.368859
39794        obamas    -1.358984
52698  scott walker    -1.273747
56597      stimulus    -1.198985
46957         raise    -1.147613
12324       clinton    -1.138387
10367          care    -1.136506
64538         white    -1.112952

--- Top 15 Features Indicative of 'True' Class ---
         feature  coefficient
2371          60     1.299957
36518    members     1.337882
4999    american     1.392508
16609       debt     1.457837
37742     months     1.504054
14773    country     1.521157
27535    highe