In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [3]:
# Column names for the dataset TSV files, in file order
# (passed to pd.read_csv through its `names=` argument).
column_names = [
    # identifier and rating
    'id',
    'label',
    # claim text and metadata
    'statement',
    'subject',
    'speaker',
    'speaker_job',
    'state',
    'party',
    # per-rating count columns
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
    # where the statement was made
    'context',
]

In [6]:
# Read the train / validation / test splits. The files are tab-separated and
# the column names are supplied explicitly via `names=`.
train_df, valid_df, test_df = (
    pd.read_csv(tsv_path, sep='\t', names=column_names)
    for tsv_path in (
        'liar_dataset (2)/train.tsv',
        'liar_dataset (2)/valid.tsv',
        'liar_dataset (2)/test.tsv',
    )
)

# Collapse the multi-valued rating into a binary target
def binarize_label(label):
    """Map a rating to a binary class.

    Returns 1 when ``label`` is exactly 'true' or 'mostly-true' and 0 for
    every other value (including missing values and differently-cased text).
    """
    if label in ('true', 'mostly-true'):
        return 1
    return 0

# Attach the binary target column to each split
for split_df in (train_df, valid_df, test_df):
    split_df['binary_label'] = split_df['label'].apply(binarize_label)

# Inputs are the raw statement text; targets are the binary labels
X_train, y_train = train_df['statement'], train_df['binary_label']
X_valid, y_valid = valid_df['statement'], valid_df['binary_label']
X_test, y_test = test_df['statement'], test_df['binary_label']

In [13]:
# Preview the first five training rows
train_df.head(n=5)

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,binary_label
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,0
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,1
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,0
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,0


In [23]:
# TF-IDF text features: vocabulary capped at 5000 terms, English stop words dropped
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the vocabulary and IDF weights on the training statements only,
# then encode those statements as TF-IDF vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Encode the held-out splits with the already-fitted vocabulary so that every
# matrix shares the same columns
X_valid_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(statements) for statements in (X_valid, X_test)
)

# Baseline classifier: logistic regression over the TF-IDF features.
# class_weight='balanced' weights classes inversely to their frequency.
baseline_model = LogisticRegression(class_weight='balanced', max_iter=1000)
baseline_model.fit(X_train_tfidf, y_train)

# Validation predictions: hard 0/1 labels and the predicted probability of class 1
y_valid_pred = baseline_model.predict(X_valid_tfidf)
y_valid_prob = baseline_model.predict_proba(X_valid_tfidf)[:, 1]

# Validation metrics. average='binary' reports precision / recall / F1 for the
# positive class (label 1) only; the per-class and averaged figures come from
# the classification report printed below.
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_precision, valid_recall, valid_f1, _ = precision_recall_fscore_support(
    y_valid,
    y_valid_pred,
    average='binary',
)

print(f"Validation Accuracy: {valid_accuracy:.4f}")
print(f"Validation Precision: {valid_precision:.4f}")
print(f"Validation Recall: {valid_recall:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")
print("\nValidation Classification Report:")
print(classification_report(y_valid, y_valid_pred))

Validation Accuracy: 0.6192
Validation Precision: 0.4340
Validation Recall: 0.5405
Validation F1 Score: 0.4814

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       864
           1       0.43      0.54      0.48       420

    accuracy                           0.62      1284
   macro avg       0.59      0.60      0.59      1284
weighted avg       0.64      0.62      0.63      1284



## Baseline Model Analysis (Logistic Regression with TF-IDF)

The baseline model achieves the following performance metrics:

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| False (0) | 75% | 66% | 70% | 864 |
| True (1) | 43% | 54% | 48% | 420 |
| **Overall (weighted avg)** | 64% | 62% | 63% | 1284 |

### Interpretation:

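- **Accuracy (62%)**: The model correctly classifies 62% of all claims. For reference, always predicting "false" would score about 67% on this split (864 of 1284), so accuracy alone is not a sufficient measure here.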

- **For False Claims**:
  - **Precision (75%)**: When the model predicts a claim is false, it's correct 75% of the time.
  - **Recall (66%)**: The model successfully identifies 66% of all false claims.
  - **F1-Score (70%)**: The harmonic mean of precision and recall, providing a balanced measure of the model's performance on false claims.

- **For True Claims**:
  - **Precision (43%)**: When the model predicts a claim is true, it's correct only 43% of the time.
  - **Recall (54%)**: The model identifies 54% of all true claims.
  - **F1-Score (48%)**: The harmonic mean of precision and recall, showing moderate performance on true claims.

### Key Observations:

1. The model performs better on false claims than true claims, likely due to class imbalance (more false claims in the dataset).

2. The lower precision for true claims indicates a higher rate of false positives - the model tends to incorrectly classify false claims as true.

3. The F1-score provides a single metric that balances precision and recall. The weighted-average F1-score of 63% (not to be confused with the 48% F1 for the true class printed as "Validation F1 Score") will serve as our primary comparison metric for more sophisticated models.

This baseline establishes a minimum performance threshold that our deep learning models should exceed to demonstrate effectiveness.

In [24]:
# Score the held-out test split with the already-fitted baseline model
y_test_pred = baseline_model.predict(X_test_tfidf)

# average='binary' reports precision / recall / F1 for the positive class (label 1) only
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test,
    y_test_pred,
    average='binary',
)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.5825
Test Precision: 0.4315
Test Recall: 0.5612
Test F1 Score: 0.4879

Test Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.59      0.65       818
           1       0.43      0.56      0.49       449

    accuracy                           0.58      1267
   macro avg       0.57      0.58      0.57      1267
weighted avg       0.61      0.58      0.59      1267



## Baseline Model Test Results (Logistic Regression with TF-IDF)

Our baseline logistic regression model with TF-IDF features achieved the following performance metrics on the test set:

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| False (0) | 71% | 59% | 65% | 818 |
| True (1) | 43% | 56% | 49% | 449 |
| **Overall (weighted avg)** | 61% | 58% | 59% | 1267 |

### Key Insights:

- **Accuracy: 58.25%** - The model correctly classifies about 58% of all claims, which is below the 64.6% that always predicting "false" would achieve on this split (818 of 1267).

- **False Claims Performance:**
  - When the model predicts a claim is false, it's right 71% of the time.
  - The model identifies 59% of all false claims in the dataset.
  - F1-score of 65% indicates reasonable but not excellent performance.

- **True Claims Performance:**
  - When the model predicts a claim is true, it's right only 43% of the time.
  - The model identifies 56% of all true claims in the dataset.
  - F1-score of 49% shows the model struggles more with identifying true claims accurately.

- **Class Imbalance Effect:**
  - The test set has 818 false claims and 449 true claims (a 1.8:1 ratio).
  - This imbalance likely contributes to the model's weaker performance on true claims.

### Benchmark Targets for Advanced Models:

Our deep learning approaches should aim to exceed:
- **Overall Accuracy: > 58.25%**
- **F1-Score for True Claims: > 49%**
- **F1-Score for False Claims: > 65%**
- **Weighted-Average F1-Score: > 59%**
- **Weighted-Average Precision: > 61%**

These metrics establish the minimum performance thresholds that more sophisticated models must surpass to demonstrate meaningful improvement.