In [31]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import matplotlib as mpl
import os
import sys
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Datasets

In [19]:

train_path = r"datasets\test.tsv"
test_path = r"datasets\train.tsv"
valid_path = r"datasets\valid.tsv"

In [20]:
train_data = pd.read_csv(train_path, sep='\t', header=None)
test_data = pd.read_csv(test_path, sep='\t', header=None)
valid_data = pd.read_csv(valid_path, sep='\t', header=None)


In [21]:
columns = [
    "id", "label", "statement", "subject", "speaker", 
    "job_title", "state_info", "party_affiliation", 
    "barely_true_counts", "false_counts", "half_true_counts", 
    "mostly_true_counts", "pants_on_fire_counts", "context"
]

train_data.columns = columns
test_data.columns = columns
valid_data.columns = columns

In [22]:
train_data = train_data.dropna(subset=['statement'])
test_data = test_data.dropna(subset=['statement'])
valid_data = valid_data.dropna(subset=['statement'])

In [23]:
print("\nTrain Data Summary Statistics:")
train_data.describe()

print("\nTest Data Summary Statistics:")
test_data.describe()

print("\nValidation Data Summary Statistics:")
valid_data.describe()


print("\nTrain Data Info:")
train_data.info()

print("\nTest Data Info:")
test_data.info()

print("\nValidation Data Info:")
valid_data.info()


Train Data Summary Statistics:

Test Data Summary Statistics:

Validation Data Summary Statistics:

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1267 entries, 0 to 1266
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    1267 non-null   object
 1   label                 1267 non-null   object
 2   statement             1267 non-null   object
 3   subject               1267 non-null   object
 4   speaker               1267 non-null   object
 5   job_title             942 non-null    object
 6   state_info            1005 non-null   object
 7   party_affiliation     1267 non-null   object
 8   barely_true_counts    1267 non-null   int64 
 9   false_counts          1267 non-null   int64 
 10  half_true_counts      1267 non-null   int64 
 11  mostly_true_counts    1267 non-null   int64 
 12  pants_on_fire_counts  1267 non-null   int64 
 13  context             

In [24]:
print(train_data['label'].value_counts())

label
half-true      265
false          249
mostly-true    241
barely-true    212
true           208
pants-fire      92
Name: count, dtype: int64


In [33]:
classes = np.array(["half-true", "false", "mostly-true", "barely-true", "true", "pants-fire"])

# compute class weights because the above classes are imbalanced according to the value counts from the above cell output 
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_data['label']
)

# weights = weights / np.sum(weights) # normalized the weights to sum to 1 
print(dict(zip(classes, weights)))


{'half-true': 0.7968553459119497, 'false': 0.8480589022757697, 'mostly-true': 0.876210235131397, 'barely-true': 0.9960691823899371, 'true': 1.015224358974359, 'pants-fire': 2.295289855072464}


# text preprocessing

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
def preprocess_text(text):
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words and word.isalnum()]
    return " ".join(filtered_text)


train_data['statement'] = train_data['statement'].apply(preprocess_text)
test_data['statement'] = test_data['statement'].apply(preprocess_text)
valid_data['statement'] = valid_data['statement'].apply(preprocess_text)


# tf-idf vectorization

vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_data['statement'])

X_train = vectorizer.transform(train_data['statement']).toarray()
X_test = vectorizer.transform(test_data['statement']).toarray()
X_valid = vectorizer.transform(valid_data['statement']).toarray()

y_train = train_data['label']
y_test = test_data['label']
y_valid = valid_data['label']

print(X_train.shape, X_test.shape, X_valid.shape)
print(y_train.shape, y_test.shape, y_valid.shape)

# save the preprocessed data


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\U765123\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\U765123\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(1267, 4078) (10240, 4078) (1284, 4078)
(1267,) (10240,) (1284,)


# Model Training (Logistic Regression)

In [39]:
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



0.2119140625
              precision    recall  f1-score   support

 barely-true       0.20      0.20      0.20      1654
       false       0.23      0.21      0.22      1995
   half-true       0.23      0.22      0.22      2114
 mostly-true       0.23      0.24      0.23      1962
  pants-fire       0.14      0.14      0.14       839
        true       0.20      0.22      0.21      1676

    accuracy                           0.21     10240
   macro avg       0.20      0.20      0.20     10240
weighted avg       0.21      0.21      0.21     10240




- Precision is about how accurate the model's positive predictions are
- Recall is about how well the model identifies positive cases
- F1-Score balances precision and recall
- Support shows how many instances of each class exist in the data


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = train_data['statement']
y_train = train_data['label']
X_test = test_data['statement']
y_test = test_data['label']


vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


0.21572265625
              precision    recall  f1-score   support

 barely-true       0.20      0.15      0.18      1654
       false       0.23      0.28      0.25      1995
   half-true       0.21      0.32      0.25      2114
 mostly-true       0.23      0.26      0.24      1962
  pants-fire       0.67      0.00      0.00       839
        true       0.20      0.12      0.15      1676

    accuracy                           0.22     10240
   macro avg       0.29      0.19      0.18     10240
weighted avg       0.25      0.22      0.20     10240

[[256 413 540 327   0 118]
 [306 554 573 361   0 201]
 [250 480 681 498   1 204]
 [176 394 684 514   0 194]
 [138 222 275 126   2  76]
 [125 394 506 449   0 202]]
0.21572265625



- In the confusion matrix, the rows represent the actual classes and the columns represent the predicted classes.
- The diagonal elements represent the number of points for which the predicted label is equal to the true label, while off-diagonal elements are those that are mislabeled by the classifier.
- The higher the diagonal values of the confusion matrix the better, indicating many correct predictions.

The results are not very good, but this is expected since we are using a simple logistic regression model with a small dataset.

In [45]:
# Experimenting with different hyperparameters

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000, random_state=42)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print(accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

0.2201171875
              precision    recall  f1-score   support

 barely-true       0.20      0.14      0.17      1654
       false       0.23      0.28      0.25      1995
   half-true       0.22      0.37      0.27      2114
 mostly-true       0.23      0.26      0.24      1962
  pants-fire       0.00      0.00      0.00       839
        true       0.20      0.10      0.13      1676

    accuracy                           0.22     10240
   macro avg       0.18      0.19      0.18     10240
weighted avg       0.20      0.22      0.20     10240



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
# the results are not better than the previous model, so we will stick with the previous model

# retraining the model with these new features
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

 barely-true       0.20      0.09      0.13      1654
       false       0.23      0.23      0.23      1995
   half-true       0.22      0.53      0.31      2114
 mostly-true       0.22      0.21      0.21      1962
  pants-fire       0.00      0.00      0.00       839
        true       0.22      0.05      0.09      1676

    accuracy                           0.22     10240
   macro avg       0.18      0.19      0.16     10240
weighted avg       0.20      0.22      0.18     10240



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix


svm_classifier = SVC(kernel='linear', class_weight='balanced')
svm_classifier.fit(X_train_tfidf, y_train)


y_pred = svm_classifier.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

 barely-true       0.21      0.21      0.21      1654
       false       0.22      0.24      0.23      1995
   half-true       0.22      0.24      0.23      2114
 mostly-true       0.22      0.25      0.24      1962
  pants-fire       0.14      0.05      0.07       839
        true       0.21      0.18      0.19      1676

    accuracy                           0.21     10240
   macro avg       0.20      0.20      0.20     10240
weighted avg       0.21      0.21      0.21     10240

[[353 366 361 332  55 187]
 [399 487 409 363  66 271]
 [326 420 515 490  53 310]
 [264 372 485 500  41 300]
 [186 202 196 116  40  99]
 [193 356 361 441  21 304]]
