In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

## Liar-Liar-Plus model
This notebook is a hackathon on the Liar-Liar-Plus dataset. It combines some factors like credibility and political affiliation. It also did distillation by using pre-trained sentiment-scoring model to score the justification text and used as another factor in the model. The final model also included a Bert text-embedding of the statements which got truncated to the first 50 to lower its weight. The output contains a GridSearchCV and the best accuracy model and its parameters. 

In [2]:
tsv_file_path = '../data/LIAR-PLUS/dataset/tsv/train2.tsv'
column_names = ["index", "ID", "label", "statement", "subject", 'speaker', 'speaker_job_title', 'state_info', 'party', 'barely_true_count', 'false_count', 'half_true_count', 'mostly_true_count', 'pants_on_fire_count', 'context', 'justification']
train_df = pd.read_csv(tsv_file_path, sep='\t', header=None, names=column_names)

In [3]:
tsv_file_path = '../data/LIAR-PLUS/dataset/tsv/val2.tsv'
column_names = ["index", "ID", "label", "statement", "subject", 'speaker', 'speaker_job_title', 'state_info', 'party', 'barely_true_count', 'false_count', 'half_true_count', 'mostly_true_count', 'pants_on_fire_count', 'context', 'justification'] 
val_df = pd.read_csv(tsv_file_path, sep='\t', header=None, names=column_names)

In [4]:
tsv_file_path = '../data/LIAR-PLUS/dataset/tsv/test2.tsv'
column_names = ["index", "ID", "label", "statement", "subject", 'speaker', 'speaker_job_title', 'state_info', 'party', 'barely_true_count', 'false_count', 'half_true_count', 'mostly_true_count', 'pants_on_fire_count', 'context', 'justification'] 
test_df = pd.read_csv(tsv_file_path, sep='\t', header=None, names=column_names)

In [5]:
train_df.shape, val_df.shape, test_df.shape

((10242, 16), (1284, 16), (1267, 16))

In [6]:
def format_name(name):
    if isinstance(name, str):
        parts = name.split('-')
        formatted_name = ' '.join(part.capitalize() for part in parts)
        return formatted_name
    else:
        return None

train_df['speaker'] = train_df['speaker'].apply(format_name)

In [7]:
cred_df = pd.read_csv('../data/credibility.csv').drop(columns = 'Unnamed: 0').drop_duplicates(subset='source')
merged_df = pd.merge(train_df, cred_df, left_on='speaker', right_on='source', how='left').drop_duplicates()
# Impute credibility score with historical data in this dataset, no true count
standard = ['mostly_true_count','half_true_count', 'barely_true_count','false_count','pants_on_fire_count']
score = np.array([8, 6, 4, 2, 0])
proportions = merged_df[standard].div(merged_df[standard].sum(axis=1), axis=0)
weighted_values = proportions * score
merged_df['credibility_score'] = merged_df['credibility_score'].fillna(weighted_values.sum(axis=1))
merged_df['party'] = merged_df['party'].replace(np.nan, 'none')
merged_df['statement'] = merged_df['statement'].replace(np.nan, 'none')
merged_df = merged_df.dropna(subset=['label'])

In [8]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize
import re
import math
import torch
from IPython.display import clear_output
def sentimentize_just(merged_df):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    distilled_student_sentiment_classifier = pipeline(
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
        return_all_scores=True,
        device=device
    )

    scale = {
        'positive': 1,
        'neutral': 0,
        'negative': -1
    }

    def sentiment_score(result):
        numerical_scores = [scale[sentiment['label']] * sentiment['score'] for sentiment in result[0]]
        overall_score = sum(numerical_scores)
        return overall_score

    def sentiment_shift(article):
        cleaned_text = re.sub(r'\\', '', article)
        cleaned_text = re.sub(r'\n', ' ', cleaned_text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
        cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('utf-8')
        cleaned_text = cleaned_text.strip()
        cleaned_text = re.sub(r'“|”', '"', cleaned_text)

        data = []
        sentences = sent_tokenize(article)
        for sentence in sentences:
            # For now, trim sentence if longer than 512
            if len(sentence) > 512:
                sentence = sentence[:512]
            result = sentiment_score(distilled_student_sentiment_classifier(sentence))
            data.append(result)
        return data

    result = []
    for i in range(merged_df.shape[0]):
        just = merged_df.iloc[i]['justification']
        if isinstance(just, float) and math.isnan(just):
            result.append([0])
            continue
        result.append(sentiment_shift(just))
        if i % 100 == 0:
            print(f'Iteration {i} is done')
            clear_output(wait=True)
    just_sent = [np.mean(lst) for lst in result]
    return just_sent

2023-12-11 20:41:34.373444: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation and not char.isdigit()])
    tokens = nltk.word_tokenize(text)    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [10]:
merged_df['sentiment_just'] = sentimentize_just(merged_df)
merged_df['statement'] = merged_df['statement'].apply(preprocess_text)

Iteration 10200 is done


In [11]:
party_mapping = {
    'republican': 0,
    'democrat': 1,
}

# Define a function to apply the mapping
def map_party(label):
    return party_mapping.get(label, 2)  # If label is not in the mapping, return the original label
X_train = merged_df[['statement', 'party', 'sentiment_just', 'credibility_score']]
y_train = merged_df['label']
X_train['party'] = X_train['party'].apply(map_party)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['party'] = X_train['party'].apply(map_party)


In [12]:
# Transforming validation set in the same way as training data
def transform_val(test_df):
    test_df['speaker'] = test_df['speaker'].apply(format_name)
    cred_df = pd.read_csv('../data/credibility.csv').drop(columns = 'Unnamed: 0').drop_duplicates(subset='source')
    merged_df = pd.merge(test_df, cred_df, left_on='speaker', right_on='source', how='left').drop_duplicates()
    # Impute credibility score with historical data in this dataset, no true count
    standard = ['mostly_true_count','half_true_count', 'barely_true_count','false_count','pants_on_fire_count']
    score = np.array([8, 6, 4, 2, 0])
    proportions = merged_df[standard].div(merged_df[standard].sum(axis=1), axis=0)
    weighted_values = proportions * score
    merged_df['credibility_score'] = merged_df['credibility_score'].fillna(weighted_values.sum(axis=1))
    merged_df['party'] = merged_df['party'].replace(np.nan, 'none')
    merged_df['statement'] = merged_df['statement'].replace(np.nan, 'none')
    merged_df = merged_df.dropna(subset=['label'])
    merged_df['statement'] = merged_df['statement'].apply(preprocess_text)
    merged_df['sentiment_just'] = sentimentize_just(merged_df)
    return merged_df
merged_val = transform_val(val_df)
X_val = merged_val[['statement', 'party', 'sentiment_just', 'credibility_score']]
X_val['party'] = X_val['party'].apply(map_party)
y_val = merged_val['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val['party'] = X_val['party'].apply(map_party)


In [14]:
# Transforming test set in the same way as training data
def transform_test(test_df):
    test_df['speaker'] = test_df['speaker'].apply(format_name)
    cred_df = pd.read_csv('../data/credibility.csv').drop(columns = 'Unnamed: 0').drop_duplicates(subset='source')
    merged_df = pd.merge(test_df, cred_df, left_on='speaker', right_on='source', how='left').drop_duplicates()
    # Impute credibility score with historical data in this dataset, no true count
    standard = ['mostly_true_count','half_true_count', 'barely_true_count','false_count','pants_on_fire_count']
    score = np.array([8, 6, 4, 2, 0])
    proportions = merged_df[standard].div(merged_df[standard].sum(axis=1), axis=0)
    weighted_values = proportions * score
    merged_df['credibility_score'] = merged_df['credibility_score'].fillna(weighted_values.sum(axis=1))
    merged_df['party'] = merged_df['party'].replace(np.nan, 'none')
    merged_df['statement'] = merged_df['statement'].replace(np.nan, 'none')
    merged_df = merged_df.dropna(subset=['label'])
    merged_df['statement'] = merged_df['statement'].apply(preprocess_text)
    merged_df['sentiment_just'] = sentimentize_just(merged_df)
    return merged_df
merged_test = transform_test(test_df)
X_test = merged_test[['statement', 'party', 'sentiment_just', 'credibility_score']]
X_test['party'] = X_test['party'].apply(map_party)
y_test = merged_test['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['party'] = X_test['party'].apply(map_party)


In [15]:
# Store the data to save time and GPU
merged_df.to_csv('../data/processsed_liar_train.csv', index = None)
merged_test.to_csv('../data/processsed_liar_test.csv', index = None)
merged_val.to_csv('../data/processsed_liar_val.csv', index = None)

In [16]:
party_mapping = {
    'republican': 0,
    'democrat': 1,
}

# Define a function to apply the mapping
def map_party(label):
    return party_mapping.get(label, 2)

# Train set
merged_df = pd.read_csv('../data/processsed_liar_train.csv')
X_train = merged_df[['statement', 'party', 'sentiment_just', 'credibility_score']]
y_train = merged_df['label']
X_train['party'] = X_train['party'].apply(map_party)
X_train = X_train.copy()  # Create a copy to avoid SettingWithCopyWarning

# Validation set
merged_val = pd.read_csv('../data/processsed_liar_val.csv')
X_val = merged_val[['statement', 'party', 'sentiment_just', 'credibility_score']]
X_val['party'] = X_val['party'].apply(map_party)
X_val = X_val.copy()  # Create a copy to avoid SettingWithCopyWarning
y_val = merged_val['label']

# Test set
merged_test = pd.read_csv('../data/processsed_liar_test.csv')
X_test = merged_test[['statement', 'party', 'sentiment_just', 'credibility_score']]
X_test['party'] = X_test['party'].apply(map_party)
X_test = X_test.copy()  # Create a copy to avoid SettingWithCopyWarning
y_test = merged_test['label']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['party'] = X_train['party'].apply(map_party)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val['party'] = X_val['party'].apply(map_party)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['party'] = X_test['party'].apply(map_party)


In [17]:
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
import torch

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(data):
    tokens = tokenizer(data['statement'].tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
    return embeddings

# Can change to 128 or other batch size
batch_size = 64
num_samples = len(X_train)
num_batches = (num_samples + batch_size - 1) // batch_size

X_train_embeddings_list = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_data = X_train.iloc[start_idx:end_idx]
    batch_embeddings = get_bert_embeddings(batch_data)
    X_train_embeddings_list.append(batch_embeddings)

X_train_embeddings = torch.cat(X_train_embeddings_list, dim=0).cpu().numpy()

# Preprocess val set
def process_val_data(data):
    tokens = tokenizer(data['statement'].tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
    return embeddings

num_samples_val = len(X_val)
num_batches_val = (num_samples_val + batch_size - 1) // batch_size

X_val_embeddings_list = []

for i in range(num_batches_val):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_data = X_val.iloc[start_idx:end_idx]
    batch_embeddings = process_val_data(batch_data)
    X_val_embeddings_list.append(batch_embeddings)

X_val_embeddings = torch.cat(X_val_embeddings_list, dim=0).cpu().numpy()

# Preprocess test set
def process_test_data(data):
    tokens = tokenizer(data['statement'].tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
    return embeddings

num_samples_test = len(X_test)
num_batches_test = (num_samples_test + batch_size - 1) // batch_size

X_test_embeddings_list = []

for i in range(num_batches_test):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_data = X_test.iloc[start_idx:end_idx]
    batch_embeddings = process_test_data(batch_data)
    X_test_embeddings_list.append(batch_embeddings)

X_test_embeddings = torch.cat(X_test_embeddings_list, dim=0).cpu().numpy()

X_train_combined = np.concatenate((X_train_embeddings[:, :50], X_train[['party', 'sentiment_just', 'credibility_score']].values), axis=1)
X_val_combined = np.concatenate((X_val_embeddings[:, :50], X_val[['party', 'sentiment_just', 'credibility_score']].values), axis=1)
X_test_combined = np.concatenate((X_test_embeddings[:, :50], X_test[['party', 'sentiment_just', 'credibility_score']].values), axis=1)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
classifiers = {
    'RandomForest': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
    'GradientBoosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
    'SVM': (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})
}
best_model = None
best_accuracy = 0
for name, (classifier, param_grid) in classifiers.items():
    clf = GridSearchCV(classifier, param_grid, cv=3, scoring='accuracy')
    clf.fit(X_train_combined, y_train)
    y_pred = clf.predict(X_val_combined)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'{name} with Best Parameters: {clf.best_params_},and Accuracy: {accuracy} on validation')
    
    y_pred_test = clf.predict(X_test_combined)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    
    print(f'{name} with Accuracy: {accuracy_test} on test')
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = clf.best_estimator_
y_pred_best = best_model.predict(X_val_combined)
accuracy_best = accuracy_score(y_val, y_pred_best)
print(f'Best Model - Accuracy: {accuracy_best}')

RandomForest with Best Parameters: {'n_estimators': 200},and Accuracy: 0.3909657320872274 on validation
RandomForest with Accuracy: 0.36937647987371747 on test
GradientBoosting with Best Parameters: {'n_estimators': 100},and Accuracy: 0.39953271028037385 on validation
GradientBoosting with Accuracy: 0.3709550118389897 on test
SVM with Best Parameters: {'C': 10, 'kernel': 'rbf'},and Accuracy: 0.39797507788161995 on validation
SVM with Accuracy: 0.3756906077348066 on test
Best Model - Accuracy: 0.39953271028037385


In [19]:
y_pred_test = best_model.predict(X_test_combined)
accuracy_best = accuracy_score(y_test, y_pred_test)
accuracy_best

0.3709550118389897