### Setup

In [1]:
import string
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import csv
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import (BertForSequenceClassification, BertTokenizer,
                          RobertaForSequenceClassification, RobertaTokenizer,
                          XLMForSequenceClassification, XLMTokenizer,
                          XLNetForSequenceClassification, XLNetTokenizer,
                          AlbertForSequenceClassification, AlbertTokenizer,
                          AdamW, get_linear_schedule_with_warmup
                          )

import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shared NLTK normalisers, created once here and reused by clean_text()
# for every token it processes.
stemmer, lemmatizer = SnowballStemmer('english'), WordNetLemmatizer()

def encode_label(label:str):
    """Translate a truthfulness label into its integer class id.

    The id is the position of the label in ``ordered_labels`` below
    ('true' -> 0 ... 'pants-fire' -> 5). Anything that equals none of the
    known labels is mapped to -1.
    """
    ordered_labels = (
        'true',
        'mostly-true',
        'barely-true',
        'half-true',
        'false',
        'pants-fire',
    )
    for class_id, known_label in enumerate(ordered_labels):
        if label == known_label:
            return class_id
    return -1

def load_df(file_path:str, is_plus:bool):
    """Read a header-less, tab-separated statement file into a DataFrame.

    Only file columns 2, 3, 5, 14 and 15 are loaded; they are renamed to
    ``target``, ``headline``, ``speaker``, ``context`` and ``justification``.
    Rows with a missing value in ANY of those five columns are dropped, and
    this happens before the ``is_plus`` selection, so a missing justification
    removes the row even when the justification column is not returned.

    Args:
        file_path: Location of the TSV file.
        is_plus: If ``False`` the ``justification`` column is left out of
            the result; otherwise all five columns are returned.

    Returns:
        DataFrame whose ``target`` column has been passed through
        ``encode_label``.
    """
    column_names = {2:'target', 3:'headline', 5:'speaker' , 14:'context', 15:'justification'}

    raw = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        quoting=csv.QUOTE_NONE,
        usecols=[2,3,5,14,15],
    )
    df = raw.dropna().rename(columns=column_names)

    if is_plus == False:
        # Same rows, minus the justification text.
        df = df[['target', 'headline', 'speaker', 'context']]

    df['target'] = df['target'].apply(encode_label)
    return df

# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once per kernel instead of once per cleaned document.
_STOP_WORDS_BY_LANGUAGE = {}

def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only on first use."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]

def clean_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; delete runs of digits; turn newlines into
    spaces; delete every character that is neither a word character nor
    whitespace; tokenize with ``word_tokenize``; drop English stop words;
    Snowball-stem and then WordNet-lemmatize each remaining token.

    Args:
        text: The string to clean. Values without a ``.lower()`` method
            (for example a float NaN from pandas) are not handled and raise
            ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # A newline is replaced by a space, not by '': deleting it outright glued
    # the last word of a line to the first word of the following line.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation / symbols

    stop_words = _get_stop_words('english')
    words = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(words)

### Loading Data...

In [3]:
# Training split loaded with is_plus=True, i.e. the justification column is kept.
lp_train = load_df(file_path='../data/liar_plus/train2.tsv', is_plus=True)
lp_train.head(n=3)

Unnamed: 0,target,headline,speaker,context,justification
0,4,Says the Annies List political group supports ...,dwayne-bohac,a mailer,That's a premise that he fails to back up. Ann...
1,3,When did the decline of coal start? It started...,scott-surovell,a floor speech.,"""Surovell said the decline of coal """"started w..."
2,1,"""Hillary Clinton agrees with John McCain """"by ...",barack-obama,Denver,"""Obama said he would have voted against the am..."


In [4]:
# One row per distinct article; 'when/where' is renamed to 'context'.
pf = (
    pd.read_csv('../data/politifact_plus.csv')
    .drop_duplicates('article')
    .rename(columns={'when/where':'context'})
)

# Encode the labels and discard unrecognised ones (-1) BEFORE cleaning the
# text, so clean_text() is not run on rows that are thrown away anyway.
pf['target'] = pf['target'].apply(encode_label)
pf = pf[pf['target'] != -1].copy()  # explicit copy: new columns are added below

pf['clean_headline'] = pf['headline'].apply(clean_text)
pf['clean_article'] = pf['article'].apply(clean_text)
pf['clean_summaries'] = pf['summaries'].apply(clean_text)
# Summaries that clean down to nothing are stored as NaN rather than ''.
pf['clean_summaries'] = pf['clean_summaries'].replace('', np.nan)

pf.head(3)

Unnamed: 0,source,context,headline,target,speaker,documented_time,author_score,summaries,article,src_true,src_mostly_true,src_half_true,src_mostly_false,src_false,src_pants_on_fire,clean_headline,clean_article,clean_summaries
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,4,Madison Czopek,"October 31, 2023",[ 5 3 16 54 473 152],"['Haaretz, an Israeli newspaper, said on X tha...",A viral Oct. 28 social media post claimed that...,5.0,3.0,16.0,54.0,480.0,157.0,haaretz investig reveal discrep israel report ...,viral oct social medium post claim israel lie ...,haaretz isra newspap said x claim report blata...
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,2,Laura Schulte,"October 31, 2023",[26 45 39 41 44 11],['Although Wisconsin has voted for more Democr...,"In 2016, Wisconsin helped to swing the preside...",26.0,45.0,39.0,41.0,44.0,11.0,wisconsin histor think larg continu blue state,wisconsin help swing presidenti vote donald tr...,although wisconsin vote democrat presidenti ca...
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",4,Ciara O'Rourke,"October 30, 2023",[ 5 3 16 54 473 152],[],A social media post poised to encourage people...,5.0,3.0,16.0,54.0,480.0,157.0,airport salzburg austria counter peopl flew au...,social medium post poi encourag peopl unfortun...,


In [5]:
# Drop the three raw text columns; their cleaned counterparts
# (clean_headline / clean_article / clean_summaries) stay in the frame.
pf_clean = pf.drop(['headline', 'article', 'summaries'], axis=1)
pf_clean.head(n=3)

Unnamed: 0,source,context,target,speaker,documented_time,author_score,src_true,src_mostly_true,src_half_true,src_mostly_false,src_false,src_pants_on_fire,clean_headline,clean_article,clean_summaries
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",4,Madison Czopek,"October 31, 2023",[ 5 3 16 54 473 152],5.0,3.0,16.0,54.0,480.0,157.0,haaretz investig reveal discrep israel report ...,viral oct social medium post claim israel lie ...,haaretz isra newspap said x claim report blata...
1,Scott Walker,"stated on May 30, 2023 in Interview:",2,Laura Schulte,"October 31, 2023",[26 45 39 41 44 11],26.0,45.0,39.0,41.0,44.0,11.0,wisconsin histor think larg continu blue state,wisconsin help swing presidenti vote donald tr...,although wisconsin vote democrat presidenti ca...
2,Instagram posts,"stated on October 27, 2023 in a post:",4,Ciara O'Rourke,"October 30, 2023",[ 5 3 16 54 473 152],5.0,3.0,16.0,54.0,480.0,157.0,airport salzburg austria counter peopl flew au...,social medium post poi encourag peopl unfortun...,


### Topic Modeling

In [6]:
# Plain Python list of the cleaned article texts, in pf_clean row order.
clean_articles = pf_clean['clean_article'].to_list()

In [7]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance

# Named representation models handed to BERTopic: a bare KeyBERTInspired under
# "Main", and two lists of models under "POS" and "Key_High".
representation_model = dict(
    Main=KeyBERTInspired(),
    POS=[PartOfSpeech("en_core_web_sm"), MaximalMarginalRelevance(diversity=0.6)],
    Key_High=[KeyBERTInspired(top_n_words=15), MaximalMarginalRelevance(diversity=0.8)],
)

# NOTE(review): no random seed is passed anywhere in this cell — confirm
# whether topic ids are stable between runs before hard-coding any of them.
topic_model = BERTopic(
    language="english",
    min_topic_size=20,
    representation_model=representation_model,
    verbose=True,
)
topics, probs = topic_model.fit_transform(clean_articles)

2024-02-07 04:11:17,156 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 733/733 [00:50<00:00, 14.62it/s]
2024-02-07 04:12:10,351 - BERTopic - Embedding - Completed ✓
2024-02-07 04:12:10,352 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-07 04:12:40,619 - BERTopic - Dimensionality - Completed ✓
2024-02-07 04:12:40,620 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-07 04:12:42,428 - BERTopic - Cluster - Completed ✓
2024-02-07 04:12:42,433 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-07 04:13:51,639 - BERTopic - Representation - Completed ✓


In [8]:
# Document-level table returned by the fitted model (one row per entry of
# clean_articles); only the first five rows are displayed.
tf = topic_model.get_document_info(clean_articles)
tf.head(n=5)

Unnamed: 0,Document,Topic,Name,Representation,POS,Key_High,Representative_Docs,Top_n_words,Probability,Representative_document
0,viral oct social medium post claim israel lie ...,77,77_gaza_palestinian_israel_israelhama,"[gaza, palestinian, israel, israelhama, hama, ...","[palestinian, isra, gaza, video, post, meta, a...","[gaza, israelhama, hama, isra, militari, qatar...",[israel prepar ground incurs gaza week oct att...,gaza - palestinian - israel - israelhama - ham...,0.519364,False
1,wisconsin help swing presidenti vote donald tr...,0,0_wisconsin_walker_governor_madison,"[wisconsin, walker, governor, madison, democra...","[wisconsin, walker, state, milwauke, elect, pu...","[wisconsin, walker, governor, madison, republi...",[wisconsin help swing presidenti vote donald t...,wisconsin - walker - governor - madison - demo...,1.0,True
2,social medium post poi encourag peopl unfortun...,-1,-1_politifact_obama_republican_bill,"[politifact, obama, republican, bill, democrat...","[claim, octob, state, post, rate, year, tax, t...","[politifact, obama, republican, bill, presid, ...",[argument alway help quot rival seem undermin ...,politifact - obama - republican - bill - democ...,0.0,False
3,gaza health ministri said palestinian death to...,77,77_gaza_palestinian_israel_israelhama,"[gaza, palestinian, israel, israelhama, hama, ...","[palestinian, isra, gaza, video, post, meta, a...","[gaza, israelhama, hama, isra, militari, qatar...",[israel prepar ground incurs gaza week oct att...,gaza - palestinian - israel - israelhama - ham...,1.0,False
4,let clear air wind turbin compon deterior thre...,117,117_wind_windmil_coal_storm,"[wind, windmil, coal, storm, energi, solar, tu...","[wind, energi, turbin, power, grid, generat, t...","[windmil, coal, energi, solar, turbin, gasfir,...",[debat deepwat wind plan build two wind energi...,wind - windmil - coal - storm - energi - solar...,0.98695,False


In [9]:
# Render BERTopic's built-in topic visualisation for the fitted model.
topic_model.visualize_topics()

In [10]:
from scipy.cluster import hierarchy as sch

# Hierarchical topics
def linkage_function(x):
    """Run SciPy single-linkage clustering on ``x`` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)

hierarchical_topics = topic_model.hierarchical_topics(
    clean_articles,
    linkage_function=linkage_function,
)

topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 142/142 [01:07<00:00,  2.12it/s]


**TODO:** Manually review the topic hierarchy above and merge similar topics.

In [11]:
# Topic breakdown per encoded target class.
# The documents come from pf_clean while the classes come from pf; this relies
# on both frames holding the same rows in the same order (pf_clean is pf with
# three columns dropped).
topics_per_class = topic_model.topics_per_class(clean_articles, classes=pf['target'])
# Only the first 10 topics are requested for the plot (top_n_topics=10).
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)

0it [00:00, ?it/s]

6it [00:36,  6.03s/it]


### Entity Extraction

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Tokenizer and token-classification weights are both loaded from the same
# "dslim/bert-base-NER" checkpoint, then wrapped in a "ner" pipeline.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
)

In [13]:
# Try the NER pipeline on the first cleaned article.
# .title() re-capitalises each word, because clean_text() lower-cased the text.
# NOTE(review): the input here is the stemmed, punctuation-free text rather
# than the raw article — presumably that degrades a cased NER model; confirm
# whether the raw text should be used instead.
# NOTE(review): the results are reported per word-piece (the '##...' entries
# in the output) — confirm whether grouped entities are wanted.
ner_results = nlp(clean_articles[0].title())
ner_results

[{'entity': 'B-ORG',
  'score': 0.92359114,
  'index': 21,
  'word': 'Is',
  'start': 67,
  'end': 69},
 {'entity': 'I-ORG',
  'score': 0.6894298,
  'index': 23,
  'word': 'News',
  'start': 72,
  'end': 76},
 {'entity': 'I-ORG',
  'score': 0.87841475,
  'index': 24,
  'word': '##pa',
  'start': 76,
  'end': 78},
 {'entity': 'B-ORG',
  'score': 0.5864559,
  'index': 26,
  'word': 'Ha',
  'start': 80,
  'end': 82},
 {'entity': 'I-ORG',
  'score': 0.9808512,
  'index': 27,
  'word': '##aret',
  'start': 82,
  'end': 86},
 {'entity': 'I-ORG',
  'score': 0.9700665,
  'index': 28,
  'word': '##z',
  'start': 86,
  'end': 87},
 {'entity': 'B-ORG',
  'score': 0.9363211,
  'index': 29,
  'word': 'E',
  'start': 88,
  'end': 89},
 {'entity': 'I-ORG',
  'score': 0.53785574,
  'index': 30,
  'word': '##vid',
  'start': 89,
  'end': 92},
 {'entity': 'I-ORG',
  'score': 0.9174948,
  'index': 31,
  'word': 'Ha',
  'start': 93,
  'end': 95},
 {'entity': 'I-ORG',
  'score': 0.9607635,
  'index': 32,
 

TODO: NER Experiment with Text Retrieval.