In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from scipy.stats import zscore
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import gdown
import torch
# nltk.download('punkt')

In [2]:
# Download the PolitiFact CSV from Google Drive into the local data folder,
# then load it and keep only the three columns the rest of the notebook reads.
POLITIFACT_URL = 'https://drive.google.com/uc?id=1hHS3oUW7H1KdC341LeYAsroaJiWWcnL7'
POLITIFACT_CSV = "../../data/politifact_data.csv"
SHIFT_COLUMNS = ['media', 'article', 'label']

gdown.download(POLITIFACT_URL, POLITIFACT_CSV, quiet=True)
shift_df = pd.read_csv(POLITIFACT_CSV)[SHIFT_COLUMNS]

In [3]:
# sentiment_score returns a signed float; sentiment_shift returns 1 for drift, 0 for non-drift
# Signed weight applied to each label's score when the per-label classifier
# output is collapsed into a single number.
SENTIMENT_SCALE = {
    'positive' : 1,
    'neutral' : 0,
    'negative' : -1
}

# Classifier instances keyed by device name, so the model is built once per
# kernel session instead of once per article.
_SENTIMENT_CLASSIFIERS = {}


def sentiment_score(result, scale=None):
    """Collapse one classifier result into a single signed sentiment score.

    Parameters
    ----------
    result : sequence
        ``result[0]`` must be an iterable of dicts that each carry a
        ``'label'`` and a ``'score'`` key.
    scale : dict, optional
        Maps each label to its numeric weight. Defaults to
        ``SENTIMENT_SCALE`` (positive=1, neutral=0, negative=-1).

    Returns
    -------
    The sum over labels of ``scale[label] * score``.

    Raises
    ------
    KeyError
        If a label in ``result[0]`` is missing from ``scale``.
    """
    # The previous version read `scale` as a global although it only existed
    # as a local inside sentiment_shift, which raised NameError.
    if scale is None:
        scale = SENTIMENT_SCALE
    return sum(scale[entry['label']] * entry['score'] for entry in result[0])


def _get_sentiment_classifier():
    """Return the shared sentiment pipeline, building it on first use."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cache_key = str(device)
    if cache_key not in _SENTIMENT_CLASSIFIERS:
        # NOTE(review): `pipeline` is not imported in the visible notebook --
        # presumably `from transformers import pipeline`; confirm and add it
        # to the import cell, otherwise this line raises NameError.
        _SENTIMENT_CLASSIFIERS[cache_key] = pipeline(
            model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
            return_all_scores=True,
            device=device
        )
    return _SENTIMENT_CLASSIFIERS[cache_key]


def sentiment_shift(article, alpha=0.05):
    """Flag whether sentence-level sentiment differs between article halves.

    The article is cleaned, split into sentences, each sentence is scored
    with ``sentiment_score``, and the scores of the first and second half
    are compared with an independent two-sample t-test.

    Parameters
    ----------
    article : str
        Raw article text.
    alpha : float, optional
        Significance threshold for the t-test (default 0.05).

    Returns
    -------
    int
        1 when the t-test p-value is below ``alpha`` (drift), otherwise 0.
        Articles too short to give two scores per half return 0.
    """
    # Curly quotes must be normalised BEFORE the ASCII strip below, which
    # would otherwise drop them and leave nothing to replace.
    cleaned_text = re.sub(r'“|”', '"', article)
    cleaned_text = re.sub(r'\xa0', ' ', cleaned_text)
    cleaned_text = re.sub(r'\\', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # also folds newlines into single spaces
    cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('utf-8')
    cleaned_text = cleaned_text.strip()

    # Tokenize the cleaned text; the cleaning result used to be discarded.
    sentences = sent_tokenize(cleaned_text)

    # The first half holds len // 2 scores and is never the larger one. With
    # fewer than two scores in it the t-test can only produce NaN, which the
    # old code also mapped to 0 -- skip the model entirely in that case.
    half = len(sentences) // 2
    if half < 2:
        return 0

    classifier = _get_sentiment_classifier()
    # For now, sentences are cut to their first 512 characters before scoring.
    scores = [sentiment_score(classifier(sentence[:512])) for sentence in sentences]

    _, p_value = ttest_ind(scores[:half], scores[half:])
    # A NaN p-value (e.g. zero variance in both halves) compares False -> 0.
    return 1 if p_value < alpha else 0

In [4]:
# 1 for drift, 0 for non-drift
def topic_shift(article, n_topics=5, alpha=0.05):
    """Flag whether the dominant LDA topics differ between article halves.

    The article is cleaned and split into sentences, an LDA model is fitted
    on the sentence bag-of-words matrix, each sentence is assigned its most
    probable topic, and the topic counts of the first and second half are
    compared with a chi-square test of independence.

    Parameters
    ----------
    article : str
        Raw article text.
    n_topics : int, optional
        Number of LDA components (default 5).
    alpha : float, optional
        Significance threshold for the chi-square test (default 0.05).

    Returns
    -------
    int
        1 when the p-value is below ``alpha`` (drift), otherwise 0. Returns 0
        when the article has fewer than two sentences, when no vocabulary
        survives stop-word removal, or when every sentence shares one topic.
    """
    # Curly quotes must be normalised BEFORE the ASCII strip below, which
    # would otherwise drop them and leave nothing to replace.
    cleaned_text = re.sub(r'“|”', '"', article)
    cleaned_text = re.sub(r'\xa0', ' ', cleaned_text)
    cleaned_text = re.sub(r'\\', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # also folds newlines into single spaces
    cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('utf-8')
    cleaned_text = cleaned_text.strip()

    # Tokenize the cleaned text; the cleaning result used to be discarded.
    sentences = sent_tokenize(cleaned_text)
    if len(sentences) < 2:
        # One half would be empty; nothing to compare.
        return 0

    vectorizer = CountVectorizer(stop_words='english')
    try:
        dtm = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the sentences contain only stop words (empty vocabulary).
        return 0

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(dtm)
    dominant_topics = lda.transform(dtm).argmax(axis=1)

    half = len(dominant_topics) // 2
    first_counts = np.bincount(dominant_topics[:half], minlength=n_topics)
    second_counts = np.bincount(dominant_topics[half:], minlength=n_topics)
    contingency_table = np.array([first_counts, second_counts])

    # Drop topics that are dominant in no sentence at all. Keeping them (the
    # old code padded them with a tiny epsilon) added degrees of freedom
    # without adding evidence, which pushed the p-value upwards.
    observed = contingency_table.sum(axis=0) > 0
    contingency_table = contingency_table[:, observed]
    if contingency_table.shape[1] < 2:
        # Every sentence has the same dominant topic: no shift to detect.
        return 0

    _, p_value, _, _ = chi2_contingency(contingency_table)
    return 1 if p_value < alpha else 0

In [5]:
import re
import nltk
from nltk import sent_tokenize, pos_tag, ne_chunk
def perform_ner(text):
    """Return the named-entity subtrees that NLTK finds in ``text``.

    The text is word-tokenized, POS-tagged and chunked with ``ne_chunk``;
    only the chunk elements that are ``nltk.Tree`` instances are returned,
    plain (word, tag) tuples are dropped.
    """
    # `word_tokenize` was never imported by name in this notebook, so it is
    # reached through the already-imported `nltk` package instead.
    words = nltk.word_tokenize(text)
    pos_tags = pos_tag(words)
    named_entities = ne_chunk(pos_tags)
    return [entity for entity in named_entities if isinstance(entity, nltk.Tree)]

def tree_to_string(tree):
    """Flatten an ``nltk.Tree`` (or a single leaf) into a space-joined string.

    A non-Tree argument is treated as a leaf and its first element is
    returned; a Tree is flattened recursively, child by child.
    """
    if not isinstance(tree, nltk.Tree):
        # Leaf: return its first element.
        return tree[0]
    parts = []
    for child in tree:
        parts.append(tree_to_string(child))
    return ' '.join(parts)
def ner_shift(article):
    """Count named entities that appear only in the second half of an article.

    The article is split into sentences, the sentences are divided into two
    halves (the second half gets the extra sentence when the count is odd),
    each half is cleaned and run through ``perform_ner``, and the entity
    strings are compared as sets.

    Returns
    -------
    int
        Number of distinct entity strings found in the second half that do
        not occur in the first half.
    """
    def clean_text(text):
        # Curly quotes must be normalised BEFORE the ASCII strip below, which
        # would otherwise drop them and leave nothing to replace.
        cleaned_text = re.sub(r'“|”', '"', text)
        cleaned_text = re.sub(r'\xa0', ' ', cleaned_text)
        cleaned_text = re.sub(r'\\', '', cleaned_text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
        cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('utf-8')
        return cleaned_text.strip()

    sentences = sent_tokenize(article)
    half_index = len(sentences) // 2
    cleaned_first_half = clean_text(' '.join(sentences[:half_index]))
    cleaned_second_half = clean_text(' '.join(sentences[half_index:]))

    entities_first_half = {tree_to_string(entity) for entity in perform_ner(cleaned_first_half)}
    entities_second_half = {tree_to_string(entity) for entity in perform_ner(cleaned_second_half)}

    # Entities introduced in the second half only.
    return len(entities_second_half - entities_first_half)



In [None]:
from IPython.display import clear_output

# Topic drift flag for every article.
shift_df['topic_drift'] = shift_df['article'].apply(topic_shift)

# Sentiment drift flag for every article, computed in an explicit loop so a
# progress line can be shown every 100 rows.
sent_shift = []
for i, article in enumerate(shift_df['article']):
    sent_shift.append(sentiment_shift(article))
    if i % 100 == 0:
        print('Iteration', i, 'is done')
        # wait=True defers the clear until the next output arrives.
        clear_output(wait=True)
shift_df['sentiment_drift'] = pd.Series(sent_shift)

# Number of entities that first appear in the second half of each article.
shift_df['ner_shift_count'] = shift_df['article'].apply(ner_shift)

In [None]:
# shift_df.to_csv('context_shift_score.csv')