In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from scipy.stats import zscore
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# nltk.download('punkt')

In [5]:
# import zipfile
# zip_file_path = 'politifact_data.zip'
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall()

In [8]:
poli_df = pd.read_csv('politifact_data.csv')
poli_df.head()

Unnamed: 0,media,when/where,content,label,speaker,documented_time,percentages,check_nums,summaries,article
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,false,Madison Czopek,"October 31, 2023",['0%' '0%' '2%' '7%' '67%' '21%'],[ 5 3 16 54 473 152],"['Haaretz, an Israeli newspaper, said on X tha...",A viral Oct. 28 social media post claimed that...
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,barely-true,Laura Schulte,"October 31, 2023",['12%' '21%' '18%' '19%' '21%' '5%'],[26 45 39 41 44 11],['Although Wisconsin has voted for more Democr...,"In 2016, Wisconsin helped to swing the preside..."
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",false,Ciara O'Rourke,"October 30, 2023",['0%' '0%' '2%' '7%' '67%' '21%'],[ 5 3 16 54 473 152],[],A social media post poised to encourage people...
3,Viral image,"stated on October 27, 2023 in an Instagram post:",Video shows Palestinians pretending to be corp...,false,Ciara O'Rourke,"October 30, 2023",['0%' '1%' '2%' '4%' '62%' '28%'],[ 4 13 35 53 745 336],['This video is 10 years old and shows student...,The Gaza Health Ministry has said the Palestin...
4,Facebook posts,"stated on September 25, 2023 in a Facebook post:",The life span of a wind tower generator lasts ...,false,Loreben Tuquero,"October 30, 2023",['0%' '1%' '4%' '9%' '59%' '23%'],[ 24 50 108 247 1519 594],['A study by energy industry experts showed th...,Let’s clear the air. Do wind turbine component...


In [191]:
shift_df = poli_df[['media', 'label', 'article']]
shift_df.head()

Unnamed: 0,media,label,article
0,Instagram posts,false,A viral Oct. 28 social media post claimed that...
1,Scott Walker,barely-true,"In 2016, Wisconsin helped to swing the preside..."
2,Instagram posts,false,A social media post poised to encourage people...
3,Viral image,false,The Gaza Health Ministry has said the Palestin...
4,Facebook posts,false,Let’s clear the air. Do wind turbine component...


In [192]:
from transformers import pipeline
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    return_all_scores=True
)



In [213]:
# 1 for drift, 0 for non-drift
def sentiment_score(result):
    numerical_scores = [scale[sentiment['label']] * sentiment['score'] for sentiment in result[0]]
    overall_score = sum(numerical_scores)
    return overall_score

def sentiment_shift(article):
    cleaned_text = re.sub(r'\xa0', ' ', article)
    cleaned_text = re.sub(r'\\', '', cleaned_text)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
    cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('utf-8')
    cleaned_text = cleaned_text.strip()
    cleaned_text = re.sub(r'“|”', '"', cleaned_text)
    scale = {
        'positive' : 1,
        'neutral' : 0,
        'negative' : -1
    }
    data = []
    sentences = sent_tokenize(article)
    for sentence in sentences:
        # For now, trim sentence if longer than 512
        if len(sentence) > 512:
            sentence = sentence[:512]
        result = sentiment_score(distilled_student_sentiment_classifier(sentence))
        data.append(result)
    alpha = 0.05
    half = len(data)//2
    first_half = data[:half]
    second_half = data[half:]
    t_statistic, p_value = ttest_ind(first_half, second_half)
    if p_value < alpha:
        return 1
    else:
        return 0

In [198]:
# 1 for drift, 0 for non-drift
def topic_shift(article):
    cleaned_text = re.sub(r'\xa0', ' ', article)
    cleaned_text = re.sub(r'\\', '', cleaned_text)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
    cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('utf-8')
    cleaned_text = cleaned_text.strip()
    cleaned_text = re.sub(r'“|”', '"', cleaned_text)
    scale = {
        'positive' : 1,
        'neutral' : 0,
        'negative' : -1
    }
    data = []
    sentences = sent_tokenize(article)
    vectorizer = CountVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    topic_distribution = lda.transform(dtm)
    dominant_topic_per_document = topic_distribution.argmax(axis=1)
    half = len(dominant_topic_per_document) // 2
    epsilon = 1e-9
    first_half = dominant_topic_per_document[:half]
    second_half = dominant_topic_per_document[half:]
    min_value = min(min(first_half), min(second_half))
    max_value = max(max(first_half), max(second_half))
    histogram1, _ = np.histogram(first_half, bins=np.arange(min_value, max_value + 2))
    histogram2, _ = np.histogram(second_half, bins=np.arange(min_value, max_value + 2))
    contingency_table = np.array([histogram1, histogram2]) + epsilon
    _, p_value, _, _ = chi2_contingency(contingency_table)
    if p_value < 0.05:
        return 1
    else:
        return 0

In [200]:
shift_df['topic_drift'] = shift_df['article'].apply(topic_shift)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shift_df['topic_drift'] = shift_df['article'].apply(topic_shift)


In [221]:
from IPython.display import clear_output
sent_shift = []
for i in range(shift_df.shape[0]):
    result = sentiment_shift(shift_df.iloc[i]['article'])
    sent_shift.append(result)
    if i % 100 == 0:
        print(f'Iteration',i,'is done')
        clear_output(wait=True)

Iteration 25600 is done


In [225]:
shift_df['sentiment_drift'] = pd.Series(sent_shift)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shift_df['sentiment_drift'] = pd.Series(sent_shift)


In [227]:
shift_df.to_csv('context_shift_score.csv')

In [9]:
import zipfile
import os
def zip_file():
    file_to_zip = 'context_shift_score.csv'
    zip_file_name = 'context_shift_score.zip'
    with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(file_to_zip)
    print(f'{file_to_zip} has been zipped to {zip_file_name}')
    
    # Delete csv after zipping
    if os.path.exists(file_to_zip):
        os.remove(file_to_zip)
        print(f'{file_to_zip} has been deleted.')
    else:
        print(f'{file_to_zip} does not exist.')

In [10]:
zip_file()

context_shift_score.csv has been zipped to context_shift_score.zip
context_shift_score.csv has been deleted.
