In [1]:
# libraries🔥

# install🔥
%pip install transformers
%pip install torch
%pip install tf-keras # install this because keras needs to work on a previous version of python
%pip install scipy
%pip install pandas
%pip install tensorflow
%pip install -U sentence-transformers
%pip install numpy
%pip install sentencepiece
# %pip install transformers[sentencepiece] # didn't help, don't know if helpful


Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.24.0->transformers)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1

In [2]:
# import🔥
import tensorflow as tf
from transformers import BertTokenizer, BertModel, TFBertModel # for BERT
from transformers import AutoTokenizer, AutoModel # for DeBERTa
from transformers import DebertaV2Tokenizer, AutoModel # does something but need sentencepiece
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import sentencepiece

In [3]:
# read csv
file_path = "/work/Bachelor/all_articles_allsides_abortion.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,textID,words,date,country,source,url,headline,body,leaning
0,91719925,723.0,23-11-02,US,The Boston Globe,https://www.bostonglobe.com/2023/11/02/metro/d...,this band plans to drown out the men's march a...,"kirk israel, an activist musician and member o...",Left
1,91721731,1248.0,23-11-03,US,Fox News,https://www.foxnews.com/politics/virginias-ele...,virginia's elections a key 2024 barometer and ...,youngkin wore a similar red vest two years ago...,Right
2,103177268,1947.0,23-11-03,US,inquisitr.com,https://www.theatlantic.com/ideas/archive/2023...,here's what biden can do to change his grim po...,"whatever your theory, it should take into acco...",Left
3,103177271,1513.0,23-11-03,US,inquisitr.com,https://www.theatlantic.com/ideas/archive/2023...,don't equate anti-zionism with anti-semitism,"on october 7, the islamist militant group hama...",Left
4,103177279,2607.0,23-11-03,US,inquisitr.com,https://www.theatlantic.com/books/archive/2023...,do you have free will?,writing a review is an exercise in free will. ...,Left
...,...,...,...,...,...,...,...,...,...
8090,97824446,1244.0,23-03-28,US,bostonglobe.com,https://bostonglobe.com/2023/03/26/metro/rev-e...,"rev. elinor lockwood yeo, reproductive rights ...","rev. elinor lockwood yeo of newton, a reproduc...",Left
8091,97847400,648.0,23-03-29,US,sfchronicle.com,https://sfchronicle.com/opinion/letterstotheed...,letters: how the country can respond to nashvi...,"enough with the "" thoughts and prayers. "" thre...",Left
8092,97848056,918.0,23-03-29,US,bostonglobe.com,https://bostonglobe.com/2023/03/28/opinion/abo...,the latest antiabortion tactic: asserting the ...,"antiabortion activist trooper elwonger, 25, fe...",Left
8093,90760423,1224.0,23-03-31,US,The Daily Beast,https://www.thedailybeast.com/satan-wants-you-...,inside the horrific (contested) abuse story th...,how did one discredited biography ignite one o...,Left


In [5]:
# not fire

# the one🔥 (years)

# Load tokenizer and model
# DeBERTa-v3-large is used for its advanced contextual understanding
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Define the case-insensitive tokenizer function
def case_insensitive_tokenizer(text):
    """
    Converts the input text to lowercase before tokenizing.
    Ensures consistent tokenization for uppercase and lowercase variations.
    """
    return tokenizer(text.lower())

# Function to extract contextual embedding for a specific keyword using a sliding window approach
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=512):
    """
    Extracts contextual embeddings for a specific keyword using a sliding window approach.
    This handles large texts by breaking them into overlapping chunks to fit the model's input size.

    Parameters:
    - text (str): The full article or text to process.
    - keyword (str): The word to extract embeddings for.
    - max_len (int): Maximum length of tokens per chunk (default is 512 for DeBERTa).
    - stride (int): Number of overlapping tokens between chunks (default is 256).

    Returns:
    - np.array: The averaged contextual embedding for the keyword across all chunks.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()

    # Tokenize the text into overlapping chunks with padding and truncation
    tokens = tokenizer(
        text, 
        return_tensors='pt',  # Return tensors in PyTorch format
        padding=True,  # Pad tokens to max_len
        truncation=True,  # Truncate tokens exceeding max_len
        max_length=max_len,  # Limit chunk size to max_len (512 for most models)
        stride=stride,  # Define the overlap between chunks
        return_overflowing_tokens=True  # Allow creation of additional chunks for long text
    )
    
    embeddings = []  # List to store embeddings for the keyword from all chunks
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):  
        # Extract the specific chunk to pass into the model
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask', 'token_type_ids']}
        
        with torch.no_grad():  # Disable gradient computation for efficiency
            output = model(**chunk_tokens)  # Forward pass through the model
        
        # Tokenize the keyword and retrieve its token ID
        keyword_id = tokenizer.encode(keyword, add_special_tokens=False)[0]
        
        # Find the index of the keyword in the chunk
        keyword_index = (chunk_tokens['input_ids'] == keyword_id).nonzero(as_tuple=True)
        
        if len(keyword_index[1]) > 0:  # If the keyword is found in the chunk
            # Extract the embeddings for the keyword and average across its occurrences
            keyword_embedding = output.last_hidden_state[0, keyword_index[1], :].mean(dim=0).numpy()
            embeddings.append(keyword_embedding)  # Append the embedding for this chunk
    
    # Return the average embedding across all chunks if embeddings are found, else return None
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    else:
        return None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    This is done by calculating the average cosine distance between the two sets of embeddings.

    Parameters:
    - left_embeddings (list): List of embeddings for Left leaning articles.
    - right_embeddings (list): List of embeddings for Right leaning articles.

    Returns:
    - float: The average cosine distance (semantic polarity) between the two sets of embeddings.
    
    How it works:
    - Every embedding from Left articles is compared with every embedding from Right articles for a given year.
    - The total number of pairwise comparisons is m × n, where m is the number of Left embeddings and n is the number of Fox embeddings.
    - This ensures that all possible semantic relationships are captured.
    """
    total_distance = 0  # Initialize the sum of cosine distances
    count = 0  # Counter for the number of embedding pairs
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            # Compute cosine similarity and convert it to distance (1 - similarity)
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1  # Increment the count for each pair
    
    # Return the average distance if there are valid pairs, else return 0
    return total_distance / count if count > 0 else 0

# Analyze semantic polarity over time
df['year'] = pd.to_datetime(df['date']).dt.year  # Extract the year from the 'date' column
results = []  # List to store results for each year

# Group articles by year and analyze
for year, group in df.groupby('year'):  # Iterate through each year group
    left_group = group[group['leaning'] == 'Left']  # Filter Left articles for the current year
    right_group = group[group['leaning'] == 'Right']  # Filter Right articles for the current year
    
    # Extract embeddings for 'abortion' using the sliding window function
    left_embeddings = [get_contextual_embedding_sliding_window(body, 'abortion') for body in left_group['body'] if get_contextual_embedding_sliding_window(body, 'abortion') is not None]
    right_embeddings = [get_contextual_embedding_sliding_window(body, 'abortion') for body in right_group['body'] if get_contextual_embedding_sliding_window(body, 'abortion') is not None]
    
    # Compute the semantic polarity score for the current year
    sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
    results.append({'year': year, 'semantic_polarity': sp_score})  # Store the results

# Convert the results to a DataFrame for easy visualization and analysis
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)  # Output the semantic polarity scores for each year


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

  df['year'] = pd.to_datetime(df['date']).dt.year  # Extract the year from the 'date' column


    year  semantic_polarity
0   2001           0.146246
1   2002           0.163964
2   2003           0.144314
3   2004           0.146139
4   2005           0.162829
5   2006           0.179563
6   2007           0.162130
7   2008           0.169943
8   2009           0.162576
9   2010           0.166860
10  2011           0.159802
11  2012           0.173202
12  2013           0.157998
13  2014           0.152847
14  2015           0.161099
15  2016           0.155210
16  2017           0.166148
17  2018           0.152493
18  2019           0.179476
19  2020           0.173120
20  2021           0.162476
21  2022           0.148312
22  2023           0.148737
23  2024           0.145665
24  2025           0.160252
25  2026           0.174160
26  2027           0.163742
27  2028           0.167304
28  2029           0.169089
29  2030           0.163533
30  2031           0.179741


### subsetting and testing

In [27]:
# Define years and leanings to filter
years = ['20', '21', '22', '23', '24']
leanings = ['Left', 'Right']

# Select rows matching the condition: year starts with '20', '21', etc., and one per leaning
final_df = pd.concat([
    df.loc[df['date'].str.startswith(year) & (df['leaning'] == leaning)].head(1)
    for year in years for leaning in leanings
])

# Reset index for clean output
final_df = final_df.reset_index(drop=True)

# Display the result
final_df

df = final_df

df

Unnamed: 0,textID,words,date,country,source,url,headline,body,leaning
0,32303732,933.0,20-11-04,US,buzzfeednews.com,https://www.buzzfeednews.com/article/carolinek...,this woman learned her birthmark is a sign she...,"twins often have an inseparable bond, but muhl...",Left
1,32303442,956.0,20-11-04,US,foxnews.com,https://www.foxnews.com/us/dc-activists-blm-pr...,protests erupt in philadelphia related to elec...,"the count every vote movement, which aims to e...",Right
2,87507637,945.0,21-07-01,US,The Boston Globe,https://www.bostonglobe.com/2021/07/01/arts/ba...,battling the patriarchy's censor in `the man w...,"it seems incredible now, when websites display...",Left
3,87504480,949.0,21-07-01,US,Townhall,https://townhall.com/columnists/jordanbrittain...,the christian faithful are rising up to fight ...,"this week, the united states conference of cat...",Right
4,94553862,1625.0,22-11-01,US,theatlantic.com,https://www.theatlantic.com/ideas/archive/2022...,"yes, elections have consequences",americans reputedly have short attention spans...,Left
5,90146792,2531.0,22-11-01,US,Fox News,https://www.foxnews.com/politics/fox-news-powe...,fox news power rankings: republicans expected ...,republicans are winning on the economy and cri...,Right
6,91719925,723.0,23-11-02,US,The Boston Globe,https://www.bostonglobe.com/2023/11/02/metro/d...,this band plans to drown out the men's march a...,"kirk israel, an activist musician and member o...",Left
7,91721731,1248.0,23-11-03,US,Fox News,https://www.foxnews.com/politics/virginias-ele...,virginia's elections a key 2024 barometer and ...,youngkin wore a similar red vest two years ago...,Right
8,107130862,3219.0,24-04-01,US,theatlantic.com,https://www.theatlantic.com/politics/archive/2...,ro khanna wants to be the future of the democr...,"in january, as the 2024 primary season got und...",Left
9,200353125,1003.0,24-04-01,US,foxnews.com,https://www.foxnews.com/lifestyle/easter-bunny...,"easter bunny not just a 'silly, secular rabbit...","if you doubt that, try to recall an ad for eas...",Right


In [29]:
# trying with BERT for the sample

import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define the case-insensitive tokenizer function
def case_insensitive_tokenizer(text):
    """
    Converts the input text to lowercase before tokenizing.
    Ensures consistent tokenization for uppercase and lowercase variations.
    """
    return tokenizer(text.lower())

# Function to extract contextual embedding for a specific keyword using a sliding window approach
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=512):
    """
    Extracts contextual embeddings for a specific keyword using a sliding window approach.
    Handles BERT subword tokenization by matching keyword subword tokens.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()
    
    # Tokenize the keyword into subwords to handle BERT's tokenization
    keyword_ids = tokenizer.encode(keyword, add_special_tokens=False)

    # Tokenize the text into overlapping chunks
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len,
        stride=stride,
        return_overflowing_tokens=True
    )
    
    embeddings = []  # Store embeddings for the keyword
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask']}
        
        with torch.no_grad():
            output = model(**chunk_tokens)  # Forward pass
        
        # Find the indices of the first keyword token in the chunk
        input_ids = chunk_tokens['input_ids'][0].tolist()
        for idx in range(len(input_ids) - len(keyword_ids) + 1):
            if input_ids[idx:idx + len(keyword_ids)] == keyword_ids:
                # Extract and average embeddings for keyword occurrences
                keyword_embedding = output.last_hidden_state[0, idx:idx + len(keyword_ids), :].mean(dim=0).numpy()
                embeddings.append(keyword_embedding)
    
    # Return the average embedding across all chunks
    return np.mean(embeddings, axis=0) if len(embeddings) > 0 else None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1
    
    return total_distance / count if count > 0 else 0

# Example DataFrame (replace this with your actual data)
# df = pd.read_csv('your_data_file.csv')  # Replace with actual data loading
data = {
    'date': ['21-01-01', '21-02-01', '22-01-01', '22-02-01'],
    'leaning': ['Left', 'Right', 'Left', 'Right'],
    'body': [
        'Abortion rights are under attack in some states.',
        'The abortion debate continues across the country.',
        'Pro-choice activists fight for abortion rights.',
        'Many citizens oppose abortion on moral grounds.'
    ]
}
df = pd.DataFrame(data)

# Correctly parse the 'date' column with the specified format
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce')  # Specify the format 'yy-mm-dd'

# Drop rows where the date couldn't be parsed
df = df.dropna(subset=['date'])

# Extract the year for yearly analysis
df['year'] = df['date'].dt.year

# Initialize a list to store the results
results = []

# Group articles by year and analyze semantic polarity
for year, group in df.groupby('year'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles
    
    # Extract embeddings for the keyword 'abortion'
    left_embeddings = [get_contextual_embedding_sliding_window(body, 'abortion') 
                       for body in left_group['body'] if get_contextual_embedding_sliding_window(body, 'abortion') is not None]
    right_embeddings = [get_contextual_embedding_sliding_window(body, 'abortion') 
                        for body in right_group['body'] if get_contextual_embedding_sliding_window(body, 'abortion') is not None]
    
    # Compute semantic polarity for the year
    sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
    results.append({'year': year, 'semantic_polarity': sp_score})

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)



   year  semantic_polarity
0  2021           0.606488
1  2022           0.221814


In [31]:
# maybe fire - yes fire! Years

import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DebertaV2Tokenizer, AutoModel

# Load tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Define the case-insensitive tokenizer function
def case_insensitive_tokenizer(text):
    """
    Converts the input text to lowercase before tokenizing.
    Ensures consistent tokenization for uppercase and lowercase variations.
    """
    return tokenizer(text.lower())

# Function to extract contextual embedding for a specific keyword using a sliding window approach
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=256):
    """
    Extracts contextual embeddings for a specific keyword using a sliding window approach.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()

    # Tokenize the text into overlapping chunks
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len,
        stride=stride,
        return_overflowing_tokens=True
    )
    
    embeddings = []  # Store embeddings for the keyword
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask']}
        
        with torch.no_grad():
            output = model(**chunk_tokens)  # Forward pass
        
        # Retrieve keyword token ID
        keyword_id = tokenizer.encode(keyword, add_special_tokens=False)[0]
        
        # Find the index of the keyword
        keyword_index = (chunk_tokens['input_ids'] == keyword_id).nonzero(as_tuple=True)
        
        if len(keyword_index[1]) > 0:
            # Extract and average embeddings for keyword occurrences
            keyword_embedding = output.last_hidden_state[0, keyword_index[1], :].mean(dim=0).numpy()
            embeddings.append(keyword_embedding)
    
    # Return the average embedding across all chunks
    return np.mean(embeddings, axis=0) if len(embeddings) > 0 else None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1
    
    return total_distance / count if count > 0 else 0

# Correctly parse the 'date' column with the specified format
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce')  # Specify the format 'yy-mm-dd'

# Drop rows where the date couldn't be parsed
df = df.dropna(subset=['date'])

# Extract the year for yearly analysis
df['year'] = df['date'].dt.year

# Initialize a list to store the results
results = []

# Group articles by year and analyze semantic polarity
for year, group in df.groupby('year'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles
    
    # Extract embeddings for the keyword 'abortion'
    left_embeddings = [get_contextual_embedding_sliding_window(body, 'abortion') 
                       for body in left_group['body'] if get_contextual_embedding_sliding_window(body, 'abortion') is not None]
    right_embeddings = [get_contextual_embedding_sliding_window(body, 'abortion') 
                        for body in right_group['body'] if get_contextual_embedding_sliding_window(body, 'abortion') is not None]
    
    # Compute semantic polarity for the year
    sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
    results.append({'year': year, 'semantic_polarity': sp_score})

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)


   year  semantic_polarity
0  2020           0.167705
1  2021           0.155487
2  2022           0.159048
3  2023           0.165120
4  2024           0.171671


In [32]:
# Define the file path where you want to save the CSV🔥
output_file_path = "/work/Bachelor/results_for_plots/sp_df_stride256.csv"

# Save the filtered DataFrame as a CSV file
sp_df.to_csv(output_file_path, index=False, encoding='utf-8')

-----------------------

In [7]:
### 🔥🔥🔥🔥🔥🔥🔥
# maybe fire for months - yes fire!!!!!!!

import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DebertaV2Tokenizer, AutoModel

# Load tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Define the case-insensitive tokenizer function
def case_insensitive_tokenizer(text):
    """
    Converts the input text to lowercase before tokenizing.
    Ensures consistent tokenization for uppercase and lowercase variations.
    """
    return tokenizer(text.lower())

# Function to extract contextual embedding for a specific keyword using a sliding window approach (no didn't do that)
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=512):
    """
    Extracts contextual embeddings for a specific keyword using a sliding window approach.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()

    # Tokenize the text into overlapping chunks
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len,
        stride=stride,
        return_overflowing_tokens=True
    )
    
    embeddings = []  # Store embeddings for the keyword
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask']}
        
        with torch.no_grad():
            output = model(**chunk_tokens)  # Forward pass
        
        # Retrieve keyword token ID
        keyword_id = tokenizer.encode(keyword, add_special_tokens=False)[0]
        
        # Find the index of the keyword
        keyword_index = (chunk_tokens['input_ids'] == keyword_id).nonzero(as_tuple=True)
        
        if len(keyword_index[1]) > 0:
            # Extract and average embeddings for keyword occurrences
            keyword_embedding = output.last_hidden_state[0, keyword_index[1], :].mean(dim=0).numpy()
            embeddings.append(keyword_embedding)
    
    # Return the average embedding across all chunks
    return np.mean(embeddings, axis=0) if len(embeddings) > 0 else None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1
    
    return total_distance / count if count > 0 else 0

# Correctly parse the 'date' column with the specified format
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce')  # Specify the format 'yy-mm-dd'

# Drop rows where the date couldn't be parsed
df = df.dropna(subset=['date'])

# Extract the year-month for monthly analysis
df['year_month'] = df['date'].dt.to_period('M')  # Creates 'YYYY-MM' format

# Initialize a list to store the results
results = []

# Group articles by month and analyze semantic polarity
for month, group in df.groupby('year_month'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles
    
    # Extract embeddings for the keyword 'abortion'
    left_embeddings = [get_contextual_embedding_sliding_window(body, 'protesters') 
                       for body in left_group['body'] if get_contextual_embedding_sliding_window(body, 'protesters') is not None]
    right_embeddings = [get_contextual_embedding_sliding_window(body, 'protesters') 
                        for body in right_group['body'] if get_contextual_embedding_sliding_window(body, 'protesters') is not None]
    
    # Compute semantic polarity for the month
    sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
    results.append({'month': month.strftime('%Y-%m'), 'semantic_polarity': sp_score})

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)



      month  semantic_polarity
0   2020-11           0.164017
1   2020-12           0.000000
2   2021-01           0.250338
3   2021-02           0.156182
4   2021-03           0.000000
5   2021-04           0.000000
6   2021-05           0.000000
7   2021-06           0.226804
8   2021-07           0.161126
9   2021-08           0.000000
10  2021-09           0.195129
11  2021-10           0.232817
12  2021-11           0.171114
13  2021-12           0.181859
14  2022-01           0.186426
15  2022-02           0.230075
16  2022-03           0.193148
17  2022-04           0.159160
18  2022-05           0.182965
19  2022-06           0.199873
20  2022-07           0.180288
21  2022-08           0.161934
22  2022-09           0.133655
23  2022-10           0.199120
24  2022-11           0.184042
25  2022-12           0.247309
26  2023-01           0.335971
27  2023-02           0.000000
28  2023-03           0.000000
29  2023-04           0.168185
30  2023-05           0.297356
31  2023

In [8]:
# Define the file path where you want to save the CSV🔥
output_file_path = "/work/Bachelor/results_for_plots/protesters.csv"

# Save the filtered DataFrame as a CSV file
sp_df.to_csv(output_file_path, index=False, encoding='utf-8')

#### multi token

In [35]:
import re

# Preprocess the 'body' column in your dataframe
def standardize_roe_variants(text):
    """
    Replaces variations of 'roe v. wade', 'roe vs. wade', etc. with 'roe v wade'.
    """
    text = re.sub(r'roe\s+v[.\s]*wade', 'roe v wade', text, flags=re.IGNORECASE)
    text = re.sub(r'roe\s+vs[.\s]*wade', 'roe v wade', text, flags=re.IGNORECASE)
    return text

# Apply the function to the 'body' column
df['body'] = df['body'].apply(standardize_roe_variants)


In [36]:
# chat went wild - maybe this runs multiple keywords at once...

import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DebertaV2Tokenizer, AutoModel
import re

# Load tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Preprocess 'body' column to standardize 'roe v wade' variations
def standardize_roe_variants(text):
    """
    Replaces variations of 'roe v. wade', 'roe vs. wade', etc. with 'roe v wade'.
    """
    text = re.sub(r'roe\s+v[.\s]*wade', 'roe v wade', text, flags=re.IGNORECASE)
    text = re.sub(r'roe\s+vs[.\s]*wade', 'roe v wade', text, flags=re.IGNORECASE)
    return text

# Function to extract contextual embedding for a specific keyword or multi-word phrase
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=512):
    """
    Extracts contextual embeddings for a specific keyword or multi-word phrase using a sliding window approach.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()
    
    # Tokenize the keyword into subwords to handle multi-word phrases
    keyword_ids = tokenizer.encode(keyword, add_special_tokens=False)

    # Tokenize the text into overlapping chunks
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len,
        stride=stride,
        return_overflowing_tokens=True
    )
    
    embeddings = []  # Store embeddings for the keyword
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask']}
        
        with torch.no_grad():
            output = model(**chunk_tokens)  # Forward pass
        
        # Find the indices of the first occurrence of the phrase (sequence of token IDs)
        input_ids = chunk_tokens['input_ids'][0].tolist()
        for idx in range(len(input_ids) - len(keyword_ids) + 1):
            if input_ids[idx:idx + len(keyword_ids)] == keyword_ids:
                # Extract and average embeddings for the entire phrase
                keyword_embedding = output.last_hidden_state[0, idx:idx + len(keyword_ids), :].mean(dim=0).numpy()
                embeddings.append(keyword_embedding)
    
    # Return the average embedding across all chunks
    return np.mean(embeddings, axis=0) if len(embeddings) > 0 else None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1
    
    return total_distance / count if count > 0 else 0

# Load the DataFrame
# df = pd.read_csv("path_to_your_file.csv")  # Replace with your file path

# Example preprocessing: Apply standardization to the 'body' column
df['body'] = df['body'].apply(standardize_roe_variants)

# Extract the year-month for monthly analysis
df['year_month'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce').dt.to_period('M')
df = df.dropna(subset=['year_month'])  # Drop rows where 'year_month' couldn't be parsed

# List of keywords/phrases to analyze
keywords = ['roe v wade', 'abortion', 'pro-choice', 'pro-life']

# Initialize a list to store the results
results = []

# Group articles by month and analyze semantic polarity for each keyword
for month, group in df.groupby('year_month'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles
    
    for keyword in keywords:
        # Extract embeddings for the keyword
        left_embeddings = [get_contextual_embedding_sliding_window(body, keyword) 
                           for body in left_group['body'] if get_contextual_embedding_sliding_window(body, keyword) is not None]
        right_embeddings = [get_contextual_embedding_sliding_window(body, keyword) 
                            for body in right_group['body'] if get_contextual_embedding_sliding_window(body, keyword) is not None]
        
        # Compute semantic polarity for the month
        if left_embeddings and right_embeddings:
            sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
            results.append({
                'month': month.strftime('%Y-%m'),
                'keyword': keyword,
                'semantic_polarity': sp_score
            })

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)


KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DebertaV2Tokenizer, AutoModel

# Load tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Function to extract contextual embedding for a specific multi-word phrase
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=512):
    """
    Extracts contextual embeddings for a specific keyword or multi-word phrase using a sliding window approach.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()
    
    # Tokenize the keyword into subwords to handle multi-word phrases
    keyword_ids = tokenizer.encode(keyword, add_special_tokens=False)

    # Tokenize the text into overlapping chunks
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len,
        stride=stride,
        return_overflowing_tokens=True
    )
    
    embeddings = []  # Store embeddings for the keyword
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask']}
        
        with torch.no_grad():
            output = model(**chunk_tokens)  # Forward pass
        
        # Find the indices of the first occurrence of the phrase (sequence of token IDs)
        input_ids = chunk_tokens['input_ids'][0].tolist()
        for idx in range(len(input_ids) - len(keyword_ids) + 1):
            if input_ids[idx:idx + len(keyword_ids)] == keyword_ids:
                # Extract and average embeddings for the entire phrase
                keyword_embedding = output.last_hidden_state[0, idx:idx + len(keyword_ids), :].mean(dim=0).numpy()
                embeddings.append(keyword_embedding)
    
    # Return the average embedding across all chunks
    return np.mean(embeddings, axis=0) if len(embeddings) > 0 else None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1
    
    return total_distance / count if count > 0 else 0

# Extract the year-month for monthly analysis
df['year_month'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce').dt.to_period('M')
df = df.dropna(subset=['year_month'])  # Drop rows where 'year_month' couldn't be parsed

# Initialize a list to store the results
results = []

# Group articles by month and analyze semantic polarity for "roe v wade"
for month, group in df.groupby('year_month'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles
    
    # Extract embeddings for "roe v wade"
    left_embeddings = [get_contextual_embedding_sliding_window(body, 'roe v wade') 
                       for body in left_group['body'] if get_contextual_embedding_sliding_window(body, 'roe v wade') is not None]
    right_embeddings = [get_contextual_embedding_sliding_window(body, 'roe v wade') 
                        for body in right_group['body'] if get_contextual_embedding_sliding_window(body, 'roe v wade') is not None]
    
    # Compute semantic polarity for the month
    if left_embeddings and right_embeddings:
        sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
        results.append({
            'month': month.strftime('%Y-%m'),
            'semantic_polarity': sp_score
        })

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)

# Optional: Save to CSV
# sp_df.to_csv("semantic_polarity_roe_v_wade.csv", index=False)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

     month  semantic_polarity
0  2022-05           0.187319
1  2022-06           0.171925
2  2022-07           0.155236
3  2022-08           0.133299
4  2022-09           0.202351
5  2022-11           0.247092


In [5]:
# Define the file path where you want to save the CSV🔥
output_file_path = "/work/Bachelor/results_for_plots/sp_roevwade_df.csv"

# Save the filtered DataFrame as a CSV file
sp_df.to_csv(output_file_path, index=False, encoding='utf-8')

In [4]:
import pandas as pd
import re

# Replace "pro-choice" or "pro choice" with "prochoice"
#df['body'] = df['body'].str.replace(r'\bpro[-\s]?choice\b', 'prochoice', flags=re.IGNORECASE, regex=True)

# Replace "pro-life" or "pro life" with "prolife"
df['body'] = df['body'].str.replace(r'\bpro[-\s]?life\b', 'prolife', flags=re.IGNORECASE, regex=True)

# Display the updated DataFrame
df


Unnamed: 0,textID,words,date,country,source,url,headline,body,leaning
0,91719925,723.0,23-11-02,US,The Boston Globe,https://www.bostonglobe.com/2023/11/02/metro/d...,this band plans to drown out the men's march a...,"kirk israel, an activist musician and member o...",Left
1,91721731,1248.0,23-11-03,US,Fox News,https://www.foxnews.com/politics/virginias-ele...,virginia's elections a key 2024 barometer and ...,youngkin wore a similar red vest two years ago...,Right
2,103177268,1947.0,23-11-03,US,inquisitr.com,https://www.theatlantic.com/ideas/archive/2023...,here's what biden can do to change his grim po...,"whatever your theory, it should take into acco...",Left
3,103177271,1513.0,23-11-03,US,inquisitr.com,https://www.theatlantic.com/ideas/archive/2023...,don't equate anti-zionism with anti-semitism,"on october 7, the islamist militant group hama...",Left
4,103177279,2607.0,23-11-03,US,inquisitr.com,https://www.theatlantic.com/books/archive/2023...,do you have free will?,writing a review is an exercise in free will. ...,Left
...,...,...,...,...,...,...,...,...,...
8090,97824446,1244.0,23-03-28,US,bostonglobe.com,https://bostonglobe.com/2023/03/26/metro/rev-e...,"rev. elinor lockwood yeo, reproductive rights ...","rev. elinor lockwood yeo of newton, a reproduc...",Left
8091,97847400,648.0,23-03-29,US,sfchronicle.com,https://sfchronicle.com/opinion/letterstotheed...,letters: how the country can respond to nashvi...,"enough with the "" thoughts and prayers. "" thre...",Left
8092,97848056,918.0,23-03-29,US,bostonglobe.com,https://bostonglobe.com/2023/03/28/opinion/abo...,the latest antiabortion tactic: asserting the ...,"antiabortion activist trooper elwonger, 25, fe...",Left
8093,90760423,1224.0,23-03-31,US,The Daily Beast,https://www.thedailybeast.com/satan-wants-you-...,inside the horrific (contested) abuse story th...,how did one discredited biography ignite one o...,Left


In [None]:
### 🔥🔥🔥🔥🔥🔥🔥
# maybe fire for months - yes fire!!!!!!!
# did not do sliding window approach here

import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DebertaV2Tokenizer, AutoModel

# Load tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Define the case-insensitive tokenizer function
def case_insensitive_tokenizer(text):
    """
    Converts the input text to lowercase before tokenizing.
    Ensures consistent tokenization for uppercase and lowercase variations.
    """
    return tokenizer(text.lower())

# Function to extract contextual embedding for a specific keyword using a sliding window approach
def get_contextual_embedding_sliding_window(text, keyword, max_len=512, stride=512):
    """
    Extracts contextual embeddings for a specific keyword using a sliding window approach.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()

    # Tokenize the text into overlapping chunks
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len,
        stride=stride,
        return_overflowing_tokens=True
    )
    
    embeddings = []  # Store embeddings for the keyword
    
    # Loop through each chunk of tokens
    for i in range(tokens['input_ids'].size(0)):
        chunk_tokens = {key: val[i].unsqueeze(0) for key, val in tokens.items() if key in ['input_ids', 'attention_mask']}
        
        with torch.no_grad():
            output = model(**chunk_tokens)  # Forward pass
        
        # Retrieve keyword token ID
        keyword_id = tokenizer.encode(keyword, add_special_tokens=False)[0]
        
        # Find the index of the keyword
        keyword_index = (chunk_tokens['input_ids'] == keyword_id).nonzero(as_tuple=True)
        
        if len(keyword_index[1]) > 0:
            # Extract and average embeddings for keyword occurrences
            keyword_embedding = output.last_hidden_state[0, keyword_index[1], :].mean(dim=0).numpy()
            embeddings.append(keyword_embedding)
    
    # Return the average embedding across all chunks
    return np.mean(embeddings, axis=0) if len(embeddings) > 0 else None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0
    
    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1
    
    return total_distance / count if count > 0 else 0

# Correctly parse the 'date' column with the specified format
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce')  # Specify the format 'yy-mm-dd'

# Drop rows where the date couldn't be parsed
df = df.dropna(subset=['date'])

# Extract the year-month for monthly analysis
df['year_month'] = df['date'].dt.to_period('M')  # Creates 'YYYY-MM' format

# Initialize a list to store the results
results = []

# Group articles by month and analyze semantic polarity
for month, group in df.groupby('year_month'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles
    
    # Extract embeddings for the keyword 'abortion'
    left_embeddings = [get_contextual_embedding_sliding_window(body, 'prolife') 
                       for body in left_group['body'] if get_contextual_embedding_sliding_window(body, 'prolife') is not None]
    right_embeddings = [get_contextual_embedding_sliding_window(body, 'prolife') 
                        for body in right_group['body'] if get_contextual_embedding_sliding_window(body, 'prolife') is not None]
    
    # Compute semantic polarity for the month
    sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
    results.append({'month': month.strftime('%Y-%m'), 'semantic_polarity': sp_score})

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

In [None]:
# Define the file path where you want to save the CSV🔥
output_file_path = "/work/Bachelor/results_for_plots/sp_prolife3_df.csv"

# Save the filtered DataFrame as a CSV file
sp_df.to_csv(output_file_path, index=False, encoding='utf-8')

## Without sliding window approach

In [8]:
import pandas as pd
import re

# Replace "pro-life" or "pro life" with "prolife"
#df['body'] = df['body'].str.replace(r'\bpro[-\s]?life\b', 'prolife', flags=re.IGNORECASE, regex=True)

df['body'] = df['body'].str.replace(r'\bplanned\s?parenthood\b', 'plannedparenthood', flags=re.IGNORECASE, regex=True)

# Filter rows where "reproductivehealth" appears in the "body" column
#reproductivehealth_rows = df[df['body'].str.contains(r'\breproductivehealth\b', flags=re.IGNORECASE, regex=True)]

# Replace variations of "Dobbs v Jackson" with "dobbsvjackson"
#df['body'] = df['body'].str.replace(
#    r'\bdobbs\s(v\.?|vs\.?)\sjackson\b', 
#    'dobbsvjackson', 
#    flags=re.IGNORECASE, 
#    regex=True
#)

# Filter rows where "reproductivehealth" appears in the "body" column
plannedparenthood_rows = df[df['body'].str.contains(r'\bplannedparenthood\b', flags=re.IGNORECASE, regex=True)]


# Display the filtered rows
plannedparenthood_rows



Unnamed: 0,textID,words,date,country,source,url,headline,body,leaning
0,91719925,723.0,23-11-02,US,The Boston Globe,https://www.bostonglobe.com/2023/11/02/metro/d...,this band plans to drown out the men's march a...,"kirk israel, an activist musician and member o...",Left
12,91734214,1651.0,23-11-06,US,The Atlantic,https://www.theatlantic.com/ideas/archive/2023...,how is child marriage still legal in the u.s.?,"this past spring, as part of my work teaching ...",Left
34,91741169,276.0,23-11-08,US,Fox News,https://www.foxnews.com/us/pro-life-activist-m...,pro-life activist mark houck files lawsuit aga...,a catholic activist is suing the justice depar...,Right
40,103306212,2098.0,23-11-08,US,marketwatch.com,https://www.theatlantic.com/ideas/archive/2023...,why abortion rights keep winning in red states,abortion foes thought roe v. wade'sreversal wo...,Left
83,87505877,654.0,21-07-01,US,CNSNews.com,https://cnsnews.com/article/washington/ashlian...,is a 15-week-old unborn baby a human being? se...,( cns news ) -- when asked if a 15-week-old un...,Right
...,...,...,...,...,...,...,...,...,...
8046,97525942,1495.0,23-03-14,US,bostonglobe.com,https://bostonglobe.com/2023/03/13/metro/patri...,"patricia schroeder, congresswoman who wielded ...","u.s. rep. pat schroeder, d-colo., sat on the p...",Left
8072,90728356,463.0,23-03-21,US,Fox News,https://www.foxnews.com/politics/smuggling-abo...,smuggling the abortion pill into texas could l...,lawyers who filed a lawsuit challenging the fd...,Right
8073,97697350,1793.0,23-03-22,US,theatlantic.com,https://www.theatlantic.com/ideas/archive/2023...,the malthusians are back,scolding regular people for contributing to cl...,Left
8084,90739332,529.0,23-03-25,US,Fox News,https://www.foxnews.com/politics/dems-want-fun...,dems want to fund abortion `everywhere' in the...,judicial crisis network president carrie sever...,Right


In [9]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DebertaV2Tokenizer, AutoModel

# Load tokenizer and model
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

# Define the case-insensitive tokenizer function
def case_insensitive_tokenizer(text):
    """
    Converts the input text to lowercase before tokenizing.
    Ensures consistent tokenization for uppercase and lowercase variations.
    """
    return tokenizer(text.lower())

# Function to extract contextual embedding for a specific keyword
def get_contextual_embedding(text, keyword, max_len=512):
    """
    Extracts contextual embeddings for a specific keyword within a single chunk of text.
    """
    # Convert text and keyword to lowercase for case-insensitive processing
    text = text.lower()
    keyword = keyword.lower()

    # Tokenize the text
    tokens = tokenizer(
        text, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_len
    )

    with torch.no_grad():
        output = model(**tokens)  # Forward pass

    # Retrieve keyword token ID
    keyword_id = tokenizer.encode(keyword, add_special_tokens=False)[0]

    # Find the index of the keyword
    keyword_index = (tokens['input_ids'] == keyword_id).nonzero(as_tuple=True)

    if len(keyword_index[1]) > 0:
        # Extract and average embeddings for keyword occurrences
        keyword_embedding = output.last_hidden_state[0, keyword_index[1], :].mean(dim=0).numpy()
        return keyword_embedding
    return None

# Function to compute semantic polarity between two sets of embeddings
def compute_semantic_polarity(left_embeddings, right_embeddings):
    """
    Computes the semantic polarity score between Left and Right embeddings.
    """
    total_distance = 0
    count = 0

    for left_emb in left_embeddings:
        for right_emb in right_embeddings:
            distance = 1 - cosine_similarity(left_emb.reshape(1, -1), right_emb.reshape(1, -1))[0][0]
            total_distance += distance
            count += 1

    return total_distance / count if count > 0 else 0

# Correctly parse the 'date' column with the specified format
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d', errors='coerce')  # Specify the format 'yy-mm-dd'

# Drop rows where the date couldn't be parsed
df = df.dropna(subset=['date'])

# Extract the year-month for monthly analysis
df['year_month'] = df['date'].dt.to_period('M')  # Creates 'YYYY-MM' format

# Initialize a list to store the results
results = []

# Group articles by month and analyze semantic polarity
for month, group in df.groupby('year_month'):
    left_group = group[group['leaning'] == 'Left']  # Filter Left-leaning articles
    right_group = group[group['leaning'] == 'Right']  # Filter Right-leaning articles

    # Extract embeddings for the keyword 'prolife'
    left_embeddings = [get_contextual_embedding(body, 'plannedparenthood') 
                       for body in left_group['body'] if get_contextual_embedding(body, 'plannedparenthood') is not None]
    right_embeddings = [get_contextual_embedding(body, 'plannedparenthood') 
                        for body in right_group['body'] if get_contextual_embedding(body, 'plannedparenthood') is not None]

    # Compute semantic polarity for the month
    sp_score = compute_semantic_polarity(left_embeddings, right_embeddings)
    results.append({'month': month.strftime('%Y-%m'), 'semantic_polarity': sp_score})

# Convert the results to a DataFrame
sp_df = pd.DataFrame(results)

# Display the results
print(sp_df)


      month  semantic_polarity
0   2020-11           0.413202
1   2020-12           0.392815
2   2021-01           0.327477
3   2021-02           0.326952
4   2021-03           0.315540
5   2021-04           0.337806
6   2021-05           0.407169
7   2021-06           0.313151
8   2021-07           0.295049
9   2021-08           0.349062
10  2021-09           0.237982
11  2021-10           0.211966
12  2021-11           0.270067
13  2021-12           0.199048
14  2022-01           0.357740
15  2022-02           0.325508
16  2022-03           0.344030
17  2022-04           0.399499
18  2022-05           0.317165
19  2022-06           0.283699
20  2022-07           0.302890
21  2022-08           0.368290
22  2022-09           0.309218
23  2022-10           0.332218
24  2022-11           0.283501
25  2022-12           0.356725
26  2023-01           0.369280
27  2023-02           0.427606
28  2023-03           0.430760
29  2023-04           0.432449
30  2023-05           0.390187
31  2023

In [10]:
# Define the file path where you want to save the CSV🔥
output_file_path = "/work/Bachelor/results_for_plots/sp_plannedparenthood_df.csv"

# Save the filtered DataFrame as a CSV file
sp_df.to_csv(output_file_path, index=False, encoding='utf-8')