In [2]:
from transformers import pipeline
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import DebertaV2Tokenizer


2024-10-16 11:35:01.779698: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


KeyboardInterrupt: 

In [55]:
data = pd.read_csv("filtered_data.csv", index_col=0)  # 459.728 entries
unique_ads = data.drop_duplicates(
    subset=["ad_creative_body"]
)  # # print(unique_txt.size) 58.449 unique ad text bites to proces

## checking the length of tokens from the dataframe

In [47]:
# Load the tokenizer for DebertaV2
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xlarge")

max_length = 512  # max length of text/tokes for DeBERTa

above_limit = 0
below_limit = 0
more_than_double = 0
double_above_limit = 0
triple_above_limit = 0

for text in unique_ads["ad_creative_body"]:
    tokens = tokenizer.encode(
        text, add_special_tokens=True
    )  # Tokenize the ad text using DebertaV2
    token_length = len(tokens)

    if token_length > max_length:
        above_limit += 1
        if token_length >= 1536 and token_length < 2047:
            triple_above_limit += 1
        if token_length >= 1024 and token_length <= 1536:
            double_above_limit += 1
        if token_length <= 1024:
            more_than_double +=1
    else:
        below_limit += 1

In [48]:
print(f"Amount of ads within the limit: {below_limit}")
print(f"Amount of ads above the limit: {above_limit}")
print(f"Ads above the limit but below 1024 tokens: {more_than_double}") # split with 2
print(f"Ads above 1024 tokens but below 1536 tokens: {double_above_limit}")  # split with 3
print(f"Ads above 1536 tokens but below 2047 tokens: {triple_above_limit}")  # split with 4


Amount of ads within the limit: 58381
Amount of ads above the limit: 68
Ads above the limit but below 1024 tokens: 60
Ads above 1024 tokens but below 1536 tokens: 5
Ads above 1536 tokens but below 2047 tokens: 3


## Running the ads through DeBERTa

In [14]:
zeroshot_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33",
)

In [15]:
hypothesis_template = "This ad is about {}"
classes_verbalized = [
    "Economy",
    "Civil Rights",
    "Healthcare",
    "Agriculture",
    "Labor and Employment",
    "Education and Culture",
    "Climate",
    "Immigration",
    "Transport",
    "Law and Crime",
    "Social Welfare",
    "Housing",
    "Defense",
    "Foreign Affair",
    "Call for Action",
    "Other"   #placeholder category
] 

In [56]:
# list to store the number of tokens for each ad
num_tokens_list = []

# Tokenize each ad_creative_body and get the number of tokens
for text in unique_ads["ad_creative_body"]:
    tokens = tokenizer.encode(text, add_special_tokens=False)  # Tokenize the text
    num_tokens_list.append(len(tokens)) 

unique_ads["num_tokens"] = num_tokens_list # make new column with the token count for each ad

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_ads["num_tokens"] = num_tokens_list


In [57]:
# Split into two df based on limitation
below_limit = unique_ads[unique_ads["num_tokens"] <= 512]
above_limit = unique_ads[unique_ads["num_tokens"] > 512]

# below_limit.to_csv("under_512.csv")
#above_limit.to_csv("aboev_512.csv")

## processing text below the limit

In [52]:
# Process ads within limit
text_to_vector_below = {}
for text in below_limit["ad_creative_body"]:
    output = zeroshot_classifier(
        text,
        classes_verbalized,
        hypothesis_template=hypothesis_template,
        multi_label=False,
    )
    text_to_vector_below[text] = {
        "labels": output["labels"],
        "scores": output["scores"],
    }

# Making two new columns
below_limit["labels"] = below_limit["ad_creative_body"].map(
    lambda text: text_to_vector_below[text]["labels"]
)
below_limit["scores"] = below_limit["ad_creative_body"].map(
    lambda text: text_to_vector_below[text]["scores"]
)

## processing text above limit (complicated one)

In [None]:
text_to_vector_above = (
    {}
)  # dict to store labels + scores for each ad abover # 512 tokens

for text in above_limit["ad_creative_body"]:

    total_scores = []  # list to keep scres for each split
    all_labels = None  # will store labels

    tokens = tokenizer.encode(text, add_special_tokens=True)

    total_splits = (  # figure out how many splits in terms of max l
        len(tokens) + max_length - 1
    ) // max_length  # // rounding to a single int value

    split_size = (  # figure out how to do an equal split on the total tokens (as equal as possible)
        len(tokens) + total_splits - 1
    ) // total_splits  # // rounding to a single int value

    for i in range(0, len(tokens), split_size):
        chunk = tokens[i : i + split_size]
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)

        output = zeroshot_classifier(
            chunk_text,
            classes_verbalized,
            hypothesis_template=hypothesis_template,
            multi_label=False,
        )

        # gather the scores
        if all_labels is None:
            all_labels = output["labels"]  # store labels on first text bit
        total_scores.append(output["scores"])  # Collect all scores for avg

    # Average the scores across all chunks
    avg_scores = [sum(s) / len(total_scores) for s in zip(*total_scores)]

    text_to_vector_above[text] = {
        "labels": all_labels,  # Use the labels from run throough of the text bit
        "scores": avg_scores,
    }

# 2 new columns
data["labels"] = data["ad_creative_body"].map(
    lambda text: text_to_vector_above[text]["labels"]
)
data["scores"] = data["ad_creative_body"].map(
    lambda text: text_to_vector_above[text]["scores"]
)

## store results from both dataframes back to org DataFrame


In [None]:
unique_ads["labels"] = unique_ads["ad_creative_body"].map(
    lambda text: text_to_vector_below.get(text, text_to_vector_above.get(text, {})).get(
        "labels", []
    )
)
unique_ads["scores"] = unique_ads["ad_creative_body"].map(
    lambda text: text_to_vector_above.get(text, text_to_vector_above.get(text, {})).get(
        "scores", []
    )
)

## Some basic stats on labelled data

In [52]:
# sort away duplicate ad texts coming from the same politician
#label_data_unique = label_data.drop_duplicates(subset=['ad_creative_body', 'page_id'])

In [53]:
#label_data_unique.info() # from 459.728 ads to 58.574 ads !!

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58574 entries, 0 to 58573
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0.2              58574 non-null  int64  
 1   Unnamed: 0.1              58574 non-null  int64  
 2   Unnamed: 0                58574 non-null  int64  
 3   ad_creation_time          58574 non-null  object 
 4   ad_creative_body          58574 non-null  object 
 5   spend                     58574 non-null  float64
 6   impressions               58574 non-null  float64
 7   delivery_by_region        58574 non-null  object 
 8   demographic_distribution  58574 non-null  object 
 9   page_id                   58574 non-null  int64  
 10  page_name                 58574 non-null  object 
 11  bylines                   58339 non-null  object 
 12  id                        58574 non-null  int64  
 13  spend_lo                  58574 non-null  int64  
 14  spend_

In [54]:
# converts 'labels' and 'scores' column from object to a list to be able to retrieve the first instance -> top score/top_label

label_data_unique["labels"] = label_data_unique["labels"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
label_data_unique["scores"] = label_data_unique["scores"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Retrieving first instance in each column
label_data_unique["top_label"] = label_data_unique["labels"].str[0]
label_data_unique["top_score"] = label_data_unique["scores"].str[0]

In [39]:
# saving another CSV file with the two new columns
label_data_unique.to_csv("deberta_top_labels.csv")