In [12]:
from transformers import pipeline
import pandas as pd
import ast

In [13]:
# Read deberta_top_labels.csv; the first CSV column is used as the DataFrame index.
deberta_labels_csv = (
    "/Users/sofiepalmuskronborg/Desktop/Speciale/Data/deberta_top_labels.csv"
)
data = pd.read_csv(deberta_labels_csv, index_col=0)

In [3]:
# Build the transformers zero-shot classification pipeline used to label the ads.
zeroshot_model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"
zeroshot_classifier = pipeline("zero-shot-classification", model=zeroshot_model_name)

In [4]:
# Draw 1000 ads whose current top label is "Government". The fixed random_state
# makes the draw repeatable, so re-running the notebook re-labels the same ads
# instead of a fresh random subset each time.
gov_sample = data[data["top_label"] == "Government"].sample(n=1000, random_state=42)

# Drop the existing classification columns from the sample.
gov_sample = gov_sample.drop(columns=["labels", "scores", "top_label", "top_score"])
# gov_sample.to_csv("1000_gov.csv")

In [15]:
# Sentence template handed to the zero-shot classifier; "{}" is the slot for a
# candidate label.
hypothesis_template = "This ad is about {}"

# Candidate labels for the zero-shot classifier.
# NOTE: every entry needs its own trailing comma. Without one, Python silently
# concatenates adjacent string literals ("Call for action" "Other" would become
# the single label "Call for actionOther").
classes_verbalized = [
    "Economy",
    "Civil Rights",
    "Healthcare",
    "Agriculture",
    "Labor and Employment",
    "Education and Culture",
    "Climate",
    "Immigration",
    "Transport",
    "Law and Crime",
    "Social Welfare",
    "Housing",
    "Defense",
    "Foreign Affair",
    "Government",
    "Call for action",  # new
    "Other",  # placeholder category
]

# output = zeroshot_classifier(
#    text, classes_verbalized, hypothesis_template=hypothesis_template, multi_label=False
# )
# print(output)

In [16]:
# Cache of model results keyed by ad text:
#   ad text -> {"labels": [...], "scores": [...]}
text_to_vector = {}

# Results are keyed by text, so each distinct ad body only needs to go through
# the (expensive) model once; rows that share a body reuse the cached entry.
for text in gov_sample["ad_creative_body"].drop_duplicates():
    output = zeroshot_classifier(
        text,
        classes_verbalized,
        hypothesis_template=hypothesis_template,
        multi_label=False,
    )
    # Keep all labels and their corresponding scores in list format.
    text_to_vector[text] = {
        "labels": output["labels"],
        "scores": output["scores"],
    }

# Attach the cached results to every row as two new columns.
gov_sample["labels"] = gov_sample["ad_creative_body"].map(
    lambda text: text_to_vector[text]["labels"]
)
gov_sample["scores"] = gov_sample["ad_creative_body"].map(
    lambda text: text_to_vector[text]["scores"]
)

In [18]:
# Remove duplicate ad texts coming from the same politician, i.e. rows that
# share both "ad_creative_body" and "page_id".
# .copy() makes the result an independent frame, so assigning new columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
gov_label_data_unik = gov_sample.drop_duplicates(
    subset=["ad_creative_body", "page_id"]
).copy()

In [20]:
# Cells in "labels" and "scores" may hold the string form of a list; parse such
# strings into real lists and leave every other value untouched, so that the
# first element (top label / top score) can be picked out below.
for list_column in ("labels", "scores"):
    gov_label_data_unik[list_column] = gov_label_data_unik[list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

# The first element of each list becomes the top label / top score.
gov_label_data_unik["top_label"] = gov_label_data_unik["labels"].str[0]
gov_label_data_unik["top_score"] = gov_label_data_unik["scores"].str[0]

In [20]:
# Number of ads per top label, most frequent first. No hard-coded cap on the
# number of rows: every label that occurs in the sample is reported, however
# many categories the label list contains.
top_labels_sample = gov_label_data_unik["top_label"].value_counts()

# Share of each label in percent, derived from the same counts so that counts
# and percentages are aligned row by row.
top_percentages = top_labels_sample / top_labels_sample.sum() * 100

# Print the total counts and %
print("Stats for Gov sample with addition of 'Call for Action' category:\n")
for label, count, percentage in zip(
    top_labels_sample.index, top_labels_sample.values, top_percentages.values
):
    print(f"{label}: {count} ads ({percentage:.2f}%)")

Stats for Gov sample with addition of 'Call for Action' category:

Call for action: 555 ads (55.50%)
Government: 441 ads (44.10%)
Transport: 3 ads (0.30%)
Healthcare: 1 ads (0.10%)


In [32]:
# gov_label_data_unik.to_csv("1000_sample_gov_cfa.csv")