In [1]:
import os
import pandas as pd
import altair as alt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/hbbg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data_root = "../data/multipride_data/"
figures_root = "../figures/"
os.makedirs(figures_root, exist_ok=True)

train_files = [file for file in os.listdir(data_root) if (file.endswith(".csv") and ("train" in file))]
train_files

['train_en.csv', 'train_es.csv', 'train_it.csv']

In [3]:
train_df = pd.DataFrame()

for file in train_files:
    temp_df = pd.read_csv(os.path.join(data_root, file))
    if "en" in file:
        temp_df["bio"] = [None] * temp_df.shape[0]
    train_df = pd.concat([train_df, temp_df], ignore_index=True)

print(f"Total training samples: {train_df.shape[0]}")

Total training samples: 2988


# Sample Data

In [4]:
train_df[train_df.lang=="en"].head()

Unnamed: 0,id,text,label,lang,bio
0,en_1021,"I've never heard anyone use the word ""faggot"" ...",0,en,
1,en_1496,So you don't see the slighest problem of someb...,0,en,
2,en_1312,"And to be fair, getting triggered by slurs is ...",1,en,
3,en_469,"I kinda feel like it's saying ""the faggot comm...",0,en,
4,en_565,"Homophobia, racism, and the resulting endless ...",0,en,


In [5]:
train_df[train_df.lang=="es"].head()

Unnamed: 0,id,text,label,lang,bio
1026,es_1850,28 de Junio - D√≠a Internacional del Orgullo LG...,0,es,Doblajes Para Videojuegos que nunca tuvieron D...
1027,es_773,"@USER no me gusta la Montero, por su apoyo a l...",0,es,"Activista, sindicalista, madre y parte de la R..."
1028,es_1899,Es la semana del #GayPride y la dedicar√© al #Q...,0,es,Pintor dalt√≥nico que habla de arte. Confundo e...
1029,es_685,@USER @USER @USER A la carles vais los #TRANSF...,0,es,mujer Algemes√≠ Valencia Telegram @USER\n+34 62...
1030,es_1717,"Hoy a las 00:10 en TVE2, estreno del documenta...",0,es,Comunidad LGTBI+ sin √°nimo de lucro. Reivindic...


In [6]:
train_df[train_df.lang=="it"].head()

Unnamed: 0,id,text,label,lang,bio
1902,it_1231,La destra Italiana pur di non dire che loro od...,0,it,Il rispetto per il prossimo qualunque sia il s...
1903,it_1713,"""Presupporre che tutti i bisessuali non sono m...",0,it,ùìïùì≤ùìµùìµùìÆùì≠ ùîÄùì≤ùìΩùì± ùìØùìæùìªùîÇ ùì™ùì∑ùì≠ ùìºùìΩùì™ùìªùìªùîÇ ùìÆùîÇùìÆùì≠
1904,it_1474,"Se i diritti devono essere uguali, voglio che ...",0,it,User Experience Designer URL
1905,it_58,che poi molti uomini trans subiscono lesbofobi...,0,it,"no matter where i go, you're there ‚Ä¶"
1906,it_511,Che poi √® l‚Äôetero medio come Pio e Amedeo che ...,0,it,T'appartengo ed io ci tengo \nE se prometto po...


# Label Distribution

In [7]:
train_df['label'] = train_df['label'].astype(str)
train_df['lang']  = train_df['lang'].astype(str)
bio_valid = train_df[train_df['bio'].notna() & (train_df['bio'].astype(str).str.strip() != '')]
label_lang_counts = (
    train_df.groupby(['lang', 'label']).size().reset_index(name='count')
)
bio_label_counts = (
    bio_valid.groupby(['lang', 'label']).size().reset_index(name='bio_count')
)
lang_totals = train_df.groupby('lang').size().reset_index(name='total_count')
imbalance_df = (
    label_lang_counts.groupby('lang')['count']
    .agg(lambda x: x.min() / x.max() if x.max() != 0 else np.nan)
    .reset_index(name='imbalance_ratio')
)
merged_df = (
    label_lang_counts
    .merge(bio_label_counts, on=['lang', 'label'], how='left')
    .merge(lang_totals, on='lang', how='left')
    .merge(imbalance_df, on='lang', how='left')
    .fillna({'bio_count': 0})
)



colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
base = alt.Chart(merged_df).encode(
    x=alt.X('label:N', title='Label'),
    y=alt.Y('count:Q', title='Count'),
    color=alt.Color('label:N', scale=alt.Scale(range=colors), legend=alt.Legend(title='Label')),
    tooltip=[
        alt.Tooltip('lang:N',  title='Language'),
        alt.Tooltip('label:N', title='Label'),
        alt.Tooltip('count:Q', title='Total Samples'),
        alt.Tooltip('bio_count:Q', title='With Bio')
    ]
)
bars = base.mark_bar(opacity=0.9, cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
text_labels = base.mark_text(
    dy=-6,
    fontSize=11,
    fontWeight='bold',
    color='black'
).transform_calculate(
    label_text="'Count: ' + format(datum.count, ',') + ' | Bio: ' + format(datum.bio_count, ',')"
).encode(
    text='label_text:N'
)
totals_dynamic = (
    alt.Chart(merged_df)
    .transform_aggregate(
        max_count='max(count)',
        total_count='max(total_count)',
        imbalance_ratio='max(imbalance_ratio)',
        groupby=['lang']
    )
    .transform_calculate(
        label_text="'Total: ' + format(datum.total_count, ',') + ' | Imb: ' + format(datum.imbalance_ratio, '.2f')"
    )
    .mark_text(
        fontSize=13,
        fontWeight='bold',
        color='black',
        dy=-20
    )
    .encode(
        x=alt.value(60),
        y='max_count:Q',
        text='label_text:N'
    )
)


layered = alt.layer(bars, text_labels, totals_dynamic).properties(width=120, height=300)
final_chart = (
    layered
    .facet(
        column=alt.Column('lang:N', title='Language', header=alt.Header(labelOrient='bottom'))
    )
    .resolve_scale(y='shared')
    .configure_axis(grid=False, labelAngle=0)
    .configure_view(strokeWidth=0, fill='#FAFAFA')
    .properties(title='Label Distribution by Language')
)


final_chart.save(os.path.join(figures_root, 'label_distribution.svg'))
final_chart.save(os.path.join(figures_root, 'label_distribution.pdf'))

final_chart

### Observations (Self):
* The data is heavily skewed toward class 0, indicating a strong imbalance. The presence of "bio" data varies across languages, is absent in English and uneven across other languages.

* Dominance toward one class can bias model gradients during fine-tuning and lead to underfitting for minority classes. To counter this, balancing strategies such as weighted loss functions or controlled sampling could be adopted.

* In multilingual fine-tuning, normalization across languages is essential w.r.t token distribution, sentence lengths, and special characters should be standardized. Language identifiers can help the model better separate latent representations, reducing negative transfer across languages.

* In zero-shot setups, providing balanced, language-aware prompts or few-shot examples could stabilize output distribution.

* Since "bio" metadata is only available for certain languages, missing entries replaced with a neutral token to avoid misleading the model.

# Frequency Analysis: Unique Words/Phrases

In [8]:
stop_words_dict = {
    'en': stopwords.words('english'),
    'es': stopwords.words('spanish'),
    'it': stopwords.words('italian'),
}

texts = train_df['text'].astype(str)
labels = train_df['label'].astype(int)
langs  = train_df['lang'].astype(str)

meta_df = pd.DataFrame({'lang': langs, 'label': labels, 'text': texts})

In [9]:
results = []

for lang in meta_df['lang'].unique():

    df_lang = meta_df[meta_df['lang'] == lang]
    texts_lang = df_lang['text'].values
    labels_lang = df_lang['label'].values

    if len(np.unique(labels_lang)) < 2:
        continue

    custom_stops = stop_words_dict.get(lang, [])

    vectorizer = CountVectorizer(
        lowercase=True,
        stop_words=custom_stops if len(custom_stops) > 0 else None,
        ngram_range=(1, 3),
        max_features=15000
    )

    X_lang = vectorizer.fit_transform(texts_lang)
    features = vectorizer.get_feature_names_out()

    class0_mask = labels_lang == 0
    class1_mask = labels_lang == 1

    class0_sum = X_lang[class0_mask].sum(axis=0)
    class1_sum = X_lang[class1_mask].sum(axis=0)

    freq_df = pd.DataFrame({
        'feature': features,
        'count_0': np.array(class0_sum).flatten(),
        'count_1': np.array(class1_sum).flatten()
    })

    freq_df['unique_to_0'] = (freq_df['count_0'] > 0) & (freq_df['count_1'] == 0)
    freq_df['unique_to_1'] = (freq_df['count_1'] > 0) & (freq_df['count_0'] == 0)
    freq_df['lang'] = lang

    unique_0 = freq_df[freq_df['unique_to_0']].sort_values('count_0', ascending=False).head(20)
    unique_1 = freq_df[freq_df['unique_to_1']].sort_values('count_1', ascending=False).head(20)

    results.append(pd.concat([unique_0, unique_1]))


all_unique = pd.concat(results, ignore_index=True)


for lang in all_unique['lang'].unique():
    print(f"\n============================")
    print(f"Language: {lang}")
    print("============================")
    print("\nUnique to class 0:")
    print(all_unique[(all_unique['lang'] == lang) & (all_unique['unique_to_0'])][['feature', 'count_0']].head(15))
    print("\nUnique to class 1:")
    print(all_unique[(all_unique['lang'] == lang) & (all_unique['unique_to_1'])][['feature', 'count_1']].head(15))


Language: en

Unique to class 0:
     feature  count_0
0      bitch       81
1      going       47
2      white       39
3       guys       35
4         ur       34
5         rt       32
6     school       32
7    rt user       31
8   everyone       28
9       tell       26
10     maybe       26
11      edit       26
12   nothing       24
13      come       21
14    reason       20

Unique to class 1:
               feature  count_1
20               afaik        2
21        mean hurtful        2
22    one calls faggot        2
23           one calls        2
24             partner        2
25   time take offence        2
26           time take        2
27     people actually        2
28  people allowed say        2
29        take offence        2
30              server        2
31         said either        2
32          word never        2
33         real though        2
34          word still        2

Language: es

Unique to class 0:
         feature  count_0
40        g√©nero     

### Observation (GenAI - Curated)

* In English, class 0 words such as bitch, guys, white, and school occur in broader, often neutral or unrelated contexts, indicating typical or possibly offensive everyday usage without reclamatory intent. These terms are not being reappropriated for empowerment but rather appear in ordinary or derogatory contexts. In contrast, class 1 phrases such as mean hurtful, one calls faggot, take offence, and word never suggest reflective or defensive speech - users discussing, challenging, or condemning slurs rather than using them abusively. Hence, English class 1 likely captures reclamatory or awareness-oriented discourse, where LGBTQ+ language is invoked consciously to resist stigma or discuss identity politics.

* In Spanish, class 0 terms like g√©nero, teor√≠a queer, feministas, and misoginia signal discussions in ideological or activist frameworks - conceptual and analytical, but not necessarily reclaiming slurs. They indicate neutral or advocacy discourse about gender and sexuality rather than personal reclamation. Meanwhile, class 1 includes maricones, maric√≥n, joto, and tortas ‚Äî historically derogatory terms that appear alongside orgullo (pride) and resistenciaLGBTI. Their co-occurrence shows active reappropriation of slurs within pride and resistance narratives - users are reclaiming offensive words as positive identity markers. Thus, Spanish class 1 embodies reclamatory self-reference and identity pride.

* In Italian, class 0 expressions such as persone trans, donne trans, diritti trans, and misoginia denote institutional or advocacy language around inclusion and rights - neutral or supportive but not reappropriative. Conversely, class 1 features forci, forcio, forcia, and froci ‚Äî strongly homophobic slurs. However, their co-occurrence with orgogliosamente and orgogliosa (‚Äúproudly/proud‚Äù) suggests that in some contexts they are being reclaimed as identity symbols, used with irony or empowerment. This indicates that class 1 captures intentional reappropriation, turning slurs into affirmations of LGBTQ+ identity.

* Overall, across languages, class 0 represents neutral, analytical, or descriptive usage of LGBTQ+ and gender-related vocabulary - often advocacy-related but not necessarily personal reclamation. Class 1, on the other hand, captures contextual empowerment and reappropriation, where historically offensive terms are consciously used by LGBTQ+ speakers or allies to assert identity and challenge stigma.

*In short:

‚Üí Class 0 = non-reclamatory (neutral, descriptive, analytical, or possibly offensive without reclaiming intent)

‚Üí Class 1 = reclamatory (empowering, prideful, self-referential use of formerly derogatory terms)

# Frequency Analysis: Chi-Square Feature Selection

In [10]:
results = []

for lang in meta_df['lang'].unique():

    df_lang = meta_df[meta_df['lang'] == lang]
    texts_lang = df_lang['text'].values
    labels_lang = df_lang['label'].values

    if len(np.unique(labels_lang)) < 2:
        continue

    custom_stops = stop_words_dict.get(lang, [])

    tfidf = TfidfVectorizer(
        lowercase=True,
        stop_words=custom_stops if len(custom_stops) > 0 else None,
        ngram_range=(1, 3),
        max_features=15000
    )

    X_tfidf = tfidf.fit_transform(texts_lang)
    features = tfidf.get_feature_names_out()

    chi2_scores, p_values = chi2(X_tfidf, labels_lang)

    chi2_df = pd.DataFrame({
        'feature': features,
        'chi2': chi2_scores,
        'p_value': p_values,
        'lang': lang
    })

    mask0 = (labels_lang == 0)
    mask1 = (labels_lang == 1)

    mean_tfidf_0 = np.asarray(X_tfidf[mask0].mean(axis=0)).flatten()
    mean_tfidf_1 = np.asarray(X_tfidf[mask1].mean(axis=0)).flatten()

    chi2_df['mean_tfidf_0'] = mean_tfidf_0
    chi2_df['mean_tfidf_1'] = mean_tfidf_1

    chi2_df['stronger_class'] = np.where(chi2_df['mean_tfidf_1'] > chi2_df['mean_tfidf_0'], 1, 0)

    top_features = chi2_df.sort_values('chi2', ascending=False).head(30)
    results.append(top_features)


all_chi2 = pd.concat(results, ignore_index=True)


for lang in all_chi2['lang'].unique():
    print(f"\n============================")
    print(f"Language: {lang}")
    print("============================")

    print("\nTop features for class 1:")
    print(
        all_chi2[
            (all_chi2['lang'] == lang) & (all_chi2['stronger_class'] == 1)
        ][['feature', 'chi2', 'p_value']].head(15)
    )

    print("\nTop features for class 0:")
    print(
        all_chi2[
            (all_chi2['lang'] == lang) & (all_chi2['stronger_class'] == 0)
        ][['feature', 'chi2', 'p_value']].head(15)
    )


Language: en

Top features for class 1:
               feature      chi2   p_value
0     one calls faggot  6.559004  0.010435
1            one calls  6.559004  0.010435
2           feel sorry  5.526051  0.018735
3     pride bumblebutt  5.382049  0.020345
4                laugh  5.326156  0.021008
5            reclaimed  5.305729  0.021255
6               decide  5.269440  0.021703
7              disease  4.955953  0.026001
8          real though  4.891021  0.026997
9          faggot time  4.776561  0.028850
10   make sure gayorgy  4.597050  0.032027
11   user lgbtqia cast  4.597050  0.032027
12        user lgbtqia  4.597050  0.032027
13          word never  4.491951  0.034055
14  people allowed say  4.393973  0.036066

Top features for class 0:
Empty DataFrame
Columns: [feature, chi2, p_value]
Index: []

Language: es

Top features for class 1:
                             feature      chi2   p_value
30                           maricas  6.965442  0.008310
31                      orgul

In [11]:
top_n = 15
chart_df = (
    all_chi2.sort_values(['lang', 'chi2'], ascending=[True, False])
    .groupby(['lang', 'stronger_class'])
    .head(top_n)
)

chart_df['class_label'] = chart_df['stronger_class'].map({0: 'Class 0', 1: 'Class 1'})

base = (
    alt.Chart(chart_df, width=220, height=280)
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x=alt.X('chi2:Q', title='Chi-Square Score'),
        y=alt.Y('feature:N', sort='-x', title=None),
        color=alt.Color('class_label:N', legend=alt.Legend(title='Stronger Class'), scale=alt.Scale(scheme='set2')),
        tooltip=[
            alt.Tooltip('feature:N', title='Feature'),
            alt.Tooltip('chi2:Q', title='Chi¬≤ Score', format='.2f'),
            alt.Tooltip('p_value:Q', title='p-value', format='.2e'),
            alt.Tooltip('mean_tfidf_0:Q', title='Mean TF-IDF (Class 0)', format='.4f'),
            alt.Tooltip('mean_tfidf_1:Q', title='Mean TF-IDF (Class 1)', format='.4f'),
        ]
    )
)

final_chart = (
    base
    .facet(
        column=alt.Column('lang:N', title='Language', header=alt.Header(labelOrient='bottom'))
    )
    .resolve_scale(x='independent', y='independent')
    .properties(
        title='Top Discriminative N-grams per Language (Chi-Square Feature Selection)',
        padding={"top": 20, "bottom": 20, "left": 10, "right": 10}
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0, fill='#FAFAFA')
)

final_chart.save(os.path.join(figures_root, "chi2_features_by_language.svg"))
final_chart.save(os.path.join(figures_root, "chi2_features_by_language.pdf"))

final_chart

### Observations (GenAI - Curated)

* In English, every significant feature belongs to class 1, suggesting that reclamatory contexts are linguistically distinctive while non-reclamatory ones lack consistent lexical markers. Phrases such as ‚Äúone calls faggot‚Äù, ‚Äúreclaimed‚Äù, ‚Äúword never‚Äù, and ‚Äúpeople allowed say‚Äù directly point to meta-discussions of slur usage. It also act as identity language about speakers debating or consciously reusing terms like faggot with awareness of their social charge. The presence of ‚Äúpride‚Äù and ‚Äúlgbtqia‚Äù reinforces this as self-referential and empowerment-oriented discourse, where slurs are framed in reclamation or educational contexts rather than insult.

* In Spanish, again all significant words cluster in class 1, dominated by ‚Äúmarica/maricones‚Äù and variants alongside ‚Äúorgullo‚Äù and ‚Äúorgullolgtbi‚Äù. This pattern indicates explicit reclamation within pride-related narratives, where historically derogatory epithets are used affectionately or defiantly within community spaces. The frequent co-occurrence with orgullo (pride) and LGTBI contexts shows that these terms are not used as insults but as affirmations of visibility and belonging. Thus, the Spanish reclamatory class is lexically marked by collective self-identification and celebration.

* In Italian, the Chi-square ranking clearly separates the two classes lexically. Class 1 is dominated by reclaimed slurs such as forci, forcio, frocia, and ricchione, often used playfully or self-referentially, and associated with expressions of pride like orgogliosamente. These are strong indicators of reclamation, where offensive heritage terms are recontextualized to express solidarity and humor. Class 0, conversely, includes more neutral identity and activism vocabulary like trans, donne, transfobia, persone which are corresponding to non-reclamatory advocacy or descriptive talk about gender rights.

* Overall, across all three languages, Chi-square analysis highlights that reclamation is lexically distinct and carried by slurs turned self-referential in pride-related or reflective contexts. The non-reclamatory usage tends to be linguistically neutral, institutional, or descriptive without explicit identity reclamation markers.