In [None]:
#  Load libraries and data
from collections import Counter
from itertools import combinations

import pandas as pd


# Read the raw CSV, then show the first rows together with the row count.
# (Only the last expression of a cell is displayed, so a separate bare
# `df.head()` before it would be computed and thrown away.)
df = pd.read_csv('data/raw_data.csv')
df.head(), len(df)

In [None]:
# Drop the `S no` and `verified` columns (whichever of them are present)
cols_to_drop = [name for name in df.columns if name in ('S no', 'verified')]
df_clean = df.drop(cols_to_drop, axis=1)
df_clean.head(), df_clean.shape[0]

In [None]:
# Keep only the rows whose `hate` flag equals 1
df_clean = df_clean.loc[df_clean['hate'].eq(1)]
df_clean.head(), df_clean.shape[0]

In [None]:
# Remove the column named 'general'.
# NOTE(review): this comment previously said 'generalist' while the code
# dropped 'general' -- confirm the actual column name in the raw data.
# errors='ignore' keeps the cell re-runnable: `df_clean` is reassigned here,
# so a second run would otherwise raise KeyError on the already-dropped column.
df_clean = df_clean.drop(columns=['general'], errors='ignore')
df_clean.head(), len(df_clean)

In [None]:
# Count pairwise co-occurrences across the category columns (one-hot encoded).
# Every column still present in `df_clean`, except 'hate', is treated as a
# category flag.
cat_cols = [c for c in df_clean.columns if c != 'hate']

# pair_counts[(a, b)] is the number of rows in which both `a` and `b` are 1.
# The categories of a row are sorted before pairing, so each unordered pair
# is always stored under the same (a, b) key.
pair_counts = Counter()
for row in df_clean[cat_cols].itertuples(index=False):
    # Columns whose value is 1 in this row
    cats = sorted(cat for cat, val in zip(cat_cols, row) if val == 1)
    # Counter.update adds 1 for every pair produced by combinations()
    pair_counts.update(combinations(cats, 2))

# Turn the counter into a symmetric category-by-category DataFrame.
# NOTE(review): only categories that appear in at least one pair get a
# row/column; a category that never co-occurs with another one is omitted
# from the matrix -- confirm this is intended.
unique_cats = sorted({c for pair in pair_counts for c in pair})
mat = pd.DataFrame(0, index=unique_cats, columns=unique_cats)

for (a, b), count in pair_counts.items():
    # Mirror each count so the matrix reads the same in both directions
    mat.loc[a, b] = count
    mat.loc[b, a] = count

# Make same-category intersections zero
for c in unique_cats:
    mat.loc[c, c] = 0

mat

In [None]:
# Express every count as a percentage of its column's total
col_sums = mat.sum(axis=0)
# Dividing by a zero column total produces NaN; those cells are reported as 0
percentage_mat = (
    mat.div(col_sums, axis='columns')
    .fillna(0)
    .mul(100)
    .round(2)
)

percentage_mat

In [None]:
# Write both matrices to CSV; the category labels are kept as the index
# column (pandas writes the index by default).
mat.to_csv('data/matrix_raw_counts.csv')
percentage_mat.to_csv('data/matrix_percentages.csv')