In [23]:
#  Load libraries and data
import pandas as pd


# Read the raw annotation table from disk.
df = pd.read_csv('data/raw_data.csv')
# Only the last expression of a cell is displayed, so a separate bare
# `df.head()` before this line would be evaluated and thrown away.
# Show the preview and the row count together.
df.head(), len(df)

(   S no  hate  queerphobic  communal  political  sexist  casteist  racist  \
 0     1     0            0         0          0       0         0       0   
 1     2     1            1         0          0       0         0       0   
 2     3     0            0         0          0       0         0       0   
 3     4     0            0         0          0       0         0       0   
 4     5     0            0         0          0       0         0       0   
 
    ablelist  general  verified  
 0       0.0      0.0       1.0  
 1       0.0      0.0       1.0  
 2       0.0      0.0       0.0  
 3       0.0      0.0       0.0  
 4       0.0      0.0       0.0  ,
 48534)

In [24]:
# Strip the `S no` and `verified` columns (whichever of them are present in df)
unwanted_mask = df.columns.isin(['S no', 'verified'])
cols_to_drop = df.columns[unwanted_mask].tolist()
df_clean = df.drop(columns=cols_to_drop)
df_clean.head(), len(df_clean)

(   hate  queerphobic  communal  political  sexist  casteist  racist  ablelist  \
 0     0            0         0          0       0         0       0       0.0   
 1     1            1         0          0       0         0       0       0.0   
 2     0            0         0          0       0         0       0       0.0   
 3     0            0         0          0       0         0       0       0.0   
 4     0            0         0          0       0         0       0       0.0   
 
    general  
 0      0.0  
 1      0.0  
 2      0.0  
 3      0.0  
 4      0.0  ,
 48534)

In [25]:
# Keep only the rows whose `hate` flag equals 1
is_hate = df_clean['hate'] == 1
df_clean = df_clean.loc[is_hate]
df_clean.head(), len(df_clean)

(    hate  queerphobic  communal  political  sexist  casteist  racist  \
 1      1            1         0          0       0         0       0   
 26     1            0         0          0       1         0       0   
 28     1            0         0          0       1         0       0   
 29     1            0         0          0       0         1       0   
 31     1            0         1          0       1         0       0   
 
     ablelist  general  
 1        0.0      0.0  
 26       0.0      0.0  
 28       0.0      0.0  
 29       0.0      0.0  
 31       0.0      0.0  ,
 22361)

In [26]:
# Remove the column named 'general'.
# This cell overwrites df_clean, so on a second run the column is already gone;
# errors='ignore' keeps the cell re-runnable instead of raising KeyError.
df_clean = df_clean.drop(columns=['general'], errors='ignore')
df_clean.head(), len(df_clean)

(    hate  queerphobic  communal  political  sexist  casteist  racist  ablelist
 1      1            1         0          0       0         0       0       0.0
 26     1            0         0          0       1         0       0       0.0
 28     1            0         0          0       1         0       0       0.0
 29     1            0         0          0       0         1       0       0.0
 31     1            0         1          0       1         0       0       0.0,
 22361)

In [27]:
from collections import Counter
from itertools import combinations

# Count co-occurrences across category columns (one-hot encoded).
# Every column of df_clean except the `hate` flag is treated as a category.
cat_cols = [c for c in df_clean.columns if c not in ['hate']]

# pair_counts[(a, b)] = number of rows where categories a and b are both 1.
# Each row's categories are sorted first, so (a, b) and (b, a) share one key.
pair_counts = Counter()
for row in df_clean[cat_cols].itertuples(index=False):
    # Categories flagged on this row; any value other than 1 is skipped
    cats = sorted(cat for cat, val in zip(cat_cols, row) if val == 1)
    pair_counts.update(combinations(cats, 2))

# Turn the counter into a square matrix. Label it with ALL category columns,
# not just those seen in a pair, so a category that never co-occurs with
# another one still appears as an all-zero row/column instead of vanishing.
unique_cats = sorted(cat_cols)
mat = pd.DataFrame(0, index=unique_cats, columns=unique_cats)

# Mirror every count so the matrix is symmetric
for (a, b), count in pair_counts.items():
    mat.loc[a, b] = count
    mat.loc[b, a] = count

# The diagonal needs no explicit reset: combinations() never pairs a category
# with itself, so the same-category cells keep their initial 0.
mat

             ablelist  casteist  communal  political  queerphobic  racist  \
ablelist            0        27        59        121           19      11   
casteist           27         0       176         98           71     228   
communal           59       176         0        796           77      48   
political         121        98       796          0           83      34   
queerphobic        19        71        77         83            0      30   
racist             11       228        48         34           30       0   
sexist            128       284       836        423          396     148   

             sexist  
ablelist        128  
casteist        284  
communal        836  
political       423  
queerphobic     396  
racist          148  
sexist            0  
             ablelist  casteist  communal  political  queerphobic  racist  \
ablelist            0        27        59        121           19      11   
casteist           27         0       176         98 

In [28]:
# Express each count as a percentage of its column total.
# A column whose total is 0 produces NaN on division; those cells become 0.
col_sums = mat.sum(axis=0)
percentage_mat = (
    mat.div(col_sums, axis='columns')
    .fillna(0)
    .mul(100)
    .round(2)
)

percentage_mat

Unnamed: 0,ablelist,casteist,communal,political,queerphobic,racist,sexist
ablelist,0.0,3.05,2.96,7.78,2.81,2.2,5.78
casteist,7.4,0.0,8.84,6.3,10.5,45.69,12.82
communal,16.16,19.91,0.0,51.19,11.39,9.62,37.74
political,33.15,11.09,39.96,0.0,12.28,6.81,19.1
queerphobic,5.21,8.03,3.87,5.34,0.0,6.01,17.88
racist,3.01,25.79,2.41,2.19,4.44,0.0,6.68
sexist,35.07,32.13,41.97,27.2,58.58,29.66,0.0


In [29]:
# Global percentage matrix: each cell as a share of the sum of ALL cells.
# mat is filled symmetrically (both [a, b] and [b, a] receive the count), so
# total_count includes every unordered pair twice and the full matrix sums to 100.
total_count = mat.values.sum()

# fillna(0) mirrors the column-percentage cell: if the matrix is all zeros the
# division is 0/0 (NaN), and the result is reported as 0 rather than NaN.
percentage_global = (mat / total_count * 100).fillna(0).round(4)

percentage_global

Unnamed: 0,ablelist,casteist,communal,political,queerphobic,racist,sexist
ablelist,0.0,0.3298,0.7207,1.4781,0.2321,0.1344,1.5636
casteist,0.3298,0.0,2.15,1.1972,0.8673,2.7852,3.4693
communal,0.7207,2.15,0.0,9.7239,0.9406,0.5864,10.2126
political,1.4781,1.1972,9.7239,0.0,1.0139,0.4153,5.1674
queerphobic,0.2321,0.8673,0.9406,1.0139,0.0,0.3665,4.8375
racist,0.1344,2.7852,0.5864,0.4153,0.3665,0.0,1.808
sexist,1.5636,3.4693,10.2126,5.1674,4.8375,1.808,0.0


In [None]:
#  Export CSVs: one file per matrix, row labels written as the first column
csv_targets = {
    'data/matrix_raw_counts.csv': mat,
    'data/matrix_percentages.csv': percentage_mat,
    "data/percentage_global.csv": percentage_global,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path, index=True)

In [32]:
# Export JSON: same three matrices, serialised in pandas' "split" layout
json_targets = {
    "data/counts.json": mat,
    "data/percentage_column.json": percentage_mat,
    "data/percentage_global.json": percentage_global,
}
for json_path, frame in json_targets.items():
    frame.to_json(json_path, orient="split")
