In [180]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score

In [130]:
from transformers import pipeline
model = pipeline("token-classification", model = "l3cube-pune/hing-bert-lid")

## Read Data

In [181]:
df_women = pd.read_excel("Hate Speech Women Dataset.xlsx", sheet_name="annotations")
df_religion = pd.read_excel("Hate Speech Religion Dataset.xlsx", sheet_name="annotations")
df_lgbtq = pd.read_excel("Hate Speech LGBTQ Dataset.xlsx", sheet_name="annotations")

## Annotator Agreement

In [141]:
def get_annotator_score(df):
    return cohen_kappa_score(df["label_1"], df["label_2"])

In [142]:
print(f"Women: {get_annotator_score(df_women)}")
print(f"Religion: {get_annotator_score(df_religion)}")
print(f"LGBTQ: {get_annotator_score(df_lgbtq)}")

Women: 0.7335647308708491
Religion: 0.7575166687633664
LGBTQ: 0.7199718210637548


In [182]:
round((get_annotator_score(df_women) + get_annotator_score(df_religion) + get_annotator_score(df_lgbtq))/3, 2)

0.74

## Preprocessing

In [143]:
def clean_tweet(tweet):
    # Remove usernames starting with @
    tweet_no_usernames = re.sub(r'@\w+', '@user', tweet)
    # Remove URLs
    tweet_no_urls = re.sub(r'http\S+|www\S+|\S+\.com\S+', '', tweet_no_usernames)
    return tweet_no_urls

### Language Identification

In [144]:
def get_codemix_label(df):
    ''' Whether tweets are monolingual English or not (codemixed)'''
    # Note that here codemixed includes purely Devnagari script tweets as well
    codemixed = []

    for tweet in tqdm(df["tweet"].tolist()):
        result = model(tweet)
        
        hi_score = sum([token["score"] for token in result if token["entity"] == "HI"])
        hi_score_per_token = hi_score/len(result)
        codemixed.append(1) if hi_score_per_token>0.25 else codemixed.append(0)
        
    return codemixed

In [145]:
def get_preprocessed_data(df):
    df["majority_label"] = df["majority_label"].fillna(df["label_1"])
    df = df[["id", "tweet", "majority_label"]]
    df["tweet"] = df["tweet"].apply(clean_tweet)
    df = df.rename(columns={"id":"id", "tweet":"tweet", "majority_label":"offense"})
    df["offense"] = df["offense"].apply(int)
    df["codemixed"] = get_codemix_label(df)
    return df

In [146]:
df_women = get_preprocessed_data(df_women)
df_religion = get_preprocessed_data(df_religion)
df_lgbtq = get_preprocessed_data(df_lgbtq)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tweet"] = df["tweet"].apply(clean_tweet)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 278/278 [00:12<00:00, 22.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 257/257 [00:13<00:00, 18.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 106/106 [00:05<00:00, 21.08it/s]


In [147]:
df_women.head()

Unnamed: 0,id,tweet,offense,codemixed
0,1729049443876601905,@user @user @user @user @user Matlab sirf ladk...,0,1
1,1729049269456589236,Pehle main bahut loyal tha tab mujhse koi ladk...,0,1
2,1729049238007615845,Shit gharake SB aurat ko halala karate hai\n \...,0,1
3,1729049218160144888,@user Yeh kya ho raha ha... kitna ghirna ha in...,0,1
4,1729048935900262571,Suna he Ek ladki roj Twitter pe Sher Aya Sher ...,0,1


## Sanity Checks

In [153]:
df_women[df_women["codemixed"] == 0]["tweet"].tolist()[:5]

["@user @user calls him a 'Dehati Aurat'. It has been getting more apparent why with each passing day.",
 '@user #AbhishekKumar was absolutely right. She plays with others emotions. She has a boyfriend outside and she hid it like she hid it.\n #BiggBoss17 ',
 "@user Exactly. It's a crime for women and normal for heroes. 🤷🏽\u200d♀️",
 "During my College days, I used to save some money for my future by not hanging with any girl, didn't used to go to any Restaurants/ watching movies. But now wasting money for filling forms of all these Marriage bureaus for something which is never going to happen.",
 '@user @user You are just blind hater of Dhoni, So talking to a Mf like you , is just waste of time 😪']

In [155]:
df_religion[df_religion["offense"]==1]["tweet"].tolist()[:5]

['😂😂😂 abh smajh aa rha shivaji Maharaj ke Vansajon ne muslim dhram kyun swikara ...tum jeson ka inferiority complex dekh ke ...soja Bhai tere bhonkne se bhi kuch na hone vala kewal bigaad sakta hai baan nhi sakta',
 'aree tum log hutiye ho kya .. tumlogo ki poori kahani uss ladke pe kyu revolve ho rhi .. jake kashmir region me dekho 370 hatne ke baad kitne bihari bas chuke hn waha local muslims se shadi karke .. local colleges me bhi bihari bhare hue hn udhar ke',
 '😂😂 \n Aap bohot bahadur ho \n Itna to seh logi🤣🤣\n Mai to door hi rehta hu in matters se bhai kab kaun kaha peecha karta mil jaaye\n aur mere to muslim dost bhi hai kisi ne galti se dekh liya to pta nhi khud na kuch kre to kisi ko supari de dein meri😂😂',
 'toh tum akbar ke waris ho.....naya naya suslim bane ho bht tez hain tumme....converted muslim banke apne ammi ch me to phd kar li hogi',
 'Majlis tere jaise ko jawab Dena Sikha he hai. Ek number ka jahil Insaan.']

## Export

In [156]:
df_women.to_csv("cm_hate_gender.csv", index=False)
df_religion.to_csv("cm_hate_religion.csv", index=False)
df_lgbtq.to_csv("cm_hate_orientation.csv", index=False)

In [177]:
df_women["domain"] = "gender"
df_religion["domain"] = "religion"
df_lgbtq["domain"] = "orientation"
df = pd.concat([df_women, df_religion, df_lgbtq], axis=0, ignore_index=True)
df

Unnamed: 0,id,tweet,offense,codemixed,domain
0,1729049443876601905,@user @user @user @user @user Matlab sirf ladk...,0,1,gender
1,1729049269456589236,Pehle main bahut loyal tha tab mujhse koi ladk...,0,1,gender
2,1729049238007615845,Shit gharake SB aurat ko halala karate hai\n \...,0,1,gender
3,1729049218160144888,@user Yeh kya ho raha ha... kitna ghirna ha in...,0,1,gender
4,1729048935900262571,Suna he Ek ladki roj Twitter pe Sher Aya Sher ...,0,1,gender
...,...,...,...,...,...
636,1731396941114753326,@user chal bay tu mere se zyada gay hai,0,1,orientation
637,1731396133090365622,"@user Dollarsdikhao, yeh kesay bhee naachsin g...",0,1,orientation
638,1731395146665922983,@user Sorry to say that but ab logo ko khush d...,0,1,orientation
639,1731388742496096547,@user bhai tu bohot gay hai wasey,1,1,orientation


In [178]:
df.to_csv("cm_hate_combined.csv", index=False)

# Public Dataset

In [166]:
df_women_orig = pd.read_excel("Hate Speech Women Dataset.xlsx", sheet_name="annotations")
df_religion_orig = pd.read_excel("Hate Speech Religion Dataset.xlsx", sheet_name="annotations")
df_lgbtq_orig = pd.read_excel("Hate Speech LGBTQ Dataset.xlsx", sheet_name="annotations")

In [167]:
def create_public_df(df, df_orig):
    df_public = df_orig[["id", "label_1", "label_2"]].merge(df[["id", "offense"]])
    df_public["label_1"] = df_public["label_1"].apply(int)
    df_public["label_2"] = df_public["label_2"].apply(int)
    return df_public

In [168]:
df_women_public = create_public_df(df_women, df_women_orig)
df_women_public["domain"] = "gender"

df_religion_public = create_public_df(df_religion, df_religion_orig)
df_religion_public["domain"] = "religion"

df_lgbtq_public = create_public_df(df_lgbtq, df_lgbtq_orig)
df_lgbtq_public["domain"] = "orientation"

In [171]:
df_public = pd.concat([df_women_public, df_religion_public, df_lgbtq_public], axis=0, ignore_index=True)
df_public = df_public[["id", "domain", "label_1", "label_2", "offense"]]
df_public = df_public.rename(columns={"id":"tweet_id", "domain":"domain", 
                                      "label_1":"label_1", "label_2":"label_2", "offense":"majority_label"
                                     })
df_public

Unnamed: 0,tweet_id,domain,label_1,label_2,majority_label
0,1729049443876601905,gender,0,0,0
1,1729049269456589236,gender,0,0,0
2,1729049238007615845,gender,0,0,0
3,1729049218160144888,gender,0,0,0
4,1729048935900262571,gender,0,0,0
...,...,...,...,...,...
636,1731396941114753326,orientation,1,0,0
637,1731396133090365622,orientation,1,0,0
638,1731395146665922983,orientation,0,0,0
639,1731388742496096547,orientation,1,0,1


In [172]:
df_public.to_csv("hinglish_hatespeech.csv", index=False)

In [175]:
label_cnts = df_public["majority_label"].value_counts()
label_cnts/label_cnts.sum()

0    0.564743
1    0.435257
Name: majority_label, dtype: float64

## Language Identification Test (Hinglish)

In [17]:
tweets = df1["tweet"].tolist()

In [16]:
from transformers import pipeline

model = pipeline("token-classification", model = "l3cube-pune/hing-bert-lid")
result = model("yeh model chalta hai ki nai?")
result

[{'entity': 'HI',
  'score': 0.9861215,
  'index': 1,
  'word': 'ye',
  'start': 0,
  'end': 2},
 {'entity': 'HI',
  'score': 0.76267576,
  'index': 2,
  'word': '##h',
  'start': 2,
  'end': 3},
 {'entity': 'EN',
  'score': 0.99998903,
  'index': 3,
  'word': 'model',
  'start': 4,
  'end': 9},
 {'entity': 'HI',
  'score': 0.99491715,
  'index': 4,
  'word': 'cha',
  'start': 10,
  'end': 13},
 {'entity': 'HI',
  'score': 0.7841624,
  'index': 5,
  'word': '##lta',
  'start': 13,
  'end': 16},
 {'entity': 'HI',
  'score': 0.98446053,
  'index': 6,
  'word': 'hai',
  'start': 17,
  'end': 20},
 {'entity': 'HI',
  'score': 0.97508156,
  'index': 7,
  'word': 'ki',
  'start': 21,
  'end': 23},
 {'entity': 'HI',
  'score': 0.8099531,
  'index': 8,
  'word': 'na',
  'start': 24,
  'end': 26},
 {'entity': 'EN',
  'score': 0.85855985,
  'index': 9,
  'word': '##i',
  'start': 26,
  'end': 27},
 {'entity': 'EN',
  'score': 0.9989041,
  'index': 10,
  'word': '?',
  'start': 27,
  'end': 28}]

In [21]:
model(tweets[43])

[{'entity': 'HI',
  'score': 0.80155873,
  'index': 1,
  'word': 'ए',
  'start': 0,
  'end': 1},
 {'entity': 'EN',
  'score': 0.6006054,
  'index': 2,
  'word': '##क',
  'start': 1,
  'end': 2},
 {'entity': 'HI',
  'score': 0.99878865,
  'index': 3,
  'word': 'तरफ',
  'start': 3,
  'end': 6},
 {'entity': 'HI',
  'score': 0.6254365,
  'index': 4,
  'word': 'न',
  'start': 7,
  'end': 8},
 {'entity': 'HI',
  'score': 0.9868537,
  'index': 5,
  'word': '##ा',
  'start': 8,
  'end': 9},
 {'entity': 'HI',
  'score': 0.7897336,
  'index': 6,
  'word': '##र',
  'start': 9,
  'end': 10},
 {'entity': 'HI',
  'score': 0.9651025,
  'index': 7,
  'word': '##ी',
  'start': 10,
  'end': 11},
 {'entity': 'EN',
  'score': 0.62098646,
  'index': 8,
  'word': 'क',
  'start': 12,
  'end': 13},
 {'entity': 'HI',
  'score': 0.98527026,
  'index': 9,
  'word': 'स',
  'start': 15,
  'end': 16},
 {'entity': 'HI',
  'score': 0.93497354,
  'index': 10,
  'word': '##म',
  'start': 16,
  'end': 17},
 {'entity': '

In [43]:
hi_scores = []
en_scores = []
hi_counts = []
total_counts = []

for tweet in tqdm(tweets):
    tweet = clean_tweet(tweet)
    result = model(tweet)
    
    hi = [token["score"] for token in result if token["entity"] == "HI"]
    en = [token["score"] for token in result if token["entity"] == "EN"]
    
    hi_scores.append(sum(hi))
    en_scores.append(sum(en))
    hi_counts.append(len(hi))
    total_counts.append(len(result))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 278/278 [00:17<00:00, 15.50it/s]


In [88]:
df1["hi_score"] = hi_scores
df1["en_score"] = en_scores
df1["hi_count"] = hi_counts
df1["total_count"] = total_counts
df1["hi_score_per_token"] = df1["hi_score"]/df1["total_count"]

df1["codemixed"] = df1["hi_count"]/df1["total_count"] > 0.25 # Not Monolingual English

# df1["monolingual_english"] = df1["hi_count"]/df1["total_count"] < 0.125
# df1["monolingual_hindi"] = df1["hi_count"]/df1["total_count"] > 1-0.125 # Identifies Hinglish words as HI
# df1["codemixed"] = ~(df1["monolingual_english"] | df1["monolingual_hindi"])

In [89]:
df1[~df1["codemixed"]]#["tweet"].values

Unnamed: 0,id,tweet,label,hindi,english,hi_score,en_score,hi_count,total_count,codemixed,monolingual_english,monolingual_hindi,hi_score_per_token
31,1729045129699443057,@IndianTintin_ @mediacrooks calls him a 'Dehat...,1.0,4.624641,17.991381,4.624641,17.991381,5,23,False,False,False,0.201071
85,1729036502095761839,@Shiva_Veeramas Exactly. It's a crime for wome...,0.0,1.708471,12.989679,1.708471,12.989679,2,15,False,False,False,0.113898
151,1729027072184787057,"During my College days, I used to save some mo...",0.0,0.850821,54.514729,0.850821,54.514729,1,56,False,True,False,0.015193
209,1729014409908085188,@Pagal_aurat @mufaddal_vohra You are just blin...,1.0,1.895501,22.921377,1.895501,22.921377,2,25,False,True,False,0.07582
233,1729207267982278883,@preachaaaa Constructive criticism: try to not...,0.0,2.989354,46.571347,2.989354,46.571347,3,50,False,True,False,0.059787
237,1729191664508670325,The All India Trinamool Congress under the gui...,0.0,7.636111,30.964689,7.636111,30.964689,8,39,False,False,False,0.195798
266,1729046371582525732,Hello dear Randi 😊😊🤗🤗\n @Randirobics\n Good Mo...,0.0,10.191863,45.753986,10.191863,45.753986,11,58,False,False,False,0.175722
269,1725819588166762557,@SVOjha the verna system describe one in 4 cat...,0.0,12.20859,50.565623,12.20859,50.565623,15,67,False,False,False,0.182218


In [94]:
df1.sort_values(by="hi_score_per_token")["tweet"][:20].tolist()

["During my College days, I used to save some money for my future by not hanging with any girl, didn't used to go to any Restaurants/ watching movies. But now wasting money for filling forms of all these Marriage bureaus for something which is never going to happen.",
 '@preachaaaa Constructive criticism: try to not use word "Aurat" in your discourse because it is etymologically derogatory , as it means female genitals in Arabic.\n You can use alternate words like Stree for women or Mahila.',
 '@Pagal_aurat @mufaddal_vohra You are just blind hater of Dhoni, So talking to a Mf like you , is just waste of time 😪',
 "@Shiva_Veeramas Exactly. It's a crime for women and normal for heroes. 🤷🏽\u200d♀️",
 'Hello dear Randi 😊😊🤗🤗\n @Randirobics\n Good Morning ☕️🥐⚘️⚘️\n Hope you had a beautiful purple 💜 Weekend.\n Have you and yours a fabulous 💜 week ahead.\n Enjoy your Monday as much as possible.\n My best wishes. Take good care.\n Hi Rickles! 🤗🤗\n #MondayMorning #MondayThoughts',
 '@SVOjha the 

In [93]:
df1.sort_values(by="hi_score_per_token").iloc[:15]#["tweet"][:20].tolist()

Unnamed: 0,id,tweet,label,hindi,english,hi_score,en_score,hi_count,total_count,codemixed,monolingual_english,monolingual_hindi,hi_score_per_token
151,1729027072184787057,"During my College days, I used to save some mo...",0.0,0.850821,54.514729,0.850821,54.514729,1,56,False,True,False,0.015193
233,1729207267982278883,@preachaaaa Constructive criticism: try to not...,0.0,2.989354,46.571347,2.989354,46.571347,3,50,False,True,False,0.059787
209,1729014409908085188,@Pagal_aurat @mufaddal_vohra You are just blin...,1.0,1.895501,22.921377,1.895501,22.921377,2,25,False,True,False,0.07582
85,1729036502095761839,@Shiva_Veeramas Exactly. It's a crime for wome...,0.0,1.708471,12.989679,1.708471,12.989679,2,15,False,False,False,0.113898
266,1729046371582525732,Hello dear Randi 😊😊🤗🤗\n @Randirobics\n Good Mo...,0.0,10.191863,45.753986,10.191863,45.753986,11,58,False,False,False,0.175722
269,1725819588166762557,@SVOjha the verna system describe one in 4 cat...,0.0,12.20859,50.565623,12.20859,50.565623,15,67,False,False,False,0.182218
237,1729191664508670325,The All India Trinamool Congress under the gui...,0.0,7.636111,30.964689,7.636111,30.964689,8,39,False,False,False,0.195798
31,1729045129699443057,@IndianTintin_ @mediacrooks calls him a 'Dehat...,1.0,4.624641,17.991381,4.624641,17.991381,5,23,False,False,False,0.201071
238,1729189719903478238,Time to ban the word “Aurat” in India. \n Use ...,0.0,7.247824,22.398405,7.247824,22.398405,8,31,True,False,False,0.233801
35,1728675716680892892,@Akash7988Devil #AbhishekKumar was absolutely ...,1.0,8.842888,25.764855,8.842888,25.764855,10,36,True,False,False,0.245636
