In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import ast

In [None]:
model_name = "/content/drive/MyDrive/Claim Span/Models/DeBERTa-multilingual"

In [None]:
val_bengali = pd.read_json("<path-to-bengali-test-data>")
val_codemix = pd.read_json("<path-to-codemix-test-data>")
val_english = pd.read_json("<path-to-english-test-data>")
val_hindi = pd.read_json("<path-to-hindi-test-data>")

In [None]:
bengali_chars = r"\u0980-\u09FF"
hindi_chars = r"\u0900-\u097F"

In [None]:
def shorten_link(text_tokens):
    """Return a copy of ``text_tokens`` with every link token collapsed to ``"http"``.

    A token is treated as a link when it starts with ``"http"``. All other
    tokens are copied through unchanged, so the result has the same length
    and order as the input.
    """
    return ["http" if token.startswith("http") else token for token in text_tokens]

In [None]:
def link_preprocess(i,text):
    """Flatten newlines in ``text`` and collapse link tokens to ``"http"``.

    ``i`` is used only to identify the row in the diagnostic message. The
    text is split on whitespace, passed through ``shorten_link`` and joined
    back together with single spaces.
    """
    raw_tokens = re.sub("\n"," ",text).split()
    short_tokens = shorten_link(raw_tokens)
    # Sanity check: link shortening must map tokens one-to-one.
    if len(short_tokens) != len(raw_tokens):
        print(f"Error in {i}")
    return " ".join(short_tokens)

In [None]:
def text_preprocess(df):
    """Run ``link_preprocess`` over the ``"text"`` column of ``df``.

    Values that ``pd.notna`` reports as missing become empty strings; every
    other value is converted with ``str`` first. Returns a list holding one
    processed string per row, in positional row order.
    """
    def _as_text(value):
        return str(value) if pd.notna(value) else ""

    text_column = df["text"]
    return [
        link_preprocess(position, _as_text(text_column.iloc[position]))
        for position in range(len(df))
    ]

In [None]:
# Add the link-normalised text column to each test set (same order as before:
# Hindi, English, Bengali, code-mixed).
for frame in (val_hindi, val_english, val_bengali, val_codemix):
    frame["preprocessed_text"] = text_preprocess(frame)

In [None]:
def text_cleaning(text):
    """Replace every disallowed character in ``text`` with a single space.

    Characters that survive: the ranges named by ``bengali_chars`` and
    ``hindi_chars``, regex word characters, whitespace, and the literal
    characters ``#``, ``@`` and ``'``. Each removed character becomes one
    space, so runs of punctuation leave runs of spaces behind.
    """
    disallowed = re.compile(rf"[^{bengali_chars}{hindi_chars}\w\s#@']")
    return disallowed.sub(" ", text)

In [None]:
# Clean the link-normalised Hindi text row by row and store it as "clean_text".
clean_text_val_hi = []
for i in range(len(val_hindi)):
    c_text = text_cleaning(val_hindi["preprocessed_text"].iloc[i])
    clean_text_val_hi.append(c_text)

val_hindi["clean_text"] = clean_text_val_hi
# Show the first rows as a quick visual check of the cleaning step.
val_hindi.head()

Unnamed: 0,index,claims,text,preprocessed_text,clean_text
0,6598,[एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजे...,एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजेप...,एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजेप...,एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजेप...
1,6599,"[चुमार में घुसपैठ की कोशिश नाकाम , भारत बोला -...","अब चुमार में घुसपैठ की कोशिश नाकाम , भारत बोला...","अब चुमार में घुसपैठ की कोशिश नाकाम , भारत बोला...",अब चुमार में घुसपैठ की कोशिश नाकाम भारत बोला...
2,6600,"[ड्रग्स भी लेती थीं बीफ भी खाती थी , अश्लील फो...",RT @U64277340 : ड्रग्स भी लेती थीं बीफ भी खाती...,RT @U64277340 : ड्रग्स भी लेती थीं बीफ भी खाती...,RT @U64277340 ड्रग्स भी लेती थीं बीफ भी खाती...
3,6601,[अनलॉक - 4 में रेलवे चला सकता है 100 नई ट्रेने...,अनलॉक - 4 में रेलवे चला सकता है 100 नई ट्रेनें...,अनलॉक - 4 में रेलवे चला सकता है 100 नई ट्रेनें...,अनलॉक 4 में रेलवे चला सकता है 100 नई ट्रेनें...
4,6602,[अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं .],अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं . @U...,अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं . @U...,अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं @U...


In [None]:
# Clean the link-normalised Bengali text row by row and store it as "clean_text".
clean_text_val_bn = []
for i in range(len(val_bengali)):
    c_text = text_cleaning(val_bengali["preprocessed_text"].iloc[i])
    clean_text_val_bn.append(c_text)

val_bengali["clean_text"] = clean_text_val_bn
# Show the first rows as a quick visual check of the cleaning step.
val_bengali.head()

Unnamed: 0,id,text,claims,preprocessed_text,clean_text
0,BN1546,কৃষক বিরোধী বিজেপি দেশের লজ্জা!\n#FarmerProtes...,[কৃষক বিরোধী বিজেপি দেশের লজ্জা!\n#FarmerProte...,কৃষক বিরোধী বিজেপি দেশের লজ্জা! #FarmerProtest...,কৃষক বিরোধী বিজেপি দেশের লজ্জা #FarmerProtest...
1,BN1547,♥️\n#FarmLaws2020 #FarmersProtest #FarmBill ht...,[],♥️ #FarmLaws2020 #FarmersProtest #FarmBill http,#FarmLaws2020 #FarmersProtest #FarmBill http
2,BN1548,তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়ে...,[তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়...,তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়ে...,তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়ে...
3,BN1549,ফের বিস্ফোরক তথাগত। এখনই দিল্লি না গেলেও জবাব ...,[],ফের বিস্ফোরক তথাগত। এখনই দিল্লি না গেলেও জবাব ...,ফের বিস্ফোরক তথাগত। এখনই দিল্লি না গেলেও জবাব ...
4,BN1550,এই গণতান্ত্রিকভাবে নির্বাচিত সরকার আন্দোলনকারী...,[],এই গণতান্ত্রিকভাবে নির্বাচিত সরকার আন্দোলনকারী...,এই গণতান্ত্রিকভাবে নির্বাচিত সরকার আন্দোলনকারী...


In [None]:
# Clean the link-normalised English text row by row and store it as "clean_text".
clean_text_val_en = []
for i in range(len(val_english)):
    c_text = text_cleaning(val_english["preprocessed_text"].iloc[i])
    clean_text_val_en.append(c_text)

val_english["clean_text"] = clean_text_val_en
# Show the first rows as a quick visual check of the cleaning step.
val_english.head()

Unnamed: 0,index,claims,text,preprocessed_text,clean_text
0,6499,"[no vaccine will be adequately safe , as could...",@U20719141 @U55524931 We have to assume no vac...,@U20719141 @U55524931 We have to assume no vac...,@U20719141 @U55524931 We have to assume no vac...
1,6500,[screaming out for us to take Bill Gates vacci...,"@U41101034 Same as my union , screaming out fo...","@U41101034 Same as my union , screaming out fo...",@U41101034 Same as my union screaming out fo...
2,6501,[its the safest vaccine],This is why i Laff Talked all that shit went a...,This is why i Laff Talked all that shit went a...,This is why i Laff Talked all that shit went a...
3,6502,"[child developed brain abnormalities, she blam...",@U21887764 Ugh ... that is tragic . I have a c...,@U21887764 Ugh ... that is tragic . I have a c...,@U21887764 Ugh that is tragic I have a c...
4,6503,"[benefit from the 1986 vaccine law, media blac...",@U40601807 @U69515872 Heroes the direction tho...,@U40601807 @U69515872 Heroes the direction tho...,@U40601807 @U69515872 Heroes the direction tho...


In [None]:
# Clean the link-normalised code-mixed text row by row and store it as "clean_text".
# NOTE(review): `i` and `c_text` are module-level names here and stay bound
# after the loop; a later cell evaluates a bare `i`.
clean_text_val_cm = []
for i in range(len(val_codemix)):
    c_text = text_cleaning(val_codemix["preprocessed_text"].iloc[i])
    clean_text_val_cm.append(c_text)

val_codemix["clean_text"] = clean_text_val_cm
# Show the first rows as a quick visual check of the cleaning step.
val_codemix.head()

Unnamed: 0,id,text,claims,preprocessed_text,clean_text
0,CM1972,@desh_bhkt @seriousfunnyguy People stand by hi...,[],@desh_bhkt @seriousfunnyguy People stand by hi...,@desh_bhkt @seriousfunnyguy People stand by hi...
1,CM1973,#Aaj_Ka_Mudda\nदेश के किसानों का भ्रम दूर करने...,[देश के किसानों का भ्रम दूर करने के लिए देश भर...,#Aaj_Ka_Mudda देश के किसानों का भ्रम दूर करने ...,#Aaj_Ka_Mudda देश के किसानों का भ्रम दूर करने ...
2,CM1974,@abhishekaitc @BJP4India @AmitShah #PegasusSpy...,[],@abhishekaitc @BJP4India @AmitShah #PegasusSpy...,@abhishekaitc @BJP4India @AmitShah #PegasusSpy...
3,CM1975,Mam pls arrange a debate of Adam and Sahil wit...,[Mam pls arrange a debate of Adam and Sahil wi...,Mam pls arrange a debate of Adam and Sahil wit...,Mam pls arrange a debate of Adam and Sahil wit...
4,CM1976,#डकैत_गिरोह_को_जेल_में_डालो #डकैत_को_जेल_भेजो ...,[],#डकैत_गिरोह_को_जेल_में_डालो #डकैत_को_जेल_भेजो ...,#डकैत_गिरोह_को_जेल_में_डालो #डकैत_को_जेल_भेजो ...


In [None]:
def claim_preprocessing(claims):
    """Normalise claim spans with the same pipeline that is applied to the text.

    ``claim_tagging`` locates a claim by exact token equality against the
    cleaned text tokens, so a claim must be normalised exactly like the text:
    newlines flattened and link tokens collapsed to ``"http"``
    (``link_preprocess``), then disallowed characters replaced by spaces
    (``text_cleaning``). Previously the link step was skipped for claims, so
    a claim containing a URL could never match its text and was left
    untagged.

    Parameters
    ----------
    claims : list of str, or None
        Raw claim spans for one row.

    Returns
    -------
    list of str
        One normalised string per claim, in input order; ``[]`` when
        ``claims`` is ``None`` or empty.
    """
    if claims is None or len(claims) == 0:
        return []

    p_claims = []
    for claim_idx, text in enumerate(claims):
        # The index only feeds link_preprocess's diagnostic message.
        text = link_preprocess(claim_idx, text)
        p_claims.append(text_cleaning(text))

    return p_claims

In [None]:
# NOTE(review): leftover debug cell. `i` is whatever value the most recently
# executed module-level loop left behind, so this output depends on run order.
i

99

In [None]:
def claim_preprocessing_val(df):
  """Apply ``claim_preprocessing`` to the ``"claims"`` entry of every row.

  Returns a list with one processed claim list per row, in positional order.
  """
  claims_column = df["claims"]
  return [claim_preprocessing(claims_column.iloc[row]) for row in range(len(df))]

# Attach the normalised claims to each test set (English, Hindi, Bengali,
# code-mixed, in that order).
for frame in (val_english, val_hindi, val_bengali, val_codemix):
    frame["preprocessed_claims"] = claim_preprocessing_val(frame)

In [None]:
def text_to_tokens(df):
  """Split each ``"clean_text"`` value of ``df`` into whitespace-separated tokens.

  Returns a list of token lists, one per row in positional order. A blank
  string yields an empty token list.
  """
  cleaned = df["clean_text"]
  return [cleaned.iloc[row].strip().split() for row in range(len(df))]

# Tokenise all four test sets first, then attach the token lists as columns.
text_tokens_val_hi, text_tokens_val_en, text_tokens_val_bn, text_tokens_val_cm = [
    text_to_tokens(frame)
    for frame in (val_hindi, val_english, val_bengali, val_codemix)
]

val_hindi["text_tokens"] = text_tokens_val_hi
val_english["text_tokens"] = text_tokens_val_en
val_bengali["text_tokens"] = text_tokens_val_bn
val_codemix["text_tokens"] = text_tokens_val_cm

In [None]:
def claim_tagging(text_tokens,spans):
    """Build a 0/1 label per text token marking where claim spans occur.

    Each span is split on whitespace and searched for as a contiguous run of
    equal tokens in ``text_tokens``. Only the first occurrence of each span
    is marked with 1; spans that are blank or not found leave the labels
    untouched.
    """
    n_tokens = len(text_tokens)
    label = [0] * n_tokens
    for claim in spans:
        claim_tokens = claim.strip().split()
        width = len(claim_tokens)
        if width == 0:
            continue
        for start in range(n_tokens):
            # A slice that runs past the end is shorter than the claim and
            # therefore never compares equal.
            if text_tokens[start : start + width] == claim_tokens:
                label[start : start + width] = [1] * width
                break

    return label

In [None]:
def calim_label_tagging(df):
  """Compute the token-level claim labels for every row of ``df``.

  For each row, ``claim_tagging`` is called with that row's ``"text_tokens"``
  and ``"preprocessed_claims"``. Returns the label lists in positional order.
  """
  tokens_column = df["text_tokens"]
  spans_column = df["preprocessed_claims"]
  return [
      claim_tagging(tokens_column.iloc[row], spans_column.iloc[row])
      for row in range(len(df))
  ]

# Tag all four test sets first, then attach the label lists as columns.
claim_label_val_hi, claim_label_val_en, claim_label_val_bn, claim_label_val_cm = [
    calim_label_tagging(frame)
    for frame in (val_hindi, val_english, val_bengali, val_codemix)
]

val_hindi["claim_label"] = claim_label_val_hi
val_english["claim_label"] = claim_label_val_en
val_bengali["claim_label"] = claim_label_val_bn
val_codemix["claim_label"] = claim_label_val_cm

In [None]:
val_hindi.head()

Unnamed: 0,index,claims,text,preprocessed_text,clean_text,preprocessed_claims,text_tokens,claim_label
0,6598,[एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजे...,एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजेप...,एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजेप...,एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजेप...,[एक नचनिया के अवैध निर्माण पर कार्रवाई से बीजे...,"[एक, नचनिया, के, अवैध, निर्माण, पर, कार्रवाई, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,6599,"[चुमार में घुसपैठ की कोशिश नाकाम , भारत बोला -...","अब चुमार में घुसपैठ की कोशिश नाकाम , भारत बोला...","अब चुमार में घुसपैठ की कोशिश नाकाम , भारत बोला...",अब चुमार में घुसपैठ की कोशिश नाकाम भारत बोला...,[चुमार में घुसपैठ की कोशिश नाकाम भारत बोला ...,"[अब, चुमार, में, घुसपैठ, की, कोशिश, नाकाम, भार...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,6600,"[ड्रग्स भी लेती थीं बीफ भी खाती थी , अश्लील फो...",RT @U64277340 : ड्रग्स भी लेती थीं बीफ भी खाती...,RT @U64277340 : ड्रग्स भी लेती थीं बीफ भी खाती...,RT @U64277340 ड्रग्स भी लेती थीं बीफ भी खाती...,[ड्रग्स भी लेती थीं बीफ भी खाती थी अश्लील फो...,"[RT, @U64277340, ड्रग्स, भी, लेती, थीं, बीफ, भ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,6601,[अनलॉक - 4 में रेलवे चला सकता है 100 नई ट्रेने...,अनलॉक - 4 में रेलवे चला सकता है 100 नई ट्रेनें...,अनलॉक - 4 में रेलवे चला सकता है 100 नई ट्रेनें...,अनलॉक 4 में रेलवे चला सकता है 100 नई ट्रेनें...,[अनलॉक 4 में रेलवे चला सकता है 100 नई ट्रेने...,"[अनलॉक, 4, में, रेलवे, चला, सकता, है, 100, नई,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,6602,[अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं .],अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं . @U...,अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं . @U...,अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं @U...,[अभी तक आरोपी लाखों लीटर ऑयल चुरा चुके हैं ],"[अभी, तक, आरोपी, लाखों, लीटर, ऑयल, चुरा, चुके,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]"


In [None]:
val_bengali.head()

Unnamed: 0,id,text,claims,preprocessed_text,clean_text,preprocessed_claims,text_tokens,claim_label
0,BN1546,কৃষক বিরোধী বিজেপি দেশের লজ্জা!\n#FarmerProtes...,[কৃষক বিরোধী বিজেপি দেশের লজ্জা!\n#FarmerProte...,কৃষক বিরোধী বিজেপি দেশের লজ্জা! #FarmerProtest...,কৃষক বিরোধী বিজেপি দেশের লজ্জা #FarmerProtest...,[কৃষক বিরোধী বিজেপি দেশের লজ্জা #FarmerProtes...,"[কৃষক, বিরোধী, বিজেপি, দেশের, লজ্জা, #FarmerPr...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]"
1,BN1547,♥️\n#FarmLaws2020 #FarmersProtest #FarmBill ht...,[],♥️ #FarmLaws2020 #FarmersProtest #FarmBill http,#FarmLaws2020 #FarmersProtest #FarmBill http,[],"[#FarmLaws2020, #FarmersProtest, #FarmBill, http]","[0, 0, 0, 0]"
2,BN1548,তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়ে...,[তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়...,তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়ে...,তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়ে...,[তাঁকে ঘিরে রাজ্য রাজনীতিতে যে আলোচনা শুরু হয়...,"[তাঁকে, ঘিরে, রাজ্য, রাজনীতিতে, যে, আলোচনা, শু...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,BN1549,ফের বিস্ফোরক তথাগত। এখনই দিল্লি না গেলেও জবাব ...,[],ফের বিস্ফোরক তথাগত। এখনই দিল্লি না গেলেও জবাব ...,ফের বিস্ফোরক তথাগত। এখনই দিল্লি না গেলেও জবাব ...,[],"[ফের, বিস্ফোরক, তথাগত।, এখনই, দিল্লি, না, গেলে...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,BN1550,এই গণতান্ত্রিকভাবে নির্বাচিত সরকার আন্দোলনকারী...,[],এই গণতান্ত্রিকভাবে নির্বাচিত সরকার আন্দোলনকারী...,এই গণতান্ত্রিকভাবে নির্বাচিত সরকার আন্দোলনকারী...,[],"[এই, গণতান্ত্রিকভাবে, নির্বাচিত, সরকার, আন্দোল...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
val_english.head()

Unnamed: 0,index,claims,text,preprocessed_text,clean_text,preprocessed_claims,text_tokens,claim_label
0,6499,"[no vaccine will be adequately safe , as could...",@U20719141 @U55524931 We have to assume no vac...,@U20719141 @U55524931 We have to assume no vac...,@U20719141 @U55524931 We have to assume no vac...,[no vaccine will be adequately safe as could...,"[@U20719141, @U55524931, We, have, to, assume,...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,6500,[screaming out for us to take Bill Gates vacci...,"@U41101034 Same as my union , screaming out fo...","@U41101034 Same as my union , screaming out fo...",@U41101034 Same as my union screaming out fo...,[screaming out for us to take Bill Gates vacci...,"[@U41101034, Same, as, my, union, screaming, o...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,6501,[its the safest vaccine],This is why i Laff Talked all that shit went a...,This is why i Laff Talked all that shit went a...,This is why i Laff Talked all that shit went a...,[its the safest vaccine],"[This, is, why, i, Laff, Talked, all, that, sh...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,6502,"[child developed brain abnormalities, she blam...",@U21887764 Ugh ... that is tragic . I have a c...,@U21887764 Ugh ... that is tragic . I have a c...,@U21887764 Ugh that is tragic I have a c...,"[child developed brain abnormalities, she blam...","[@U21887764, Ugh, that, is, tragic, I, have, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
4,6503,"[benefit from the 1986 vaccine law, media blac...",@U40601807 @U69515872 Heroes the direction tho...,@U40601807 @U69515872 Heroes the direction tho...,@U40601807 @U69515872 Heroes the direction tho...,"[benefit from the 1986 vaccine law, media blac...","[@U40601807, @U69515872, Heroes, the, directio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
val_codemix.head()

Unnamed: 0,id,text,claims,preprocessed_text,clean_text,preprocessed_claims,text_tokens,claim_label
0,CM1972,@desh_bhkt @seriousfunnyguy People stand by hi...,[],@desh_bhkt @seriousfunnyguy People stand by hi...,@desh_bhkt @seriousfunnyguy People stand by hi...,[],"[@desh_bhkt, @seriousfunnyguy, People, stand, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CM1973,#Aaj_Ka_Mudda\nदेश के किसानों का भ्रम दूर करने...,[देश के किसानों का भ्रम दूर करने के लिए देश भर...,#Aaj_Ka_Mudda देश के किसानों का भ्रम दूर करने ...,#Aaj_Ka_Mudda देश के किसानों का भ्रम दूर करने ...,[देश के किसानों का भ्रम दूर करने के लिए देश भर...,"[#Aaj_Ka_Mudda, देश, के, किसानों, का, भ्रम, दू...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,CM1974,@abhishekaitc @BJP4India @AmitShah #PegasusSpy...,[],@abhishekaitc @BJP4India @AmitShah #PegasusSpy...,@abhishekaitc @BJP4India @AmitShah #PegasusSpy...,[],"[@abhishekaitc, @BJP4India, @AmitShah, #Pegasu...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CM1975,Mam pls arrange a debate of Adam and Sahil wit...,[Mam pls arrange a debate of Adam and Sahil wi...,Mam pls arrange a debate of Adam and Sahil wit...,Mam pls arrange a debate of Adam and Sahil wit...,[Mam pls arrange a debate of Adam and Sahil wi...,"[Mam, pls, arrange, a, debate, of, Adam, and, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,CM1976,#डकैत_गिरोह_को_जेल_में_डालो #डकैत_को_जेल_भेजो ...,[],#डकैत_गिरोह_को_जेल_में_डालो #डकैत_को_जेल_भेजो ...,#डकैत_गिरोह_को_जेल_में_डालो #डकैत_को_जेल_भेजो ...,[],"[#डकैत_गिरोह_को_जेल_में_डालो, #डकैत_को_जेल_भेज...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
label_list = ["O", "Claim"]

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_and_align_labels(text_tokens, claim_label):
    """Tokenise pre-split words and give every sub-word token a label.

    A sub-word token receives the label of the word it came from
    (``claim_label[word_idx]``); tokens with no word id get -100. The label
    list is stored under the ``"label"`` key of the returned tokenizer output.
    """
    encoding = tokenizer(text_tokens, truncation=True, is_split_into_words=True, max_length = 512)

    encoding["label"] = [
        -100 if word_idx is None else claim_label[word_idx]
        for word_idx in encoding.word_ids()
    ]
    return encoding

In [None]:
from tqdm.auto import tqdm

def tokenize_and_align_labels_val(df):
  """Tokenise every row of ``df`` and collect the aligned model inputs.

  Reads the ``"text_tokens"`` and ``"claim_label"`` columns and runs
  ``tokenize_and_align_labels`` on each row.

  Returns
  -------
  tuple of three lists
      ``(input_ids, attention_masks, labels)``, each with one entry per row
      in row order.
  """
  val_input_ids = []
  val_attention_masks = []
  val_labels = []
  # iterrows() is a generator with no length, so tqdm needs an explicit
  # total to show progress instead of a bare iteration counter.
  for _, row in tqdm(df.iterrows(), total=len(df)):
      tokenized_inputs = tokenize_and_align_labels(row["text_tokens"], row["claim_label"])
      val_input_ids.append(tokenized_inputs["input_ids"])
      val_attention_masks.append(tokenized_inputs["attention_mask"])
      val_labels.append(tokenized_inputs["label"])
  return val_input_ids, val_attention_masks, val_labels

In [None]:
val_input_ids_en, val_attention_masks_en, val_labels_en = tokenize_and_align_labels_val(val_english)
len(val_english), len(val_input_ids_en)

0it [00:00, ?it/s]

(1500, 1500)

In [None]:
val_input_ids_hi, val_attention_masks_hi, val_labels_hi = tokenize_and_align_labels_val(val_hindi)
len(val_hindi), len(val_input_ids_hi)

0it [00:00, ?it/s]

(1500, 1500)

In [None]:
val_input_ids_bn, val_attention_masks_bn, val_labels_bn = tokenize_and_align_labels_val(val_bengali)
len(val_bengali), len(val_input_ids_bn)

0it [00:00, ?it/s]

(110, 110)

In [None]:
val_input_ids_cm, val_attention_masks_cm, val_labels_cm = tokenize_and_align_labels_val(val_codemix)
len(val_codemix), len(val_input_ids_cm)

0it [00:00, ?it/s]

(100, 100)

In [None]:
dict_val_en = {'input_ids' : val_input_ids_en, 'attention_mask' : val_attention_masks_en, 'labels' : val_labels_en}
dict_val_hi = {'input_ids' : val_input_ids_hi, 'attention_mask' : val_attention_masks_hi, 'labels' : val_labels_hi}
dict_val_bn = {'input_ids' : val_input_ids_bn, 'attention_mask' : val_attention_masks_bn, 'labels' : val_labels_bn}
dict_val_cm = {'input_ids' : val_input_ids_cm, 'attention_mask' : val_attention_masks_cm, 'labels' : val_labels_cm}

In [None]:
data_val_en = pd.DataFrame(dict_val_en)
data_val_hi = pd.DataFrame(dict_val_hi)
data_val_bn = pd.DataFrame(dict_val_bn)
data_val_cm = pd.DataFrame(dict_val_cm)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# muril_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id)

In [None]:
def flatten(list_of_lists):
  """Concatenate the items of every sub-list into one flat list, keeping order."""
  flattened = []
  for sublist in list_of_lists:
    flattened.extend(sublist)
  return flattened

In [None]:
from sklearn.metrics import f1_score, accuracy_score, jaccard_score, precision_score, recall_score

def compute_metrics(logits_and_labels):
      """Token-level metrics from a ``(logits, labels)`` pair.

      Predictions are the argmax over the last axis of the logits. Positions
      whose true label is -100 are dropped from both labels and predictions
      before scoring. Precision, recall, F1 and Jaccard are macro-averaged.
      Returns a dict keyed by metric name.

      NOTE(review): a later cell defines another ``compute_metrics(ground,
      preds)``; once that cell has run, this version is no longer reachable
      by name.
      """
      logits, labels = logits_and_labels
      preds = np.argmax(logits, axis=-1)

      # True labels with the -100 positions removed, already flattened.
      labels_flat = [t for label_row in labels for t in label_row if t != -100]

      # Predictions at the same surviving positions, already flattened.
      preds_flat = [
          p
          for pred_row, label_row in zip(preds, labels)
          for p, t in zip(pred_row, label_row)
          if t != -100
      ]

      accuracy = accuracy_score(labels_flat, preds_flat)
      macro_f1 = f1_score(labels_flat, preds_flat, average='macro')
      macro_precision = precision_score(labels_flat, preds_flat, average='macro')
      macro_recall = recall_score(labels_flat, preds_flat, average='macro')
      macro_jaccard = jaccard_score(labels_flat, preds_flat, average='macro')

      metrics = {}
      metrics['accuracy'] = accuracy
      metrics['precision'] = macro_precision
      metrics['recall'] = macro_recall
      metrics['f1'] = macro_f1
      metrics['jaccard'] = macro_jaccard
      return metrics

In [None]:
import torch

In [None]:
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [None]:
def prediction(text_tokens):
    """Predict one label id per sub-word token for a single pre-split text.

    Returns ``(labels, word_ids)``: the argmax label ids of the first (only)
    sequence in the batch as a plain list, and the tokenizer's word ids for
    that sequence.
    """
    encoded = tokenizer(text_tokens, return_tensors="pt", padding=True, truncation=True, is_split_into_words=True, max_length = 512)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        label_ids = torch.argmax(logits, dim=-1)

    return label_ids[0].tolist(), encoded.word_ids()

In [None]:
# Converting subword predictions to word predictions using word ids

def subword_to_word_predictions(subword_predictions, word_ids):
    """Collapse sub-word predictions into one 0/1 label per word.

    A word is labelled 1 when at least one of its sub-word tokens was
    predicted as 1, otherwise 0.

    Parameters
    ----------
    subword_predictions : list of list of int
        Predicted label id per sub-word token, one inner list per text.
    word_ids : list of list of (int or None)
        For each text, the word index of every sub-word token; ``None``
        marks tokens that belong to no word.

    Returns
    -------
    list of list of int
        One 0/1 list per text, covering word indices ``0 .. max word id``.
        A text with no word-bearing tokens yields ``[]``.
    """
    actual_pred_labels = []
    for w_ids, sub_preds in zip(word_ids, subword_predictions):
        # Size the output from the largest word id actually present rather
        # than from w_ids[-2]: that position is only the last word when the
        # sequence ends in exactly one special token, and indexing it fails
        # outright on sequences shorter than two tokens.
        covered = [w_idx for w_idx in w_ids if w_idx is not None]
        if not covered:
            actual_pred_labels.append([])
            continue

        allign_pred = [0] * (max(covered) + 1)
        for w_idx, pred in zip(w_ids, sub_preds):
            if w_idx is not None and pred == 1:
                allign_pred[w_idx] = 1
        actual_pred_labels.append(allign_pred)

    return actual_pred_labels

In [None]:
def prediction_and_alignment(df):
  """Predict word-level claim labels for every row and align them with the gold labels.

  For each row the model is run on ``"text_tokens"``, sub-word predictions
  are collapsed to word level, and the result is fitted to the length of
  that row's ``"claim_label"``: missing trailing words (for example words
  cut off by tokenizer truncation) are filled with 0.

  If the prediction and word-id lists for a row disagree in length, the row
  index is printed and the row is predicted as all zeros. It is NOT dropped,
  because dropping it would shift every later prediction against the gold
  labels.

  Side effect: writes the predictions to ``df["predicted_labels"]``.

  Returns
  -------
  tuple
      ``(predictions, ground)``, two lists of per-row label lists with
      matching lengths row by row.
  """
  ground = df["claim_label"].tolist()

  predictions = []
  for i in range(len(df)):
    p_labels, w_ids = prediction(df["text_tokens"].iloc[i])
    if len(p_labels) == len(w_ids):
      word_preds = subword_to_word_predictions([p_labels], [w_ids])[0]
    else:
      print(i) # Error in prediction
      word_preds = []

    # Fit to the gold length so the flattened metrics compare like with like.
    target_len = len(ground[i])
    word_preds = word_preds[:target_len]
    word_preds = word_preds + [0] * (target_len - len(word_preds))
    predictions.append(word_preds)

  df["predicted_labels"] = predictions

  return predictions, ground

In [None]:
p_ben, g_ben = prediction_and_alignment(val_bengali)
p_eng, g_eng = prediction_and_alignment(val_english)
p_hi, g_hi = prediction_and_alignment(val_hindi)
p_cm, g_cm = prediction_and_alignment(val_codemix)

In [None]:
def compute_metrics(ground, preds):
      """Score word-level predictions against gold labels.

      Both arguments are lists of per-row label lists; they are flattened
      before scoring. Returns the tuple ``(accuracy, precision, recall, f1,
      jaccard)``, where all but accuracy are macro-averaged.
      """
      y_true = flatten(ground)
      y_pred = flatten(preds)

      accuracy = accuracy_score(y_true, y_pred)
      macro_f1 = f1_score(y_true, y_pred, average='macro')
      macro_precision = precision_score(y_true, y_pred, average='macro')
      macro_recall = recall_score(y_true, y_pred, average='macro')
      macro_jaccard = jaccard_score(y_true, y_pred, average='macro')

      return accuracy, macro_precision, macro_recall, macro_f1, macro_jaccard

In [None]:
language_list = ["Bengali", "English", "Hindi", "Codemix"]
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
jaccard_list = []

In [None]:
# Bengali test-set metrics.
# Recall is bound to `rec`, not `re`: unpacking into a module-level `re`
# would overwrite the imported regex module and break the preprocessing
# cells on any re-run.
acc, pre, rec, f1, jac = compute_metrics(g_ben, p_ben)

print(f"Accuracy = {acc}, precision = {pre}, recall = {rec}, F1 = {f1}, Jaccard = {jac}")

accuracy_list.append(acc)
precision_list.append(pre)
recall_list.append(rec)
f1_list.append(f1)
jaccard_list.append(jac)

Accuracy = 0.7448051948051948, precision = 0.7648795487558716, recall = 0.7770143268120153, F1 = 0.7439500033634008, Jaccard = 0.5925143379442188


In [None]:
# English test-set metrics.
# Recall is bound to `rec`, not `re`: unpacking into a module-level `re`
# would overwrite the imported regex module and break the preprocessing
# cells on any re-run.
acc, pre, rec, f1, jac = compute_metrics(g_eng, p_eng)

print(f"Accuracy = {acc}, precision = {pre}, recall = {rec}, F1 = {f1}, Jaccard = {jac}")

accuracy_list.append(acc)
precision_list.append(pre)
recall_list.append(rec)
f1_list.append(f1)
jaccard_list.append(jac)

Accuracy = 0.7919538297753124, precision = 0.7853636430914153, recall = 0.7829213201488422, F1 = 0.7840420446515783, Jaccard = 0.6466966312351214


In [None]:
# Hindi test-set metrics.
# Recall is bound to `rec`, not `re`: unpacking into a module-level `re`
# would overwrite the imported regex module and break the preprocessing
# cells on any re-run.
acc, pre, rec, f1, jac = compute_metrics(g_hi, p_hi)

print(f"Accuracy = {acc}, precision = {pre}, recall = {rec}, F1 = {f1}, Jaccard = {jac}")

accuracy_list.append(acc)
precision_list.append(pre)
recall_list.append(rec)
f1_list.append(f1)
jaccard_list.append(jac)

Accuracy = 0.8404085257548846, precision = 0.8361627138632981, recall = 0.8302137587794552, F1 = 0.8328131764985838, Jaccard = 0.7151204306268798


In [None]:
# Code-mixed test-set metrics.
# Recall is bound to `rec`, not `re`: unpacking into a module-level `re`
# would overwrite the imported regex module and break the preprocessing
# cells on any re-run.
acc, pre, rec, f1, jac = compute_metrics(g_cm, p_cm)

print(f"Accuracy = {acc}, precision = {pre}, recall = {rec}, F1 = {f1}, Jaccard = {jac}")

accuracy_list.append(acc)
precision_list.append(pre)
recall_list.append(rec)
f1_list.append(f1)
jaccard_list.append(jac)

Accuracy = 0.8316326530612245, precision = 0.7650003467927902, recall = 0.7929715097064125, F1 = 0.776789490181504, Jaccard = 0.6485290394311002


In [None]:
val_english.columns

Index(['index', 'claims', 'text', 'preprocessed_text', 'clean_text',
       'preprocessed_claims', 'text_tokens', 'claim_label',
       'predicted_labels'],
      dtype='object')

In [None]:
model_name.split("/")[-1]

'XLM-RoBERTa'

In [None]:
import os

# Last path component of the model directory names the output folder.
model_name_x = model_name.split("/")[-1]
scheme = "Binary"

# Build the directory from `scheme` (not a second hard-coded "Binary") so it
# always matches the paths the prediction and metrics files are written to.
os.makedirs(f"/content/drive/MyDrive/Claim Span/Test Predictions/{model_name_x}/{scheme}", exist_ok = True)

In [None]:
# Keep only the identifier, raw text, gold labels and predicted labels.
# English and Hindi identify rows by "index"; Bengali and code-mixed by "id".
english_preds = val_english[['index', 'text', 'claim_label', 'predicted_labels']].copy()
hindi_preds = val_hindi[['index', 'text', 'claim_label', 'predicted_labels']].copy()
bengali_preds = val_bengali[['id', 'text', 'claim_label', 'predicted_labels']].copy()
codemix_preds = val_codemix[['id', 'text', 'claim_label', 'predicted_labels']].copy()

# One JSON file per language, written as a list of row records.
english_preds.to_json(f"/content/drive/MyDrive/Claim Span/Test Predictions/{model_name_x}/{scheme}/english_preds.json", orient = "records")
hindi_preds.to_json(f"/content/drive/MyDrive/Claim Span/Test Predictions/{model_name_x}/{scheme}/hindi_preds.json", orient = "records")
bengali_preds.to_json(f"/content/drive/MyDrive/Claim Span/Test Predictions/{model_name_x}/{scheme}/bengali_preds.json", orient = "records")
codemix_preds.to_json(f"/content/drive/MyDrive/Claim Span/Test Predictions/{model_name_x}/{scheme}/codemix_preds.json", orient = "records")

In [None]:
# Collect the per-language metrics into one table and save it as CSV.
# The mapping is deliberately not named `dict`: rebinding that name would
# shadow the builtin for every cell run afterwards.
metrics_by_column = {'Language' : language_list, 'Accuracy' : accuracy_list, 'Precision' : precision_list, 'Recall' : recall_list, 'F1' : f1_list, 'Jaccard' : jaccard_list}
df = pd.DataFrame(metrics_by_column)
# index=False: do not write the row index as an extra CSV column.
df.to_csv(f"/content/drive/MyDrive/Claim Span/Test Predictions/{model_name_x}/{scheme}/Metrics.csv", index = False)