In [1]:
import pandas as pd
from IPython.display import display
from pathlib import Path
import os
from zipfile import ZipFile

In [2]:
TEAMNAME="3Idiots"
LANGNAME_MAPS = {
    "ENG": "english",
    "IBEN": "bengali",
    "HIN": "hindi"
}
PREDICTION_FILENAME_TEMPLATE="{teamname}_{lang}_{task}_run_{run_id}.csv"
DESCRIPTION_FILENAME_TEMPLATE="{teamname}_{lang}_{task}_run_{run_id}.txt"
ZIP_FILENAME_TEMPLATE="{teamname}_{lang}_{task}_run_{run_id}.zip"

In [3]:
def add_marginalized_predictions_for_subtask_C(df):
    column_names=df.filter(like='_probs', axis=1).columns.tolist()
    listA=list(set([
        x.split('-')[0] 
        for x in column_names]))
    listB=list(set([
        x[x.find('-')+1:x.find('_')] 
        for x in column_names]))
    for label in listA:
        filter_str=r'^{}-'.format(label)
        df[label+'_probs']=df.filter(regex=filter_str, axis=1).sum(axis=1)
    for label in listB:
        filter_str="-{}".format(label)
        df[label+'_probs']=df.filter(regex=filter_str, axis=1).sum(axis=1)
    prob_listA=[x+'_probs' for x in listA]
    prob_listB=[x+'_probs' for x in listB]
    
    df['Sub-task B_preds']=df[prob_listB].idxmax(axis=1).apply(
        lambda x: pd.Series(str(x).split('_'))[0])
    df['Sub-task A_preds']=df[prob_listA].idxmax(axis=1).apply(
        lambda x: pd.Series(str(x).split('_'))[0])
    df['Sub-task C_mpreds'] = df['Sub-task A_preds'].str.cat(df['Sub-task B_preds'], "-")
    return df

In [4]:
def generate_description_file(
    lang, model, task, run_id, data_key="test",
    use_task_C=False, use_task_C_marginalized=False,
    use_all_lang=False
):
    submission_data_dir = os.path.join("../run_submissions/", lang, model)
    os.makedirs(submission_data_dir, exist_ok=True)
    
    subtask_c_marginalized_description = f"""For this model the marginal probabilities of labels for subtask {task} are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask."""
    
    subtask_c_no_marginalized_description = f"""
The label with the highest probability is selected as the predicted label. 
The subtask {task} component of this predicted label is used as a prediction for this subtask."""
    
    subtask_c_description = f"""For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

{subtask_c_marginalized_description if use_task_C_marginalized else subtask_c_no_marginalized_description}
"""
    
    no_subtask_c_description = f"""This model was trained to predict labels for subtask {task}."""
    
    all_lang_description = f"""This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.
"""
    
    single_lang_description = f"""Only data from {LANGNAME_MAPS[lang]} language was used for training.""" 
    
    
    description = f"""This is submission {run_id} for language {LANGNAME_MAPS[lang]} for Sub-task {task}.
The submission was generated using the {model} transformer model. 
The model only used the text as a feature for prediction. 

{all_lang_description if use_all_lang else single_lang_description}

{subtask_c_description if use_task_C else no_subtask_c_description}

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020
"""
    
    filename = DESCRIPTION_FILENAME_TEMPLATE.format(
        teamname=TEAMNAME, lang=LANGNAME_MAPS[lang], task=task, run_id=run_id
    )
    description_file = Path(os.path.join(submission_data_dir, filename))
    
    with open(description_file, "w+") as fp:
        print(description)
        print(description, file=fp)
        
    return Path(description_file)

In [5]:
def generate_prediction_file(
    lang, model, task, run_id, data_key="test",
    use_task_C=False, use_task_C_marginalized=False,
    use_all_lang=False
):
    """Skip tasks is {task_id1, task_id2}"""
    task_orders = {"A": 0, "B": 1}
    task_order = task_orders[task]
    task = f"Sub-task {task}"
    
    raw_file_path = f"../data/raw/{lang.lower()}/trac2_{lang.lower()}_{data_key}.csv"
    df = pd.read_csv(raw_file_path, sep=",")
    submission_data_dir = os.path.join("../run_submissions/", lang, model)
    os.makedirs(submission_data_dir, exist_ok=True)

    pred_file_path = "../{lang}/{{task}}/output/{model}/{data_key}.tsv".format(
        lang="ALL" if use_all_lang else lang, model=model, data_key=data_key
    )
    #pred_file_path = f"../{lang}/{task}/outputs/{model}/{data_key}.tsv"
    if use_task_C:
        pred_file_path = pred_file_path.format(task="Sub-task C")
    else:
        pred_file_path = pred_file_path.format(task=task)
    
    print(raw_file_path, pred_file_path)
    df_pred = pd.read_csv(pred_file_path, sep="\t")
    if use_task_C and use_task_C_marginalized:
        df_pred = add_marginalized_predictions_for_subtask_C(df_pred)
    if use_all_lang:
        df_pred = df_pred[df_pred["id"].str.split("-", expand=True)[0] == lang].copy().assign(
            id=df_pred["id"].str.split("-", n=1, expand=True)[0]
        ).reset_index(drop=True)
    display(df_pred.head())
    print(df_pred.shape)
    assert df.shape[0] == df_pred.shape[0], f"Shape mismatch: {df.shape[0]} != {df_pred.shape[0]}"
    pred_label = f"{task}_preds"
    if use_task_C:
        if use_task_C_marginalized:
            df[pred_label] = df_pred["Sub-task C_mpreds"].str.split("-", expand=True)[task_order]
        else:
            df[pred_label] = df_pred["Sub-task C_preds"].str.split("-", expand=True)[task_order]
    else:
        df[pred_label] = df_pred[pred_label]
    display(df.head())
    display(df[pred_label].value_counts().to_frame().assign(
        proportion=lambda x: x[pred_label]*100./x[pred_label].sum()
    ))
    filename = PREDICTION_FILENAME_TEMPLATE.format(
        teamname=TEAMNAME, lang=LANGNAME_MAPS[lang], task=task.split()[1], run_id=run_id
    )
    prediction_file = Path(os.path.join(submission_data_dir, filename))
    df[["ID", pred_label]].rename(
        columns={pred_label: "label"}
    ).to_csv(prediction_file, sep=",", index=False)
    return prediction_file

In [6]:
def write_run_submission_files(
    lang, model, task, run_id, data_key="test",
    use_task_C=False, use_task_C_marginalized=False,
    use_all_lang=False
):
    submission_data_dir = os.path.join("../run_submissions/", lang, model)
    os.makedirs(submission_data_dir, exist_ok=True)
    prediction_file = generate_prediction_file(
        lang, model, task, run_id, data_key=data_key,
        use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
        use_all_lang=use_all_lang
    )
    description_file = generate_description_file(
        lang, model, task, run_id, data_key=data_key,
        use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
        use_all_lang=use_all_lang
    )
    filename = ZIP_FILENAME_TEMPLATE.format(
        teamname=TEAMNAME, lang=LANGNAME_MAPS[lang], task=task, run_id=run_id
    )
    zip_file = Path(os.path.join(submission_data_dir, filename))
    
    with ZipFile(zip_file, mode="w") as zfp:
        zfp.write(prediction_file, "predictions.csv")
        zfp.write(description_file, "description.txt")
    return (zip_file, prediction_file, description_file)

# English

## Subtask A

In [7]:
run_id=1
lang="ENG"
model="bert-base-multilingual-uncased"
task="A"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/eng/trac2_eng_test.csv ../ALL/Sub-task A/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task A_preds,OAG_probs,NAG_probs,CAG_probs
0,ENG,OAG,NAG,0.001645,0.99742,0.000935
1,ENG,OAG,CAG,0.029705,0.288204,0.682091
2,ENG,OAG,NAG,0.015609,0.521193,0.463198
3,ENG,OAG,NAG,0.001297,0.978014,0.020689
4,ENG,OAG,NAG,0.003868,0.977683,0.018448


(1200, 6)


Unnamed: 0,ID,Text,Sub-task A_preds
0,YoutubeCorpus37,https://www.youtube.com/watch?v=4iejjszkflo,NAG
1,C37.1,shame on society & culture trying to justify t...,CAG
2,C37.2,"just wanna say, there is no gay gene. you can ...",NAG
3,C37.3,"well, if a one twin struggles with self confid...",NAG
4,C37.4,i am an identical twin and we are both homosex...,NAG


Unnamed: 0,Sub-task A_preds,proportion
NAG,773,64.416667
OAG,258,21.5
CAG,169,14.083333


This is submission 1 for language english for Sub-task A.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


This model was trained to predict labels for subtask A.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/ENG/bert-base-multilingual-uncased/3Idiots_english_A_run_1.zip'),
 WindowsPath('../run_submissions/ENG/bert-base-multilingual-uncased/3Idiots_english_A_run_1.csv'),
 WindowsPath('../run_submissions/ENG/bert-base-multilingual-uncased/3Idiots_english_A_run_1.txt'))

In [8]:
run_id=2
lang="ENG"
model="bert-base-uncased"
task="A"
use_task_C=True
use_task_C_marginalized=False
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/eng/trac2_eng_test.csv ../ENG/Sub-task C/output/bert-base-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs
0,0,OAG-GEN,NAG-NGEN,0.001567,0.000765,0.001478,0.992979,0.000813,0.002397
1,1,OAG-GEN,CAG-NGEN,0.006747,0.066613,0.010236,0.048054,0.015565,0.852784
2,2,OAG-GEN,NAG-GEN,0.074324,0.043425,0.733557,0.020363,0.077365,0.050965
3,3,OAG-GEN,NAG-GEN,0.081505,0.048429,0.713062,0.017402,0.087117,0.052485
4,4,OAG-GEN,NAG-GEN,0.069995,0.034533,0.754627,0.040436,0.063511,0.036898


(1200, 9)


Unnamed: 0,ID,Text,Sub-task A_preds
0,YoutubeCorpus37,https://www.youtube.com/watch?v=4iejjszkflo,NAG
1,C37.1,shame on society & culture trying to justify t...,CAG
2,C37.2,"just wanna say, there is no gay gene. you can ...",NAG
3,C37.3,"well, if a one twin struggles with self confid...",NAG
4,C37.4,i am an identical twin and we are both homosex...,NAG


Unnamed: 0,Sub-task A_preds,proportion
NAG,754,62.833333
OAG,228,19.0
CAG,218,18.166667


This is submission 2 for language english for Sub-task A.
The submission was generated using the bert-base-uncased transformer model. 
The model only used the text as a feature for prediction. 

Only data from english language was used for training.

For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 


The label with the highest probability is selected as the predicted label. 
The subtask A component of this predicted label is used as a prediction for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_A_run_2.zip'),
 WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_A_run_2.csv'),
 WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_A_run_2.txt'))

In [9]:
run_id=3
lang="ENG"
model="bert-base-uncased"
task="A"
use_task_C=True
use_task_C_marginalized=True
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/eng/trac2_eng_test.csv ../ENG/Sub-task C/output/bert-base-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs,CAG_probs,OAG_probs,NAG_probs,NGEN_probs,GEN_probs,Sub-task B_preds,Sub-task A_preds,Sub-task C_mpreds
0,0,OAG-GEN,NAG-NGEN,0.001567,0.000765,0.001478,0.992979,0.000813,0.002397,0.00321,0.002333,0.994458,0.996142,0.003859,NGEN,NAG,NAG-NGEN
1,1,OAG-GEN,CAG-NGEN,0.006747,0.066613,0.010236,0.048054,0.015565,0.852784,0.868349,0.07336,0.058291,0.967452,0.032548,NGEN,CAG,CAG-NGEN
2,2,OAG-GEN,NAG-GEN,0.074324,0.043425,0.733557,0.020363,0.077365,0.050965,0.12833,0.117749,0.753921,0.114753,0.885247,GEN,NAG,NAG-GEN
3,3,OAG-GEN,NAG-GEN,0.081505,0.048429,0.713062,0.017402,0.087117,0.052485,0.139602,0.129934,0.730464,0.118316,0.881684,GEN,NAG,NAG-GEN
4,4,OAG-GEN,NAG-GEN,0.069995,0.034533,0.754627,0.040436,0.063511,0.036898,0.100409,0.104528,0.795063,0.111866,0.888134,GEN,NAG,NAG-GEN


(1200, 17)


Unnamed: 0,ID,Text,Sub-task A_preds
0,YoutubeCorpus37,https://www.youtube.com/watch?v=4iejjszkflo,NAG
1,C37.1,shame on society & culture trying to justify t...,CAG
2,C37.2,"just wanna say, there is no gay gene. you can ...",NAG
3,C37.3,"well, if a one twin struggles with self confid...",NAG
4,C37.4,i am an identical twin and we are both homosex...,NAG


Unnamed: 0,Sub-task A_preds,proportion
NAG,751,62.583333
OAG,234,19.5
CAG,215,17.916667


This is submission 3 for language english for Sub-task A.
The submission was generated using the bert-base-uncased transformer model. 
The model only used the text as a feature for prediction. 

Only data from english language was used for training.

For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

For this model the marginal probabilities of labels for subtask A are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_A_run_3.zip'),
 WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_A_run_3.csv'),
 WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_A_run_3.txt'))

## Subtask B

In [10]:
run_id=1
lang="ENG"
model="bert-base-uncased"
task="B"
use_task_C=True
use_task_C_marginalized=True
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/eng/trac2_eng_test.csv ../ENG/Sub-task C/output/bert-base-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs,CAG_probs,OAG_probs,NAG_probs,NGEN_probs,GEN_probs,Sub-task B_preds,Sub-task A_preds,Sub-task C_mpreds
0,0,OAG-GEN,NAG-NGEN,0.001567,0.000765,0.001478,0.992979,0.000813,0.002397,0.00321,0.002333,0.994458,0.996142,0.003859,NGEN,NAG,NAG-NGEN
1,1,OAG-GEN,CAG-NGEN,0.006747,0.066613,0.010236,0.048054,0.015565,0.852784,0.868349,0.07336,0.058291,0.967452,0.032548,NGEN,CAG,CAG-NGEN
2,2,OAG-GEN,NAG-GEN,0.074324,0.043425,0.733557,0.020363,0.077365,0.050965,0.12833,0.117749,0.753921,0.114753,0.885247,GEN,NAG,NAG-GEN
3,3,OAG-GEN,NAG-GEN,0.081505,0.048429,0.713062,0.017402,0.087117,0.052485,0.139602,0.129934,0.730464,0.118316,0.881684,GEN,NAG,NAG-GEN
4,4,OAG-GEN,NAG-GEN,0.069995,0.034533,0.754627,0.040436,0.063511,0.036898,0.100409,0.104528,0.795063,0.111866,0.888134,GEN,NAG,NAG-GEN


(1200, 17)


Unnamed: 0,ID,Text,Sub-task B_preds
0,YoutubeCorpus37,https://www.youtube.com/watch?v=4iejjszkflo,NGEN
1,C37.1,shame on society & culture trying to justify t...,NGEN
2,C37.2,"just wanna say, there is no gay gene. you can ...",GEN
3,C37.3,"well, if a one twin struggles with self confid...",GEN
4,C37.4,i am an identical twin and we are both homosex...,GEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,1002,83.5
GEN,198,16.5


This is submission 1 for language english for Sub-task B.
The submission was generated using the bert-base-uncased transformer model. 
The model only used the text as a feature for prediction. 

Only data from english language was used for training.

For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

For this model the marginal probabilities of labels for subtask B are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_B_run_1.zip'),
 WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_B_run_1.csv'),
 WindowsPath('../run_submissions/ENG/bert-base-uncased/3Idiots_english_B_run_1.txt'))

In [11]:
run_id=2
lang="ENG"
model="xlm-roberta-base"
task="B"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/eng/trac2_eng_test.csv ../ALL/Sub-task B/output/xlm-roberta-base/test.tsv


Unnamed: 0,id,label,Sub-task B_preds,GEN_probs,NGEN_probs
0,ENG,GEN,NGEN,0.005348,0.994652
1,ENG,GEN,NGEN,0.009575,0.990425
2,ENG,GEN,GEN,0.543342,0.456658
3,ENG,GEN,GEN,0.819391,0.180609
4,ENG,GEN,GEN,0.846443,0.153557


(1200, 5)


Unnamed: 0,ID,Text,Sub-task B_preds
0,YoutubeCorpus37,https://www.youtube.com/watch?v=4iejjszkflo,NGEN
1,C37.1,shame on society & culture trying to justify t...,NGEN
2,C37.2,"just wanna say, there is no gay gene. you can ...",GEN
3,C37.3,"well, if a one twin struggles with self confid...",GEN
4,C37.4,i am an identical twin and we are both homosex...,GEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,979,81.583333
GEN,221,18.416667


This is submission 2 for language english for Sub-task B.
The submission was generated using the xlm-roberta-base transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


This model was trained to predict labels for subtask B.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/ENG/xlm-roberta-base/3Idiots_english_B_run_2.zip'),
 WindowsPath('../run_submissions/ENG/xlm-roberta-base/3Idiots_english_B_run_2.csv'),
 WindowsPath('../run_submissions/ENG/xlm-roberta-base/3Idiots_english_B_run_2.txt'))

In [12]:
run_id=3
lang="ENG"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=True
use_task_C_marginalized=True
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/eng/trac2_eng_test.csv ../ALL/Sub-task C/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs,CAG_probs,OAG_probs,NAG_probs,NGEN_probs,GEN_probs,Sub-task B_preds,Sub-task A_preds,Sub-task C_mpreds
0,ENG,OAG-GEN,NAG-NGEN,0.000729,0.00066,0.001427,0.995357,0.000515,0.001311,0.001827,0.00139,0.996784,0.997329,0.002671,NGEN,NAG,NAG-NGEN
1,ENG,OAG-GEN,CAG-NGEN,0.007363,0.353448,0.011852,0.11536,0.00741,0.504567,0.511977,0.360811,0.127212,0.973375,0.026625,NGEN,CAG,CAG-NGEN
2,ENG,OAG-GEN,NAG-GEN,0.023969,0.009367,0.831679,0.019565,0.101936,0.013484,0.11542,0.033336,0.851244,0.042417,0.957583,GEN,NAG,NAG-GEN
3,ENG,OAG-GEN,NAG-GEN,0.020242,0.009761,0.725028,0.07545,0.127213,0.042306,0.169519,0.030003,0.800478,0.127518,0.872482,GEN,NAG,NAG-GEN
4,ENG,OAG-GEN,NAG-GEN,0.019885,0.007941,0.778953,0.098924,0.072305,0.021993,0.094298,0.027825,0.877877,0.128858,0.871142,GEN,NAG,NAG-GEN


(1200, 17)


Unnamed: 0,ID,Text,Sub-task B_preds
0,YoutubeCorpus37,https://www.youtube.com/watch?v=4iejjszkflo,NGEN
1,C37.1,shame on society & culture trying to justify t...,NGEN
2,C37.2,"just wanna say, there is no gay gene. you can ...",GEN
3,C37.3,"well, if a one twin struggles with self confid...",GEN
4,C37.4,i am an identical twin and we are both homosex...,GEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,980,81.666667
GEN,220,18.333333


This is submission 3 for language english for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

For this model the marginal probabilities of labels for subtask B are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask.


The full code for the submission will be made available at: https://github

(WindowsPath('../run_submissions/ENG/bert-base-multilingual-uncased/3Idiots_english_B_run_3.zip'),
 WindowsPath('../run_submissions/ENG/bert-base-multilingual-uncased/3Idiots_english_B_run_3.csv'),
 WindowsPath('../run_submissions/ENG/bert-base-multilingual-uncased/3Idiots_english_B_run_3.txt'))

## Hindi

## Subtask A

In [14]:
run_id=1
lang="HIN"
model="bert-base-multilingual-uncased"
task="A"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/hin/trac2_hin_test.csv ../HIN/Sub-task A/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task A_preds,OAG_probs,NAG_probs,CAG_probs
0,0,OAG,NAG,0.033971,0.939872,0.026157
1,1,OAG,NAG,0.003384,0.990474,0.006142
2,2,OAG,NAG,0.006885,0.987561,0.005554
3,3,OAG,NAG,0.003141,0.991641,0.005218
4,4,OAG,NAG,0.003041,0.985475,0.011484


(1200, 6)


Unnamed: 0,ID,Text,Sub-task A_preds
0,C52.17,ko,NAG
1,C52.39,ladkiyon video,NAG
2,C52.73,ki video gahrep,NAG
3,C60.3,o sadharon video bhai,NAG
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NAG


Unnamed: 0,Sub-task A_preds,proportion
OAG,660,55.0
NAG,297,24.75
CAG,243,20.25


This is submission 1 for language hindi for Sub-task A.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

Only data from hindi language was used for training.

This model was trained to predict labels for subtask A.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_1.zip'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_1.csv'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_1.txt'))

In [15]:
run_id=2
lang="HIN"
model="bert-base-multilingual-uncased"
task="A"
use_task_C=True
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/hin/trac2_hin_test.csv ../ALL/Sub-task C/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs
0,HIN,OAG-GEN,NAG-NGEN,0.000897,0.000871,0.001207,0.994551,0.000499,0.001975
1,HIN,OAG-GEN,NAG-NGEN,0.001878,0.004767,0.005382,0.76364,0.004214,0.220119
2,HIN,OAG-GEN,NAG-NGEN,0.002114,0.007861,0.001312,0.907107,0.001396,0.08021
3,HIN,OAG-GEN,NAG-NGEN,0.000562,0.00058,0.001241,0.995654,0.000435,0.001527
4,HIN,OAG-GEN,NAG-NGEN,0.000577,0.00066,0.001496,0.984942,0.000766,0.011559


(1200, 9)


Unnamed: 0,ID,Text,Sub-task A_preds
0,C52.17,ko,NAG
1,C52.39,ladkiyon video,NAG
2,C52.73,ki video gahrep,NAG
3,C60.3,o sadharon video bhai,NAG
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NAG


Unnamed: 0,Sub-task A_preds,proportion
OAG,689,57.416667
NAG,332,27.666667
CAG,179,14.916667


This is submission 2 for language hindi for Sub-task A.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 


The label with the highest probability is selected as the predicted label. 
The subtask A component of this predicted label is used as a prediction for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC202

(WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_2.zip'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_2.csv'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_2.txt'))

In [16]:
run_id=3
lang="HIN"
model="bert-base-multilingual-uncased"
task="A"
use_task_C=True
use_task_C_marginalized=True
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/hin/trac2_hin_test.csv ../ALL/Sub-task C/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs,CAG_probs,OAG_probs,NAG_probs,NGEN_probs,GEN_probs,Sub-task B_preds,Sub-task A_preds,Sub-task C_mpreds
0,HIN,OAG-GEN,NAG-NGEN,0.000897,0.000871,0.001207,0.994551,0.000499,0.001975,0.002473,0.001769,0.995758,0.997397,0.002603,NGEN,NAG,NAG-NGEN
1,HIN,OAG-GEN,NAG-NGEN,0.001878,0.004767,0.005382,0.76364,0.004214,0.220119,0.224333,0.006645,0.769022,0.988525,0.011475,NGEN,NAG,NAG-NGEN
2,HIN,OAG-GEN,NAG-NGEN,0.002114,0.007861,0.001312,0.907107,0.001396,0.08021,0.081606,0.009975,0.908419,0.995178,0.004822,NGEN,NAG,NAG-NGEN
3,HIN,OAG-GEN,NAG-NGEN,0.000562,0.00058,0.001241,0.995654,0.000435,0.001527,0.001962,0.001143,0.996895,0.997761,0.002238,NGEN,NAG,NAG-NGEN
4,HIN,OAG-GEN,NAG-NGEN,0.000577,0.00066,0.001496,0.984942,0.000766,0.011559,0.012325,0.001238,0.986438,0.997162,0.002839,NGEN,NAG,NAG-NGEN


(1200, 17)


Unnamed: 0,ID,Text,Sub-task A_preds
0,C52.17,ko,NAG
1,C52.39,ladkiyon video,NAG
2,C52.73,ki video gahrep,NAG
3,C60.3,o sadharon video bhai,NAG
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NAG


Unnamed: 0,Sub-task A_preds,proportion
OAG,692,57.666667
NAG,331,27.583333
CAG,177,14.75


This is submission 3 for language hindi for Sub-task A.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

For this model the marginal probabilities of labels for subtask A are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask.


The full code for the submission will be made available at: https://github.c

(WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_3.zip'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_3.csv'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_A_run_3.txt'))

## Subtask B

In [17]:
run_id=1
lang="HIN"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/hin/trac2_hin_test.csv ../HIN/Sub-task B/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task B_preds,GEN_probs,NGEN_probs
0,0,GEN,NGEN,0.040602,0.959398
1,1,GEN,NGEN,0.005196,0.994804
2,2,GEN,NGEN,0.00493,0.99507
3,3,GEN,NGEN,0.004235,0.995765
4,4,GEN,NGEN,0.004047,0.995953


(1200, 5)


Unnamed: 0,ID,Text,Sub-task B_preds
0,C52.17,ko,NGEN
1,C52.39,ladkiyon video,NGEN
2,C52.73,ki video gahrep,NGEN
3,C60.3,o sadharon video bhai,NGEN
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NGEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,677,56.416667
GEN,523,43.583333


This is submission 1 for language hindi for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

Only data from hindi language was used for training.

This model was trained to predict labels for subtask B.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_1.zip'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_1.csv'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_1.txt'))

In [18]:
run_id=2
lang="HIN"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/hin/trac2_hin_test.csv ../ALL/Sub-task B/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task B_preds,GEN_probs,NGEN_probs
0,HIN,GEN,NGEN,0.001812,0.998188
1,HIN,GEN,NGEN,0.000967,0.999033
2,HIN,GEN,NGEN,0.000867,0.999133
3,HIN,GEN,NGEN,0.000867,0.999133
4,HIN,GEN,NGEN,0.00088,0.99912


(1200, 5)


Unnamed: 0,ID,Text,Sub-task B_preds
0,C52.17,ko,NGEN
1,C52.39,ladkiyon video,NGEN
2,C52.73,ki video gahrep,NGEN
3,C60.3,o sadharon video bhai,NGEN
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NGEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,648,54.0
GEN,552,46.0


This is submission 2 for language hindi for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


This model was trained to predict labels for subtask B.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_2.zip'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_2.csv'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_2.txt'))

In [20]:
run_id=3
lang="HIN"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=True
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/hin/trac2_hin_test.csv ../ALL/Sub-task C/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs
0,HIN,OAG-GEN,NAG-NGEN,0.000897,0.000871,0.001207,0.994551,0.000499,0.001975
1,HIN,OAG-GEN,NAG-NGEN,0.001878,0.004767,0.005382,0.76364,0.004214,0.220119
2,HIN,OAG-GEN,NAG-NGEN,0.002114,0.007861,0.001312,0.907107,0.001396,0.08021
3,HIN,OAG-GEN,NAG-NGEN,0.000562,0.00058,0.001241,0.995654,0.000435,0.001527
4,HIN,OAG-GEN,NAG-NGEN,0.000577,0.00066,0.001496,0.984942,0.000766,0.011559


(1200, 9)


Unnamed: 0,ID,Text,Sub-task B_preds
0,C52.17,ko,NGEN
1,C52.39,ladkiyon video,NGEN
2,C52.73,ki video gahrep,NGEN
3,C60.3,o sadharon video bhai,NGEN
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NGEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,633,52.75
GEN,567,47.25


This is submission 3 for language hindi for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 


The label with the highest probability is selected as the predicted label. 
The subtask B component of this predicted label is used as a prediction for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC202

(WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_3.zip'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_3.csv'),
 WindowsPath('../run_submissions/HIN/bert-base-multilingual-uncased/3Idiots_hindi_B_run_3.txt'))

# Bengali

## Subtask A

In [23]:
run_id=1
lang="IBEN"
model="bert-base-multilingual-uncased"
task="A"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/iben/trac2_iben_test.csv ../ALL/Sub-task A/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task A_preds,OAG_probs,NAG_probs,CAG_probs
0,IBEN,OAG,NAG,0.003584,0.994934,0.001482
1,IBEN,OAG,NAG,0.011235,0.981533,0.007232
2,IBEN,OAG,CAG,0.023589,0.04253,0.933882
3,IBEN,OAG,OAG,0.560354,0.2668,0.172846
4,IBEN,OAG,CAG,0.064747,0.030444,0.904809


(1188, 6)


Unnamed: 0,ID,Text,Sub-task A_preds
0,C52.4.1,ছিছ,NAG
1,C52.4.2,ভিকা.টিম,NAG
2,C52.6,erokom fokinni boro bon na thaklei vlo hoi je ...,CAG
3,C52.7,bainchud,OAG
4,C52.10,এই মানুষটি বাস্তবের শয়তানকে ফেল করিয়ে দিয়েছ...,CAG


Unnamed: 0,Sub-task A_preds,proportion
NAG,745,62.710438
OAG,245,20.622896
CAG,198,16.666667


This is submission 1 for language bengali for Sub-task A.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


This model was trained to predict labels for subtask A.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_A_run_1.zip'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_A_run_1.csv'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_A_run_1.txt'))

In [24]:
run_id=2
lang="IBEN"
model="xlm-roberta-base"
task="A"
use_task_C=True
use_task_C_marginalized=True
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/iben/trac2_iben_test.csv ../IBEN/Sub-task C/output/xlm-roberta-base/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs,CAG_probs,OAG_probs,NAG_probs,NGEN_probs,GEN_probs,Sub-task B_preds,Sub-task A_preds,Sub-task C_mpreds
0,0,OAG-GEN,NAG-NGEN,0.001058,0.002624,0.005648,0.976086,0.002303,0.012281,0.014584,0.003682,0.981734,0.990991,0.00901,NGEN,NAG,NAG-NGEN
1,1,OAG-GEN,OAG-NGEN,0.054361,0.547756,0.011123,0.028651,0.033436,0.324673,0.358109,0.602117,0.039774,0.90108,0.098921,NGEN,OAG,OAG-NGEN
2,2,OAG-GEN,CAG-NGEN,0.009135,0.098274,0.0266,0.137471,0.033652,0.694868,0.72852,0.107409,0.164071,0.930613,0.069387,NGEN,CAG,CAG-NGEN
3,3,OAG-GEN,OAG-NGEN,0.177018,0.451966,0.045201,0.144037,0.053909,0.127868,0.181777,0.628984,0.189239,0.723871,0.276129,NGEN,OAG,OAG-NGEN
4,4,OAG-GEN,CAG-NGEN,0.051841,0.116457,0.056135,0.159745,0.11368,0.502142,0.615822,0.168299,0.21588,0.778344,0.221656,NGEN,CAG,CAG-NGEN


(1188, 17)


Unnamed: 0,ID,Text,Sub-task A_preds
0,C52.4.1,ছিছ,NAG
1,C52.4.2,ভিকা.টিম,OAG
2,C52.6,erokom fokinni boro bon na thaklei vlo hoi je ...,CAG
3,C52.7,bainchud,OAG
4,C52.10,এই মানুষটি বাস্তবের শয়তানকে ফেল করিয়ে দিয়েছ...,CAG


Unnamed: 0,Sub-task A_preds,proportion
NAG,741,62.373737
OAG,262,22.053872
CAG,185,15.572391


This is submission 2 for language bengali for Sub-task A.
The submission was generated using the xlm-roberta-base transformer model. 
The model only used the text as a feature for prediction. 

Only data from bengali language was used for training.

For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

For this model the marginal probabilities of labels for subtask A are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/IBEN/xlm-roberta-base/3Idiots_bengali_A_run_2.zip'),
 WindowsPath('../run_submissions/IBEN/xlm-roberta-base/3Idiots_bengali_A_run_2.csv'),
 WindowsPath('../run_submissions/IBEN/xlm-roberta-base/3Idiots_bengali_A_run_2.txt'))

In [25]:
run_id=3
lang="IBEN"
model="xlm-roberta-base"
task="A"
use_task_C=True
use_task_C_marginalized=False
use_all_lang=False

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/iben/trac2_iben_test.csv ../IBEN/Sub-task C/output/xlm-roberta-base/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs
0,0,OAG-GEN,NAG-NGEN,0.001058,0.002624,0.005648,0.976086,0.002303,0.012281
1,1,OAG-GEN,OAG-NGEN,0.054361,0.547756,0.011123,0.028651,0.033436,0.324673
2,2,OAG-GEN,CAG-NGEN,0.009135,0.098274,0.0266,0.137471,0.033652,0.694868
3,3,OAG-GEN,OAG-NGEN,0.177018,0.451966,0.045201,0.144037,0.053909,0.127868
4,4,OAG-GEN,CAG-NGEN,0.051841,0.116457,0.056135,0.159745,0.11368,0.502142


(1188, 9)


Unnamed: 0,ID,Text,Sub-task A_preds
0,C52.4.1,ছিছ,NAG
1,C52.4.2,ভিকা.টিম,OAG
2,C52.6,erokom fokinni boro bon na thaklei vlo hoi je ...,CAG
3,C52.7,bainchud,OAG
4,C52.10,এই মানুষটি বাস্তবের শয়তানকে ফেল করিয়ে দিয়েছ...,CAG


Unnamed: 0,Sub-task A_preds,proportion
NAG,742,62.457912
OAG,263,22.138047
CAG,183,15.40404


This is submission 3 for language bengali for Sub-task A.
The submission was generated using the xlm-roberta-base transformer model. 
The model only used the text as a feature for prediction. 

Only data from bengali language was used for training.

For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 


The label with the highest probability is selected as the predicted label. 
The subtask A component of this predicted label is used as a prediction for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/IBEN/xlm-roberta-base/3Idiots_bengali_A_run_3.zip'),
 WindowsPath('../run_submissions/IBEN/xlm-roberta-base/3Idiots_bengali_A_run_3.csv'),
 WindowsPath('../run_submissions/IBEN/xlm-roberta-base/3Idiots_bengali_A_run_3.txt'))

## Subtask B

In [26]:
run_id=1
lang="IBEN"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=False
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/iben/trac2_iben_test.csv ../ALL/Sub-task B/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task B_preds,GEN_probs,NGEN_probs
0,IBEN,GEN,NGEN,0.003836,0.996164
1,IBEN,GEN,NGEN,0.002055,0.997945
2,IBEN,GEN,NGEN,0.002763,0.997237
3,IBEN,GEN,GEN,0.997791,0.002209
4,IBEN,GEN,NGEN,0.010543,0.989456


(1188, 5)


Unnamed: 0,ID,Text,Sub-task B_preds
0,C52.4.1,ছিছ,NGEN
1,C52.4.2,ভিকা.টিম,NGEN
2,C52.6,erokom fokinni boro bon na thaklei vlo hoi je ...,NGEN
3,C52.7,bainchud,GEN
4,C52.10,এই মানুষটি বাস্তবের শয়তানকে ফেল করিয়ে দিয়েছ...,NGEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,962,80.976431
GEN,226,19.023569


This is submission 1 for language bengali for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


This model was trained to predict labels for subtask B.

The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2020



(WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_1.zip'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_1.csv'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_1.txt'))

In [27]:
run_id=2
lang="IBEN"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=True
use_task_C_marginalized=True
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/iben/trac2_iben_test.csv ../ALL/Sub-task C/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs,CAG_probs,OAG_probs,NAG_probs,NGEN_probs,GEN_probs,Sub-task B_preds,Sub-task A_preds,Sub-task C_mpreds
0,IBEN,OAG-GEN,NAG-NGEN,0.001259,0.001188,0.001506,0.992234,0.000709,0.003102,0.003812,0.002447,0.993741,0.996525,0.003475,NGEN,NAG,NAG-NGEN
1,IBEN,OAG-GEN,NAG-NGEN,0.011476,0.052466,0.012539,0.79553,0.016352,0.111637,0.127989,0.063942,0.808069,0.959632,0.040368,NGEN,NAG,NAG-NGEN
2,IBEN,OAG-GEN,NAG-GEN,0.069084,0.014398,0.622349,0.026099,0.219367,0.048702,0.268069,0.083482,0.648448,0.0892,0.9108,GEN,NAG,NAG-GEN
3,IBEN,OAG-GEN,OAG-GEN,0.93382,0.021404,0.01521,0.004522,0.022538,0.002505,0.025043,0.955225,0.019733,0.028431,0.971569,GEN,OAG,OAG-GEN
4,IBEN,OAG-GEN,CAG-NGEN,0.008779,0.029773,0.030899,0.150874,0.056459,0.723216,0.779676,0.038552,0.181773,0.903863,0.096137,NGEN,CAG,CAG-NGEN


(1188, 17)


Unnamed: 0,ID,Text,Sub-task B_preds
0,C52.4.1,ছিছ,NGEN
1,C52.4.2,ভিকা.টিম,NGEN
2,C52.6,erokom fokinni boro bon na thaklei vlo hoi je ...,GEN
3,C52.7,bainchud,GEN
4,C52.10,এই মানুষটি বাস্তবের শয়তানকে ফেল করিয়ে দিয়েছ...,NGEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,954,80.30303
GEN,234,19.69697


This is submission 2 for language bengali for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 

For this model the marginal probabilities of labels for subtask B are computed.
Then the label with the highest marginal probability is used to identify the predicted label for this subtask.


The full code for the submission will be made available at: https://github

(WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_2.zip'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_2.csv'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_2.txt'))

In [28]:
run_id=3
lang="IBEN"
model="bert-base-multilingual-uncased"
task="B"
use_task_C=True
use_task_C_marginalized=False
use_all_lang=True

write_run_submission_files(
    lang, model, task, run_id, data_key="test", 
    use_task_C=use_task_C, use_task_C_marginalized=use_task_C_marginalized,
    use_all_lang=use_all_lang
)

../data/raw/iben/trac2_iben_test.csv ../ALL/Sub-task C/output/bert-base-multilingual-uncased/test.tsv


Unnamed: 0,id,label,Sub-task C_preds,OAG-GEN_probs,OAG-NGEN_probs,NAG-GEN_probs,NAG-NGEN_probs,CAG-GEN_probs,CAG-NGEN_probs
0,IBEN,OAG-GEN,NAG-NGEN,0.001259,0.001188,0.001506,0.992234,0.000709,0.003102
1,IBEN,OAG-GEN,NAG-NGEN,0.011476,0.052466,0.012539,0.79553,0.016352,0.111637
2,IBEN,OAG-GEN,NAG-GEN,0.069084,0.014398,0.622349,0.026099,0.219367,0.048702
3,IBEN,OAG-GEN,OAG-GEN,0.93382,0.021404,0.01521,0.004522,0.022538,0.002505
4,IBEN,OAG-GEN,CAG-NGEN,0.008779,0.029773,0.030899,0.150874,0.056459,0.723216


(1188, 9)


Unnamed: 0,ID,Text,Sub-task B_preds
0,C52.4.1,ছিছ,NGEN
1,C52.4.2,ভিকা.টিম,NGEN
2,C52.6,erokom fokinni boro bon na thaklei vlo hoi je ...,GEN
3,C52.7,bainchud,GEN
4,C52.10,এই মানুষটি বাস্তবের শয়তানকে ফেল করিয়ে দিয়েছ...,NGEN


Unnamed: 0,Sub-task B_preds,proportion
NGEN,953,80.218855
GEN,235,19.781145


This is submission 3 for language bengali for Sub-task B.
The submission was generated using the bert-base-multilingual-uncased transformer model. 
The model only used the text as a feature for prediction. 

This model was trained on texts from all languages. 
For this submission a multilingual model was used. 
The model was then used to predict labels for each language.


For this prediction we combined labels for subtasks A and B. 
Then the model was trained to predict the combined labels. 
The combined predicted labels were then splitted to identify the subtask specific labels. 
The combined label prediction task is referred to as subtask C by us. 
This model results in predicted probabilities for each combined label. 


The label with the highest probability is selected as the predicted label. 
The subtask B component of this predicted label is used as a prediction for this subtask.


The full code for the submission will be made available at: https://github.com/SocialMediaIE/TRAC2

(WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_3.zip'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_3.csv'),
 WindowsPath('../run_submissions/IBEN/bert-base-multilingual-uncased/3Idiots_bengali_B_run_3.txt'))