# Create instruct data from Inkuba-Instruct and XNLI

**Data sources**
1. Inkuba-Instruct
2. XNLI (Swahili data)

In [None]:
import os
from huggingface_hub import login

# Take the Hugging Face token from the environment instead of writing it into the
# notebook (a literal here ends up in version control and overwrites whatever
# token is already configured). Export HF_TOKEN before starting the kernel; when it
# is unset or empty, `login` is called with `token=None` and asks for one itself.
login(token=os.environ.get("HF_TOKEN") or None)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
import pandas as pd
import dask.dataframe as dd

In [None]:
# Pipeline switches. Each `extract_*_from_hub` flag gates the cell that reads the
# corresponding dataset from the Hugging Face Hub and writes a local parquet copy.
extract_mt_from_hub = True
extract_sent_from_hub = True
extract_xnli_from_hub = True
# Gates the optional cell that collects extra MT examples beyond the first sample.
additional_mt_data = False

In [None]:
from pathlib import Path
# Base directory that the local input/output paths in this notebook are joined onto.
basedir = Path("./")

In [5]:
# Split name -> parquet path (or glob pattern) inside the Inkuba-Instruct dataset repo.
# The paths are relative to `base_path` and are concatenated with it before reading.
splits = {
    'english_dev': 'data/english_dev-*-of-*.parquet',
    'english_train': 'data/english_train-*-of-*.parquet',
    'hausa_dev': 'data/hausa_dev-00000-of-00001.parquet',
    'hausa_train': 'data/hausa_train-*-of-*.parquet',
    'isizulu_dev': 'data/isizulu_dev-*-of-*.parquet',
    'isizulu_train': 'data/isizulu_train-*-of-*.parquet',
    'swahili_dev': 'data/swahili_dev-*-of-*.parquet',
    'swahili_train': 'data/swahili_train-*-of-*.parquet',
    'xhosa_dev': 'data/xhosa_dev-*-of-*.parquet',
    'xhosa_train': 'data/xhosa_train-*-of-*.parquet',
    'yoruba_dev': 'data/yoruba_dev-00000-of-00001.parquet',
    'yoruba_train': 'data/yoruba_train-*-of-*.parquet'
}
# Hub location of the dataset; used as a string prefix for the entries above.
base_path = "hf://datasets/lelapa/Inkuba-instruct/"
# extract = ['hausa_dev', 'hausa_train', 'swahili_dev', 'swahili_train']
# extract = ['swahili_dev', 'swahili_train']
# extract = ['hausa_dev', 'hausa_train']
# extract = ['english_dev', 'english_train']

In [6]:
def _row2dict(row):
    return {
        "task": row["task"], "instruction_orig": row["instruction"], "input": row["inputs"],
        "output": row["targets"], "language": row["lang"]
    }

## MT data

In [7]:
if extract_mt_from_hub:
    # Read the English splits and keep only the machine-translation ("mmt") rows
    # whose instruction mentions Swahili or Hausa.
    extract = ['english_dev', 'english_train']
    paths = [base_path + splits[k] for k in extract]
    df = dd.read_parquet(paths)
    df = df[
        (df["task"] == 'mmt')
        & (df["instruction"].str.contains("swahili")
        | df["instruction"].str.contains("hausa"))
    ]
    # Write under `basedir`, the same place the following cell reads from; the bare
    # relative path only matched while `basedir` happened to be the working directory.
    df.to_parquet(basedir/"inkuba_instruct_mmt")

In [8]:
# Reload the filtered MT rows lazily from the local parquet directory and preview them.
df = dd.read_parquet(basedir/"inkuba_instruct_mmt/*")
df[df["task"] == "mmt"].head()

Unnamed: 0,instruction,inputs,targets,task,data_source
0,translate the following from english into swah...,dessel joins the sith army.[1],saint nikitas the sinaite[21],mmt,wmt22
1,translate the following from english into swah...,"""there's some good cars here.",magari mazuri hapa,mmt,wmt22
2,could you convert this english text to swahili?,you're committed to something,utakuwa una upungufu wa kitu fulani,mmt,wmt22
3,please convert this english content into swahili.,hold the winds until god's people are ready to...,hadi kazi ya kupigwa muhuri kwa watu wa mungu ...,mmt,wmt22
4,please convert this english content into swahili.,perth and fremantle can be easily explored on ...,perth na fremantle wanaweza kuchunuliwa kwa ra...,mmt,wmt22


In [9]:
# Number of MT rows per `data_source` value.
df[df["task"] == "mmt"]["data_source"].value_counts().compute()

data_source
mafand      103301
wmt22     61604701
Name: count, dtype: int64[pyarrow]

In [10]:
# Flag rows whose instruction mentions "hausa". The sampling cell labels the rows
# where this flag is False as Swahili.
df["mmt_hausa"] = df["instruction"].str.contains("hausa")
df[df["task"] == "mmt"]["mmt_hausa"].value_counts().compute()

mmt_hausa
False    56144270
True      5563732
Name: count, dtype: Int64

In [11]:
# Row counts per (data_source, mmt_hausa) pair; the sampling cell divides its target
# sizes by these counts to obtain sampling fractions.
totals = df[df["task"] == "mmt"].groupby(["data_source", "mmt_hausa"])["task"].count().compute()
totals

data_source  mmt_hausa
mafand       False           82563
             True            20738
wmt22        False        56061707
             True          5542994
Name: task, dtype: int64

In [12]:
# Target number of rows to draw for each (data source, language) pair.
n_mmt_wmt22_swahili_total = 50_000
n_mmt_wmt22_hausa_total = 80_000
n_mmt_mafand_swahili_total = 50_000
n_mmt_mafand_hausa_total = 20_000

# One fixed English prompt per target language, stored as `instruction` on every example.
custom_instructions = {
    "swahili": "Translate the following from English into Swahili.",
    "hausa": "Translate the following from English into Hausa."
}


def _sample_mt_rows(source, is_hausa, n_target, lang):
    """Sample rows of `df` for one (data source, language) pair and tag them with `lang`.

    Args:
        source: Value of the `data_source` column to keep.
        is_hausa: Value of the `mmt_hausa` flag to keep.
        n_target: Desired number of rows; converted to a sampling fraction via `totals`.
        lang: Value written to the new `lang` column of the result.

    Returns:
        The computed sample as an in-memory frame. Sampling is by fraction with a
        fixed seed, so the row count is only approximately `n_target`.
    """
    subset = df[(df["data_source"] == source) & (df["mmt_hausa"] == is_hausa)]
    fraction = n_target/totals.loc[(source, is_hausa)]
    sampled = subset.sample(frac=fraction, random_state=123).compute()
    sampled["lang"] = lang
    return sampled


df_mmt_wmt22_swahili = _sample_mt_rows("wmt22", False, n_mmt_wmt22_swahili_total, "swahili")
df_mmt_wmt22_hausa = _sample_mt_rows("wmt22", True, n_mmt_wmt22_hausa_total, "hausa")
df_mmt_mafand_swahili = _sample_mt_rows("mafand", False, n_mmt_mafand_swahili_total, "swahili")
df_mmt_mafand_hausa = _sample_mt_rows("mafand", True, n_mmt_mafand_hausa_total, "hausa")

# Flatten the four samples into example dicts; the source instruction is kept under
# "instruction_orig" and the fixed per-language prompt becomes "instruction".
mt_examples = []
dfs = [df_mmt_wmt22_swahili, df_mmt_wmt22_hausa, df_mmt_mafand_swahili, df_mmt_mafand_hausa]
for df_ in dfs:
    for _, row in df_.iterrows():
        d = _row2dict(row)
        d["instruction"] = custom_instructions[d["language"]]
        mt_examples.append(d)

In [13]:
# Number of sampled MT examples.
len(mt_examples)

199999

In [None]:
import json

# Persist the sampled MT examples so later cells can reload them from disk.
mt_sample_path = basedir/"inkuba_instruct_sample_mt.json"
with mt_sample_path.open("w") as fh:
    json.dump(mt_examples, fh)

#### Create additional MT data

In [None]:
import json

# Reload the sampled MT examples from disk and show how many there are.
mt_sample_path = basedir/"inkuba_instruct_sample_mt.json"
with mt_sample_path.open("r") as fh:
    mt_examples = json.load(fh)
len(mt_examples)

In [None]:
# Distinct source sentences already present in the MT sample.
mt_inputs = {example["input"] for example in mt_examples}
len(mt_inputs)

In [None]:
if additional_mt_data:
    # Upper bound on extra examples to collect per language.
    n_swa, n_hau = 500_000, 500_000

    custom_instructions = {
        "swahili": "Translate the following from English into Swahili.",
        "hausa": "Translate the following from English into Hausa."
    }

    add_mt_examples_swa, add_mt_examples_hau = [], []
    last_milestone = 0
    for _, row in df.iterrows():
        # Skip sentences that are already part of the first MT sample.
        if row["inputs"] in mt_inputs:
            continue
        if not row["mmt_hausa"] and len(add_mt_examples_swa) < n_swa:
            row["lang"] = "swahili"
            d = _row2dict(row)
            d["instruction"] = custom_instructions["swahili"]
            add_mt_examples_swa.append(d)
        elif row["mmt_hausa"] and len(add_mt_examples_hau) < n_hau:
            row["lang"] = "hausa"
            d = _row2dict(row)
            d["instruction"] = custom_instructions["hausa"]
            add_mt_examples_hau.append(d)
        # Stop as soon as both language quotas are filled.
        if len(add_mt_examples_swa) >= n_swa and len(add_mt_examples_hau) >= n_hau:
            break
        n = len(add_mt_examples_swa) + len(add_mt_examples_hau)
        # Report each multiple of 50,000 once; `last_milestone` stops the same total
        # being printed again while rows are skipped.
        if n % 50_000 == 0:
            if n > last_milestone:
                last_milestone = n
                print(n, "examples processed")
    add_mt_examples = add_mt_examples_swa + add_mt_examples_hau
    # A bare expression inside an `if` block is not displayed by the notebook, so
    # print the final counts explicitly (total, Swahili, Hausa).
    print(len(add_mt_examples), len(add_mt_examples_swa), len(add_mt_examples_hau))

In [None]:
import json

# `add_mt_examples` is only created when `additional_mt_data` is True, so the write is
# guarded by the same flag; without the guard a fresh Run All with the flag off
# stops here with a NameError.
if additional_mt_data:
    with open(basedir/"inkuba_instruct_sample_mt_additional.json", "w") as f:
        json.dump(add_mt_examples, f)

## Sentiment

In [14]:
if extract_sent_from_hub:
    # (splits to read, local output directory) for Swahili first, then Hausa.
    sentiment_jobs = (
        (['swahili_dev', 'swahili_train'], "inkuba_instruct_sent_swahili"),
        (['hausa_dev', 'hausa_train'], "inkuba_instruct_sent_hausa"),
    )
    for extract, out_dir in sentiment_jobs:
        paths = [base_path + splits[k] for k in extract]
        df = dd.read_parquet(paths)
        # Keep only the sentiment task before writing the local copy.
        df = df[df["task"] == 'sentiment']
        df.to_parquet(basedir/out_dir)

In [15]:
# Load the local SentimentTrain.csv; later cells use its `inputs` column to find
# (and then drop) overlapping rows in the Inkuba-Instruct sentiment data.
df_comp = pd.read_csv(basedir/"data/SentimentTrain.csv")
# df_comp = pd.read_csv("/kaggle/input/vulavula-data/SentimentTrain.csv")
print(df_comp.shape)
df_comp.head()

(400, 7)


Unnamed: 0,ID,task,langs,data_source,instruction,inputs,targets
0,ID_6aba33a1_sentiment_ dev_hausa,sentiment,hausa,afrisenti,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user @user allah ya tsayyaba yar uwa 🎂 😍,Kyakkyawa
1,ID_ce64d307_sentiment_ dev_hausa,sentiment,hausa,naijasenti,Da fatan za a gano ra'ayin da ke cikin wannan ...,@user intenet a masallachi😭😭😭 wani salo ne na ...,Tsaka-tsaki
2,ID_dfb02831_sentiment_ dev_swahili,sentiment,swahili,swahili_tweet,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,picha mbunge wa kilombero peter lijualikali ak...,Wastani
3,ID_2efc9515_sentiment_ dev_hausa,sentiment,hausa,afrisenti,Gano ra'ayin da aka bayyana a cikin wannan rub...,@user @user @user @user @user hhh amma rahama ...,Tsaka-tsaki
4,ID_ad1d9888_sentiment_ dev_swahili,sentiment,swahili,afrisenti,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,swali zuri sana nawatafuta wajuzi wa mambo wat...,Wastani


### Swahili sentiment data

In [16]:
# Load the Swahili sentiment rows from the local parquet copy with a fresh 0..n-1 index.
df_swa = pd.read_parquet(basedir/"inkuba_instruct_sent_swahili").reset_index(drop=True)
print(len(df_swa))
df_swa.head()

16847


Unnamed: 0,instruction,inputs,targets,task,data_source
0,Changanua mawazo ya matini yanayofuata na uain...,habari tunaomba radhi kwa usumbufu unaojitokez...,Wastani,sentiment,afrisenti
1,,mimi watani wangu hawajawahi kuvuka mipaka ila...,Hasi,sentiment,swahili_tweet
2,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,anasema kuvuka mia kukanyaga mavi bure,Wastani,sentiment,afrisenti
3,,na hivi ndivyo viwango vyetu vya kubadilishia ...,Wastani,sentiment,afrisenti
4,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,sisi piahakika nyie ndio mlikua wapinzani h...,Chanya,sentiment,swahili_tweet


In [17]:
# Number of rows that have no instruction.
df_swa["instruction"].isna().sum()

8392

In [18]:
# df_swa = df_swa.drop_duplicates(subset=["inputs", "targets"]).copy()
# print(len(df_swa))

In [19]:
# Drop exact repeats of the (instruction, inputs, targets) triple, then re-count the
# rows that still lack an instruction.
df_swa = df_swa.drop_duplicates(subset=["instruction", "inputs", "targets"]).copy()
print(len(df_swa))
df_swa["instruction"].isna().sum()

11084


4055

In [20]:
# Frequency of each instruction template; missing values are counted as their own bucket.
df_swa["instruction"].value_counts(dropna=False)

instruction
<NA>                                                                                                                                                                                                                                                                                                                                           4055
Tafadhali tambua mawazo yaliyoonyeshwa kwenye matini haya kwa kutegemea miongozo ifuatayo: Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja.           3518
Changanua mawazo ya matini yanayofuata na uainishe matini hayo katika mojawapo ya lebo zifuatazo. Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. Hasi: iwapo matini yanadokeza mawazo au hisia hasi. Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au 

In [21]:
# Fill missing instructions by cycling through the instruction templates that do occur.
# The templates are sorted first: iterating a `set` of strings has no stable order
# between kernel sessions (string hashing is randomised per process), so the unsorted
# version assigned different templates to the same rows on every fresh run.
instructions = []
instr_options = sorted(set(df_swa["instruction"].dropna()))
for i, t in enumerate(df_swa["instruction"]):
    if pd.isna(t):
        # The row position picks the template, so templates are used round-robin.
        idx = i % len(instr_options)
        instructions.append(instr_options[idx])
    else:
        instructions.append(t)
df_swa["instruction"] = instructions
df_swa["instruction"].isna().sum()

0

In [22]:
# Filling in instructions can create new identical (instruction, inputs, targets)
# triples, so de-duplicate once more.
df_swa = df_swa.drop_duplicates(subset=["instruction", "inputs", "targets"]).copy()
print(len(df_swa))

7637


In [23]:
# True where the input text also occurs in the `inputs` column of `df_comp`.
df_swa["in_comp_data"] = df_swa["inputs"].isin(df_comp["inputs"])

In [24]:
# Inputs that appear with more than one distinct target label.
labels_per_input = df_swa.groupby("inputs")["targets"].nunique()
dups = set(labels_per_input[labels_per_input > 1].index)
# Show the conflicting rows next to each other for inspection.
df_swa[df_swa["inputs"].isin(dups)].sort_values("inputs")

Unnamed: 0,instruction,inputs,targets,task,data_source,in_comp_data
1109,Changanua mawazo ya matini yanayofuata na uain...,habari tunaomba namba yako dm kwa msaada zaidika,Chanya,sentiment,afrisenti,False
3996,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,habari tunaomba namba yako dm kwa msaada zaidika,Chanya,sentiment,afrisenti,False
9578,Changanua mawazo ya matini yanayofuata na uain...,habari tunaomba namba yako dm kwa msaada zaidika,Wastani,sentiment,swahili_tweet,False
10416,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,habari tunaomba namba yako dm kwa msaada zaidika,Wastani,sentiment,afrisenti,False
1164,Changanua mawazo ya matini yanayofuata na uain...,tunafunga sana kumi na nusu jionikaribu sana,Chanya,sentiment,afrisenti,True
2299,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,tunafunga sana kumi na nusu jionikaribu sana,Wastani,sentiment,afrisenti,True
2444,Changanua mawazo ya matini yanayofuata na uain...,tunafunga sana kumi na nusu jionikaribu sana,Wastani,sentiment,afrisenti,True
10249,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,tunafunga sana kumi na nusu jionikaribu sana,Chanya,sentiment,swahili_tweet,True
135,Tafadhali tambua mawazo yaliyoonyeshwa kwenye ...,upo eneo gani mteja tunaomba details vizuri il...,Chanya,sentiment,swahili_tweet,False
2502,Changanua mawazo ya matini yanayofuata na uain...,upo eneo gani mteja tunaomba details vizuri il...,Chanya,sentiment,afrisenti,False


In [25]:
# Remove every row whose input has conflicting labels.
df_swa = df_swa[~df_swa["inputs"].isin(dups)].copy()
print(len(df_swa))
# Sanity check: each (instruction, inputs) pair now occurs exactly once.
assert len(df_swa) == len(df_swa[["instruction", "inputs"]].drop_duplicates())

7625


In [26]:
# How many remaining rows overlap with `df_comp`.
df_swa["in_comp_data"].value_counts()

in_comp_data
False    7292
True      333
Name: count, dtype: int64

In [27]:
# `_row2dict` reads this column to fill the "language" key of each example.
df_swa["lang"] = "swahili"

### Hausa sentiment data

In [28]:
# Load the Hausa sentiment rows from the local parquet copy with a fresh 0..n-1 index.
df_hau = pd.read_parquet(basedir/"inkuba_instruct_sent_hausa").reset_index(drop=True)
print(len(df_hau))
df_hau.head()

141814


Unnamed: 0,instruction,inputs,targets,task,data_source
0,Da fatan za a gano ra'ayin da ke cikin wannan ...,gaskiyane ansha kyau sosaiiiii allah ya barmu ...,Kyakkyawa,sentiment,naijasenti
1,Za ka iya tantance yanayin wannan rubutu? Bi w...,kai wai kudai wane bayahude ne ke typing in nan,Tsaka-tsaki,sentiment,afrisenti
2,Tantance ra’ayin wannan rubutu kuma a rarraba ...,@user ina ruwan ku da yanda yake gudana tunda ...,Korau,sentiment,afrisenti
3,Tantance ra’ayin wannan rubutu kuma a rarraba ...,gaskiya kinsha dalma,Korau,sentiment,afrisenti
4,Da fatan za a gano ra'ayin da ke cikin wannan ...,@user @user su tsigeshi kawai dan bayada wani ...,Korau,sentiment,afrisenti


In [30]:
# Drop exact repeats of the (instruction, inputs, targets) triple, then count the
# rows that lack an instruction.
df_hau = df_hau.drop_duplicates(subset=["instruction", "inputs", "targets"]).copy()
print(len(df_hau))
df_hau["instruction"].isna().sum()

84344


0

In [31]:
# Frequency of each instruction template; missing values are counted as their own bucket.
df_hau["instruction"].value_counts(dropna=False)

instruction
Tantance ra’ayin wannan rubutu kuma a rarraba rubutun zuwa ɗaya daga cikin waɗannan rukunoni. Kyakkyawa: idan rubutu yana nuna kyakkyawan tunani, hali, da yanayin motsin rai. Korau: idan rubutu yana nuna mummunar tunani ko motsin rai. Tsaka-tsaki: idan rubutu baya nufin magana mai kyau ko mara kyau kai tsaye ko a kaikaice.    21123
Da fatan za a gano ra'ayin da ke cikin wannan rubutu bisa ga jagorori masu zuwa: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.                                      21084
Za ka iya tantance yanayin wannan rubutu? Bi waɗannan jagororin sharhi: Kyakkyawa: idan rubutu na nuna kyakkyawan tunani, hali, da yanayi. Korau: idan rubutu yana nuna mummunar tunani ko yanayi. Neutral: idan rubutu baya nuna kyakkyawar magana ko mara kyau kai tsaye ko a kaikaice.                                       

In [32]:
# Some templates name the third class "Neutral:" while the target labels use
# "Tsaka-tsaki"; rewrite the templates so they name the label that is actually used.
df_hau["instruction"] = df_hau["instruction"].str.replace("Neutral:", "Tsaka-tsaki:")

In [33]:
# De-duplicate again after the instruction text was rewritten.
df_hau = df_hau.drop_duplicates(subset=["instruction", "inputs", "targets"]).copy()
print(len(df_hau))

84344


In [34]:
# True where the input text also occurs in the `inputs` column of `df_comp`.
df_hau["in_comp_data"] = df_hau["inputs"].isin(df_comp["inputs"])

In [35]:
# Inputs that appear with more than one distinct target label.
labels_per_input = df_hau.groupby("inputs")["targets"].nunique()
dups = set(labels_per_input[labels_per_input > 1].index)
# Show the conflicting rows next to each other for inspection.
df_hau[df_hau["inputs"].isin(dups)].sort_values("inputs")

Unnamed: 0,instruction,inputs,targets,task,data_source,in_comp_data
54944,Gano ra'ayin da aka bayyana a cikin wannan rub...,@user allah ya isa domin ba za mu yafe irin ta...,Korau,sentiment,naijasenti,False
85438,Da fatan za a gano ra'ayin da ke cikin wannan ...,@user allah ya isa domin ba za mu yafe irin ta...,Kyakkyawa,sentiment,afrisenti,False
75596,Tantance ra’ayin wannan rubutu kuma a rarraba ...,@user allah ya isa domin ba za mu yafe irin ta...,Kyakkyawa,sentiment,naijasenti,False
30007,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user allah ya isa domin ba za mu yafe irin ta...,Korau,sentiment,afrisenti,False
17487,Da fatan za a gano ra'ayin da ke cikin wannan ...,@user allah ya isa domin ba za mu yafe irin ta...,Korau,sentiment,afrisenti,False
...,...,...,...,...,...,...
50160,Gano ra'ayin da aka bayyana a cikin wannan rub...,@user 😢😢😢😢😢😢 mu yau aka mana low down. wlh ba ...,Korau,sentiment,afrisenti,False
52833,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user 😢😢😢😢😢😢 mu yau aka mana low down. wlh ba ...,Kyakkyawa,sentiment,naijasenti,False
67164,Tantance ra’ayin wannan rubutu kuma a rarraba ...,@user 😢😢😢😢😢😢 mu yau aka mana low down. wlh ba ...,Korau,sentiment,afrisenti,False
12535,Za ka iya tantance yanayin wannan rubutu? Bi w...,@user 😢😢😢😢😢😢 mu yau aka mana low down. wlh ba ...,Korau,sentiment,afrisenti,False


In [36]:
# Remove every row whose input has conflicting labels.
df_hau = df_hau[~df_hau["inputs"].isin(dups)].copy()
print(len(df_hau))
# Sanity check: each (instruction, inputs) pair must now occur exactly once. Without
# `.drop_duplicates()` the assertion compared the frame's length with itself and
# could never fail; this matches the check used for the Swahili frame.
assert len(df_hau) == len(df_hau[["instruction", "inputs"]].drop_duplicates())

84244


In [37]:
# How many remaining rows overlap with `df_comp`.
df_hau["in_comp_data"].value_counts()

in_comp_data
False    83471
True       773
Name: count, dtype: int64

In [38]:
# `_row2dict` reads this column to fill the "language" key of each example.
df_hau["lang"] = "hausa"

### Combine sentiment data

In [39]:
# Keep only rows whose input does not occur in `df_comp`.
df_swa = df_swa[~df_swa["in_comp_data"]].copy()
df_hau = df_hau[~df_hau["in_comp_data"]].copy()

In [40]:
# Total number of sentiment rows across both languages.
len(df_swa) + len(df_hau)

90763

In [41]:
# When True, every example gets the fixed per-language prompt below; when False, the
# row's own instruction text is used.
use_custom_instructions = False

custom_instructions_sent = {
    "swahili": (
        "Changanua mawazo ya matini yanayofuata na uainishe matini hayo katika mojawapo ya lebo zifuatazo. "
        "Chanya: iwapo matini yanadokeza mawazo, mtazamo na hali chanya ya kihisia. "
        "Hasi: iwapo matini yanadokeza mawazo au hisia hasi. "
        "Wastani: iwapo matini hayadokezi lugha chanya au hasi kwa njia ya moja kwa moja au isiyo ya moja kwa moja."
    ),
    "hausa": (
        "Tantance ra’ayin wannan rubutu kuma a rarraba rubutun zuwa ɗaya daga cikin waɗannan rukunoni. "
        "Kyakkyawa: idan rubutu yana nuna kyakkyawan tunani, hali, da yanayin motsin rai. "
        "Korau: idan rubutu yana nuna mummunar tunani ko motsin rai. "
        "Tsaka-tsaki: idan rubutu baya nufin magana mai kyau ko mara kyau kai tsaye ko a kaikaice."
    )
}

# Turn both language frames into example dicts, Swahili first and Hausa second.
sent_examples = []
dfs = [df_swa, df_hau]
for df_ in dfs:
    # Replace any remaining missing instruction with an empty string.
    df_["instruction"] = df_["instruction"].fillna("")
    for _, row in df_.iterrows():
        d = _row2dict(row)
        d["instruction"] = (
            custom_instructions_sent[d["language"]]
            if use_custom_instructions
            else row["instruction"]
        )
        sent_examples.append(d)

In [42]:
# Every row of both frames must have produced exactly one example.
assert len(sent_examples) == len(df_swa) + len(df_hau)

In [None]:
import json

# Persist the sentiment examples so the final "combine" section can reload them.
sent_sample_path = basedir/"inkuba_instruct_sample_sent.json"
with sent_sample_path.open("w") as fh:
    json.dump(sent_examples, fh)

## XNLI data

In [43]:
if extract_xnli_from_hub:
    # Use a dedicated name for the XNLI split map. Rebinding `splits` here replaced the
    # Inkuba-Instruct split map defined near the top of the notebook, so re-running an
    # earlier extraction cell afterwards failed with a KeyError.
    xnli_splits = {'train': 'sw/train-*.parquet', 'test': 'sw/test-*.parquet', 'validation': 'sw/validation-*.parquet'}
    # Only the Swahili training split is copied to local parquet.
    ddf = dd.read_parquet("hf://datasets/facebook/xnli/" + xnli_splits["train"])
    ddf.to_parquet(basedir/"xnli_swa_train")

In [44]:
# NOTE: `df_comp` is rebound here; from this cell on it holds XNLITrain.csv instead of
# the sentiment CSV loaded earlier.
df_comp = pd.read_csv(basedir/"data/XNLITrain.csv")
print(df_comp.shape)
# Texts from the `inputs` column; the generation loop skips XNLI rows whose hypothesis is in this set.
xnli_inputs = set(df_comp["inputs"].tolist())

(400, 6)


In [45]:
# Reload the Swahili XNLI training split from the local parquet copy and count its rows.
ddf = dd.read_parquet(basedir/"xnli_swa_train")
len(ddf)

392702

Look at class distribution:

In [46]:
# Number of rows per integer class label.
ddf["label"].value_counts().compute()

label
0    130899
1    130900
2    130903
Name: count, dtype: int64

### Create the dataset

 - Create the Swahili dataset by selecting 10,000 examples from each class.
 - Create the Hausa dataset by translating the Swahili examples using the NLLB-200 model

Load the `NLLB-200` model and define the translation function:

In [47]:
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

model_id = "facebook/nllb-200-3.3B"
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the translation model in bfloat16, with `device` passed as its device map.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=device, torch_dtype=torch.bfloat16)
tokenizer = NllbTokenizer.from_pretrained(model_id)
# `get_memory_footprint()` is divided by 1024**3 to report the size in GB.
print(f"Memory footprint: {model.get_memory_footprint() / 1024**3 :.2f}GB")

  warn(
2025-04-09 10:10:44.462510: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-09 10:10:44.462608: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-09 10:10:44.464109: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 10:10:44.471344: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Memory footprint: 6.24GB


In [48]:
def translate(
    article, model, tokenizer, src_lang="dyu_Latn", tgt_lang="fra_Latn",
    max_length=30, num_beams=1, do_sample=False, temperature=None
):
    """
    Translates a given text using a specified model and tokenizer.

    Args:
        article (`str` or list of `str`):
            The text(s) to be translated from the source language to the target language.
            A list is encoded as one padded batch.
        model (`transformers.PreTrainedModel`):
            The pre-trained model used for generating translations. Its `device`
            attribute decides where the encoded inputs are placed.
        tokenizer (`transformers.PreTrainedTokenizer`):
            The tokenizer used for encoding the input text and decoding the generated output.
            Its `src_lang` attribute is overwritten with `src_lang` on every call.
        src_lang (`str`, optional):
            The source language code for the tokenizer. Default is `"dyu_Latn"`.
        tgt_lang (`str`, optional):
            The target language code for the tokenizer. Default is `"fra_Latn"`.
        max_length (`int`, optional):
            The maximum length of the generated translation. Default is `30`.
            (Inputs are separately truncated to 128 tokens when encoded.)
        num_beams (`int`, optional):
            The number of beams for beam search. Default is `1`. If set to `1`, it uses
            greedy decoding.
        do_sample (`bool`, optional):
            Whether to use sampling instead of greedy decoding. Default is `False`.
        temperature (`float`, optional):
            The temperature to use for sampling. Higher values (e.g., 1.0) result in more
            diverse outputs, while lower values (e.g., 0.5) make the output more deterministic.
            Default is `None`.

    Returns:
        list of `str`:
            The translated text(s) as a list of strings. If the input is a single string,
            the output will be a list with one translation.
    """
    tokenizer.src_lang = src_lang
    inputs = tokenizer(
        article, return_tensors='pt', padding=True, truncation=True, max_length=128
    )
    # Place the batch on the device the model actually lives on. Checking only
    # whether CUDA exists breaks when the model was loaded on the CPU (or on a
    # non-default GPU) on a machine that does have CUDA.
    inputs = inputs.to(model.device)
    translated_tokens = model.generate(
        **inputs,
        # Force the first generated token to be the target-language code.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_length=max_length,
        num_beams=num_beams,
        do_sample=do_sample,
        temperature=temperature
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

Generate translations. This takes a while...

In [None]:
# Prompt template; `{lang}` is filled with "Swahili" or "Hausa" per example.
# NOTE(review): the text contains a duplicated "is to  is to". It is left untouched here
# because it is part of the generated data — confirm whether it is intentional.
instruction = (
    "You will be given two {lang} sentences. Your job is to  is to predict textual entailment. "
    "In other words, does sentence A imply/contradict/neither sentence B. "
    "Your response must be one word: entailment, contradiction, or neutral."
)
print(instruction)

# Integer class label -> answer word stored as the example's "output".
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}

# Collect at most `max_per_class` source rows per label; each accepted row yields two
# examples (the Swahili original and its Hausa translation).
max_per_class = 10_000
counts = {0: 0, 1: 0, 2: 0}
qa_data = []
for _, row in ddf.iterrows():
    tgt = row["label"]
    # This class is already full: skip without translating.
    if counts[tgt] >= max_per_class:
        continue
    # Skip rows whose hypothesis occurs in `xnli_inputs` (built from XNLITrain.csv).
    if row["hypothesis"] not in xnli_inputs:
        # NOTE(review): these dicts use the key "lang", whereas `_row2dict` (MT and
        # sentiment examples) uses "language" — confirm which key downstream code expects.
        d = {
            "task": "qa",
            "instruction": instruction.format(lang="Swahili"),
            "input": f"Sentence A: {row['premise']}\nSentence B: {row['hypothesis']}",
            "output": label_map[tgt],
            "lang": "swahili"
        }
        qa_data.append(d)
        # translate the sentences to Hausa
        # Premise and hypothesis go through the model together as one batch of two.
        hau_prem, hau_hyp = translate(
            [row["premise"], row["hypothesis"]], model, tokenizer, src_lang="swh_Latn", tgt_lang="hau_Latn",
            max_length=100, num_beams=4, do_sample=False, temperature=None
        )
        d = {
            "task": "qa",
            "instruction": instruction.format(lang="Hausa"),
            "input": f"Sentence A: {hau_prem}\nSentence B: {hau_hyp}",
            "output": label_map[tgt],
            "lang": "hausa"
        }
        qa_data.append(d)
        # Count the source row once, after both language versions were added.
        counts[tgt] += 1
        if sum(counts.values()) % 1000 == 0:
            print(f"Processed {sum(counts.values())} of {max_per_class*len(counts)} samples")
        # Stop scanning as soon as every class has reached its quota.
        if all([c == max_per_class for c in counts.values()]):
            break
len(qa_data)

In [None]:
import json

# Persist the XNLI-derived examples so the final "combine" section can reload them.
qa_sample_path = basedir/"inkuba_instruct_sample_qa.json"
with qa_sample_path.open("w") as fh:
    json.dump(qa_data, fh)

## Combine all

In [49]:
# Load the three local reference CSVs (sentiment, XNLI, MT); the next cells use them
# to filter overlapping examples out of the combined dataset.
comp_sent = pd.read_csv(basedir/"data/SentimentTrain.csv")
comp_qa = pd.read_csv(basedir/"data/XNLITrain.csv")
comp_mt = pd.read_csv(basedir/"data/MTTrain.csv")

In [50]:
# Replace each frame by the set of its input texts for fast membership tests.
comp_sent = set(comp_sent["inputs"])
comp_mt = set(comp_mt["inputs"])

# Rebuild the "Sentence A: ...\nSentence B: ..." string so it has the same format as
# the "input" field of the generated XNLI examples.
comp_qa["inputs"] = "Sentence A: " + comp_qa["premise"] + "\nSentence B: " + comp_qa["inputs"]
comp_qa = set(comp_qa["inputs"])

In [52]:
import json

# Reload the three per-task example files written by the earlier sections.
with open(basedir/"inkuba_instruct_sample_sent.json", "r") as f:
    sent_examples = json.load(f)

with open(basedir/"inkuba_instruct_sample_mt.json", "r") as f:
    mt_examples = json.load(f)

with open(basedir/"inkuba_instruct_sample_qa.json", "r") as f:
    qa_examples = json.load(f)

# Example counts per task: (sentiment, MT, XNLI).
len(sent_examples), len(mt_examples), len(qa_examples)

(90763, 199999, 60000)

In [53]:
# Drop every example whose "input" text occurs in the matching reference set.
sent_examples = [d for d in sent_examples if d["input"] not in comp_sent]
mt_examples = [d for d in mt_examples if d["input"] not in comp_mt]
qa_examples = [d for d in qa_examples if d["input"] not in comp_qa]
len(sent_examples), len(mt_examples), len(qa_examples)

(90763, 199987, 60000)

In [54]:
# Concatenate the three task lists: sentiment first, then MT, then XNLI.
data = sent_examples + mt_examples + qa_examples
len(data)

350750

In [55]:
# Count the examples per task; a task name outside these three raises a KeyError.
tasks = dict.fromkeys(("qa", "sentiment", "mmt"), 0)
for example in data:
    tasks[example["task"]] += 1
tasks

{'qa': 60000, 'sentiment': 90763, 'mmt': 199987}

In [None]:
# Write the combined dataset of all three tasks to a single JSON file.
combined_sample_path = basedir/"inkuba_instruct_sample.json"
with combined_sample_path.open("w") as fh:
    json.dump(data, fh)