**For singleData**

In [None]:
import random
import os

# Define paths to your dataset files
file_eng_latn_mal_mlym = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-mal_Mlym/train.eng_Latn"
file_mal_mlym = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-mal_Mlym/train.mal_Mlym"

# Function to read and split dataset
def split_dataset(file_eng_latn_mal_mlym, file_mal_mlym, train_size, test_size, val_size):
    with open(file_eng_latn_mal_mlym, 'r', encoding='utf-8') as f_eng, open(file_mal_mlym, 'r', encoding='utf-8') as f_mal:
        eng_lines = f_eng.readlines()
        mal_lines = f_mal.readlines()
        
        # Combine English and Malayalam lines into pairs
        data = list(zip(eng_lines, mal_lines))
        
        # Shuffle the data
        random.shuffle(data)
        
        # Split into train, test, and validation sets
        train_data = data[:train_size]
        test_data = data[train_size:train_size + test_size]
        val_data = data[train_size + test_size:train_size + test_size + val_size]
        
        return train_data, test_data, val_data

# Define the number of samples for train, test, and validation
train_size = 30000
test_size = 2000
val_size = 2000

# Split the dataset
train_data, test_data, val_data = split_dataset(file_eng_latn_mal_mlym, file_mal_mlym, train_size, test_size, val_size)

# Output the first few samples of each split for verification
print(f"Number of training samples: {len(train_data)}")
print(f"Number of testing samples: {len(test_data)}")
print(f"Number of validation samples: {len(val_data)}")

# Optionally, you can write these splits to new files if needed
# Example:
output_dir = "/path/to/output/directory"
os.makedirs(output_dir, exist_ok=True)

def write_dataset(data, output_file_eng, output_file_mal):
    with open(output_file_eng, 'w', encoding='utf-8') as f_eng, open(output_file_mal, 'w', encoding='utf-8') as f_mal:
        for eng, mal in data:
            f_eng.write(eng)
            f_mal.write(mal)


In [None]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_dict({'translations': train_data})
val_dataset = Dataset.from_dict({'translations': val_data})
test_dataset = Dataset.from_dict({'translations': test_data})

# Create DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Print out the dataset_dict structure
print(dataset_dict)

**For MultipleData**

In [1]:
file_eng_latn_mal_mlym = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-mal_Mlym/train.eng_Latn"
file_mal_mlym = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-mal_Mlym/train.mal_Mlym"

with open(file_eng_latn_mal_mlym, 'r', encoding='utf-8') as f_eng_latn, open(file_mal_mlym, 'r', encoding='utf-8') as f_mal_mlym:
    eng_latn_mal_mlym_data = f_eng_latn.readlines()
    mal_mlym_data = f_mal_mlym.readlines()

# Combine into one dataset
eng_latn_mal_mlym_pairs = list(zip(eng_latn_mal_mlym_data, mal_mlym_data))


In [2]:
file_eng_latn_tam_taml = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-hin_Deva/train.eng_Latn"
file_tam_taml = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-hin_Deva/train.hin_Deva"

with open(file_eng_latn_tam_taml, 'r', encoding='utf-8') as f_eng_latn_tam_taml, open(file_tam_taml, 'r', encoding='utf-8') as f_tam_taml:
    eng_latn_tam_taml_data = f_eng_latn_tam_taml.readlines()
    tam_taml_data = f_tam_taml.readlines()

# Combine into one dataset
eng_latn_tam_taml_pairs = list(zip(eng_latn_tam_taml_data, tam_taml_data))


In [3]:
# Format English to Malayalam dataset
formatted_eng_mal_dataset = [
    f"{eng.strip()} #ml#> {mal.strip()}" for eng, mal in eng_latn_mal_mlym_pairs
]

# Format English to Hindi dataset
formatted_eng_hi_dataset = [
    f"{eng.strip()} #hi#> {tam.strip()}" for eng, tam in eng_latn_tam_taml_pairs
]


In [4]:
import random

# Combine the datasets
combined_dataset = formatted_eng_mal_dataset + formatted_eng_hi_dataset

In [5]:
import random

# # Shuffle the combined dataset
# random.shuffle(combined_dataset)

# Define the number of examples for each split
train_size = 30000
test_size = 2000
validation_size = 2000

# Initialize counters for each category (#ml#> and #hi#>)
ml_count_train = 0
hi_count_train = 0
ml_count_test = 0
hi_count_test = 0
ml_count_val = 0
hi_count_val = 0

# Initialize lists for train, test, and validation datasets
train_dataset = []
test_dataset = []
validation_dataset = []

# Iterate through the combined dataset
for pair in combined_dataset:
    if '#ml#>' in pair:
        if ml_count_train < train_size / 2:
            train_dataset.append(pair)
            ml_count_train += 1
        elif ml_count_test < test_size / 2:
            test_dataset.append(pair)
            ml_count_test += 1
        elif ml_count_val < validation_size / 2:
            validation_dataset.append(pair)
            ml_count_val += 1
    elif '#hi#>' in pair:
        if hi_count_train < train_size / 2:
            train_dataset.append(pair)
            hi_count_train += 1
        elif hi_count_test < test_size / 2:
            test_dataset.append(pair)
            hi_count_test += 1
        elif hi_count_val < validation_size / 2:
            validation_dataset.append(pair)
            hi_count_val += 1

# Verify the sizes of each dataset
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")
print(f"Validation Dataset Size: {len(validation_dataset)}")

Train Dataset Size: 30000
Test Dataset Size: 2000
Validation Dataset Size: 2000


In [6]:
from datasets import Dataset, DatasetDict

# Combine the datasets into a single list of dictionaries
combined_data = {
    "translations": train_dataset + validation_dataset + test_dataset
}

# Define lengths for each split
train_length = len(train_dataset)
validation_length = len(validation_dataset)
test_length = len(test_dataset)

# Create DatasetDict
dataset_dict = DatasetDict({
    "train": Dataset.from_dict(combined_data).select(range(train_length)),
    "validation": Dataset.from_dict(combined_data).select(range(train_length, train_length + validation_length)),
    "test": Dataset.from_dict(combined_data).select(range(train_length + validation_length, train_length + validation_length + test_length)),
})

# Print the structure and sizes of the DatasetDict
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['translations'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['translations'],
        num_rows: 2000
    })
})


In [None]:
# from datasets import DatasetDict
# # Shuffle only train and validation datasets
# dataset_dict['train'] = dataset_dict['train'].shuffle(seed=42)
# dataset_dict['validation'] = dataset_dict['validation'].shuffle(seed=42)

# print(dataset_dict)

In [7]:
dataset_dict['validation']['translations'][-1]

'The oldest reference to Rajasthan is found in a stone inscription dated back to 625 CE. #hi#> राजस्थान का सबसे प्राचीन संदर्भ 625 ईस्वी के एक पत्थर के शिलालेख में पाया गया है।'

In [8]:
dataset_dict['validation']['translations'][1]

'See if you can detect any differences between the notes by sight. #ml#> നോട്ടുകൾ തമ്മിൽ കാഴ്ചയില്\u200d എന്തെങ്കിലും വ്യത്യാസം ഉണ്ടോയെന്ന് നോക്കൂ.'

In [9]:
dataset_dict_shuffled = dataset_dict.shuffle(seed=42)

print(dataset_dict_shuffled)

DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['translations'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['translations'],
        num_rows: 2000
    })
})


In [10]:
dataset_dict_shuffled['test']['translations'][-2]

"Check with the central bank of any country you're visiting for information on serial numbers. #ml#> സീരിയൽ നമ്പറുകളെക്കുറിച്ചുള്ള വിവരങ്ങൾക്കായി നിങ്ങൾ സന്ദർശിക്കുന്ന ഏതൊരു രാജ്യത്തിന്\u200dറെയും കേന്ദ്ര ബാങ്കുമായി ബന്ധപ്പെടുക."

In [30]:
from datasets import Dataset, DatasetDict

# Assume dataset_dict_shuffled is already defined and shuffled

# Filter out English to Hindi pairs (#hi#>) from the 'test' split
def filter_hi(example):
    return '#hi#>' in example['translations']

# Apply the filter to the 'test' split
test_hi_pairs = dataset_dict_shuffled['test'].filter(filter_hi)

# Create a new DatasetDict for the filtered pairs
dataset_dict_hi_test = DatasetDict({
    "test_hi": test_hi_pairs
})

# Print the structure and size of the new DatasetDict
print(dataset_dict_hi_test)


Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    test_hi: Dataset({
        features: ['translations'],
        num_rows: 1000
    })
})


In [33]:
dataset_dict_hi_test['test_hi']['translations'][4]

'Wellness checks are routine medical examinations that determine a dog’s overall health. #hi#> स्वास्थ्य की जांच नियमित स्वास्थ्य परीक्षण है जो कुत्ते के समस्त स्वास्थ्य को निर्धारित करती है।'

In [27]:
from datasets import Dataset, DatasetDict

# Assume dataset_dict_shuffled is already defined and shuffled

# Filter out English to Malayalam pairs (#ml#>) from the 'test' split
def filter_ml(example):
    return '#ml#>' in example['translations']

# Apply the filter to the 'test' split
test_ml_pairs = dataset_dict_shuffled['test'].filter(filter_ml)

# Create a new DatasetDict for the filtered pairs
dataset_dict_ml_test = DatasetDict({
    "test_ml": test_ml_pairs
})

# Print the structure and size of the new DatasetDict
print(dataset_dict_ml_test)

# Accessing information about the new DatasetDict



Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    test_ml: Dataset({
        features: ['translations'],
        num_rows: 1000
    })
})


In [29]:
dataset_dict_ml_test['test_ml']['translations'][-4]

"Surfaces that aren't slate can warp as they age while a slate surface will stay level and won't wear down. #ml#> ഒരു സ്ലേറ്റ് ഉപരിതലം കേടുവരാതെ നിരപ്പായി തന്നെ നിലകൊള്ളുമ്പോൾ സ്ലേറ്റ് അല്ലാത്ത ഉപരിതലങ്ങൾ കാലം കൂടുന്നതിനനുസരിച്ച് കേടുവരാൻ സാധ്യതയുണ്ട്."

In [112]:


# # Shuffle the combined dataset
# random.shuffle(combined_dataset)

# Split the dataset
total_size = len(combined_dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

train_data = combined_dataset[:train_size]
val_data = combined_dataset[train_size:train_size + val_size]
test_data = combined_dataset[train_size + val_size:]


In [114]:
combined_dataset[9]

'Lab tests conducted by Panat and Varanasi showed that the drop in energy output from the panels is steep, occurs at the very beginning of the process of dust accumulation, and can easily mark a 30% reduction in output after a month without cleaning. Even a 1% reduction in power, for a 150-megawatt solar installation, they calculated, could result in a $200,000 loss in annual revenue. #ml#> പാനലുകളുടെ ഊർജ്ജ ഉൽപ്പാദനം വേഗത്തിൽ കുറയുന്നു, പൊടി അടിഞ്ഞുകൂടാൻ തുടങ്ങുമ്പോൾ തന്നെ കുറയാൻ തുടങ്ങുന്നു, വൃത്തിയാക്കാതെ തന്നെ ഒരു മാസത്തിന് ശേഷം എളുപ്പത്തിൽ 30% കുറയും എന്ന് ലബോറട്ടറി പഠനങ്ങളിൽ നിന്ന് പാനറ്റും വാരണാസിയും കണ്ടെത്തി. 150 മെഗാവാട്ട് സോളാർ ഇൻസ്റ്റാളേഷനായി വൈദ്യുതിയിൽ 1% കുറവുണ്ടായാൽ പോലും, വാർഷിക വരുമാനത്തിൽ $200,000 നഷ്ടം സംഭവിക്കുമെന്ന് അവർ കണക്കാക്കി.'

In [46]:
def save_dataset(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(data))

save_dataset(train_data, "train_dataset.txt")
save_dataset(val_data, "val_dataset.txt")
save_dataset(test_data, "test_dataset.txt")


In [47]:
from datasets import Dataset, DatasetDict

def load_dataset_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return Dataset.from_dict({"text": [line.strip() for line in lines]})

train_dataset = load_dataset_from_file("train_dataset.txt")
val_dataset = load_dataset_from_file("val_dataset.txt")
test_dataset = load_dataset_from_file("test_dataset.txt")

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


In [48]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 65512
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 8189
    })
    test: Dataset({
        features: ['text'],
        num_rows: 8190
    })
})

In [49]:
from datasets import Dataset, DatasetDict

def load_dataset_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return Dataset.from_dict({"translations": [line.strip() for line in lines]})

train_dataset = load_dataset_from_file("train_dataset.txt")
val_dataset = load_dataset_from_file("val_dataset.txt")
test_dataset = load_dataset_from_file("test_dataset.txt")

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 65512
    })
    validation: Dataset({
        features: ['translations'],
        num_rows: 8189
    })
    test: Dataset({
        features: ['translations'],
        num_rows: 8190
    })
})


In [60]:
dataset_dict['train'][9000]

{'translations': 'It is ritually the most valued species in the state. #mar#> ही राज्यातील विधीभूत सर्वात मौल्यवान प्रजाती आहे.'}

In [61]:
dataset_dict['train'][-8700]

{'translations': "Sandhyakar Nandi's semi-fictional epic Ramacharitam (12th century) is an important source of Pala history. #hi#> संध्याकर नंदी का अर्ध-काल्पनिक महाकाव्य रामचरितमानस (12वीं शताब्दी) पाल इतिहास का एक महत्वपूर्ण स्रोत है।"}

In [74]:
dataset_dict['test'][-8189]

{'translations': 'There have been attempts at conservation and reforestation. #hi#> वहाँ संरक्षण और वनीकरण के प्रयास किए गए हैं।'}

In [62]:
import pandas as pd

# Convert the Hugging Face datasets to Pandas DataFrames
train_df = pd.DataFrame(dataset_dict['train']["translations"], columns=["translations"])
test_df = pd.DataFrame(dataset_dict['test']["translations"], columns=["translations"])


In [64]:
len(train_df)

75694

In [65]:
len(test_df)

9463

In [73]:
test_df['translations'][9462]

'He was followed by his two sons who became kings in succession. #hi#> उनके बाद उनके दो बेटे हुए जो लगातार राजा बने।'

In [63]:
# Select 15,000 examples from train for English to Malayalam
train_ml = train_df[train_df['translations'].str.contains('#ml#>')].sample(n=15000, random_state=42)

# Select 15,000 examples from train for English to Hindi
train_hi = train_df[train_df['translations'].str.contains('#hi#>')].sample(n=15000, random_state=42)

# Select 1,000 examples from test for English to Malayalam
test_ml = test_df[test_df['translations'].str.contains('#ml#>')].sample(n=1000, random_state=42)

# Select 1,000 examples from test for English to Hindi
test_hi = test_df[test_df['translations'].str.contains('#hi#>')].sample(n=1000, random_state=42)


ValueError: a must be greater than 0 unless no samples are taken

In [54]:
import pandas as pd

# Assuming you have already loaded train and test datasets into DataFrames train_df and test_df

# Select 15,000 examples from train for English to Malayalam
train_ml = train_df[train_df['translations'].str.contains('#ml#>')].sample(n=15000, random_state=42)

# Select 15,000 examples from train for English to Hindi
train_hi = train_df[train_df['translations'].str.contains('#hi#>')].sample(n=15000, random_state=42)

# Select 1,000 examples from test for English to Malayalam if available
if len(test_df[test_df['translations'].str.contains('#ml#>')]) > 1000:
    test_ml = test_df[test_df['translations'].str.contains('#ml#>')].sample(n=1000, random_state=42)
else:
    test_ml = test_df[test_df['translations'].str.contains('#ml#>')]

# Select 1,000 examples from test for English to Hindi if available
if len(test_df[test_df['translations'].str.contains('#hi#>')]) > 1000:
    test_hi = test_df[test_df['translations'].str.contains('#hi#>')].sample(n=1000, random_state=42)
else:
    test_hi = test_df[test_df['translations'].str.contains('#hi#>')]

print("Train ML shape:", train_ml.shape)
print("Train HI shape:", train_hi.shape)
print("Test ML shape:", test_ml.shape)
print("Test HI shape:", test_hi.shape)


Train ML shape: (15000, 1)
Train HI shape: (15000, 1)
Test ML shape: (0, 1)
Test HI shape: (1000, 1)


In [62]:
from datasets import Dataset, DatasetDict

# Combine the selected data
selected_train_data = pd.concat([train_ml, train_hi])
selected_test_data = pd.concat([test_ml, test_hi])

# Create new Dataset objects
selected_train_dataset = Dataset.from_pandas(selected_train_data)
selected_test_dataset = Dataset.from_pandas(selected_test_data)

# Create new DatasetDict
selected_dataset_dict = DatasetDict({
    "train": selected_train_dataset,
    "test": selected_test_dataset
})

print(selected_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['translations', '__index_level_0__'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['translations', '__index_level_0__'],
        num_rows: 2000
    })
})


In [63]:
from datasets import DatasetDict

# Assuming selected_dataset_dict is already defined as in your previous messages

# Remove the '__index_level_0__' column from train dataset
selected_dataset_dict['train'] = selected_dataset_dict['train'].remove_columns('__index_level_0__')

# Remove the '__index_level_0__' column from test dataset
selected_dataset_dict['test'] = selected_dataset_dict['test'].remove_columns('__index_level_0__')

print(selected_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['translations'],
        num_rows: 2000
    })
})


In [64]:
from datasets import DatasetDict

# Assuming selected_dataset_dict is already defined as in your previous messages

# Shuffle the train dataset
selected_dataset_dict['train'] = selected_dataset_dict['train'].shuffle()

# Shuffle the test dataset
selected_dataset_dict['test'] = selected_dataset_dict['test'].shuffle()

print(selected_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['translations'],
        num_rows: 2000
    })
})


# SetUp Directory

In [1]:
%pwd

'/kaggle/working'

In [2]:
%cd ..

/kaggle


In [3]:
%ls

[0m[01;34minput[0m/  [01;34mlib[0m/  [01;34mworking[0m/


In [11]:
# %mkdir working/results/
![ ! -d working/results/ ] && mkdir -p working/results/

# Necessary Installs and Imports

## Installs

In [1]:
!pip install -U datasets transformers trl accelerate peft bitsandbytes

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.9.4-py3-none-any.whl.metadata (11 kB)
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting docstring-parser>=0.16 (from tyro>=0.5.11->trl)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 k

## HuggingFace SetUp

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [7]:
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

## Imports

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import torch

2024-07-02 14:17:51.950357: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 14:17:51.950461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 14:17:52.089801: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Model SetUp

In [3]:
from huggingface_hub import login

# Log in to Hugging Face Hub
api_token = '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'
login(api_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [15]:
model_name = "meta-llama/Llama-2-7b-hf"

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=bnb_config, device_map={"": 0})
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

## Tokenizer

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, add_eos_token=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## Load Model

In [12]:
dataset = load_dataset("musfiqdehan/preprocessed-BanglaNMT-sm")

Downloading readme:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/31.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/164084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20511 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20511 [00:00<?, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 164084
    })
    validation: Dataset({
        features: ['translations'],
        num_rows: 20511
    })
    test: Dataset({
        features: ['translations'],
        num_rows: 20511
    })
})

In [16]:
dataset['train']['translations'][9]

'শারিব আলী নামের একজন তরুণদের প্লাটফর্ম পুলইজওয়ানএ আয়োজন সম্পর্কে বলেছেন ###>Sharib Ali on the youth platform PulEJawan described the event\n'

In [10]:
from datasets import DatasetDict, Dataset
import pyarrow as pa

# Read the data from the file
with open('/kaggle/input/english-malayalam-valid-llama-fine/validation.txt', 'r', encoding='utf-8') as file:
    translations = file.readlines()

# Prepare data in the format expected by pyarrow
data = {
    'translations': translations
}

# Create a pyarrow Table
arrow_table = pa.Table.from_pydict(data)

# Create the Dataset object
dataset = Dataset(arrow_table)

# Create the DatasetDict object
dataset_dict = DatasetDict({
    'train': dataset
})

# Now 'dataset_dict' contains your data in the desired format
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 4875
    })
})


In [11]:
from datasets import DatasetDict, Dataset
import pyarrow as pa

# Read the data from the file
with open('/kaggle/input/test-llama-trans/test.txt', 'r', encoding='utf-8') as file:
    translations = file.readlines()

# Prepare data in the format expected by pyarrow
data = {
    'translations': translations
}

# Create a pyarrow Table
arrow_table = pa.Table.from_pydict(data)

# Create the Dataset object
dataset = Dataset(arrow_table)

# Create the DatasetDict object
dataset_dict2 = DatasetDict({
    'train': dataset
})

# Now 'dataset_dict' contains your data in the desired format
print(dataset_dict2)


DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 4903
    })
})


# LoRA Configuration

In [17]:
peft_config = LoraConfig(
            lora_alpha=16, 
            lora_dropout=0.05,
            r=16,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules= ["down_proj","up_proj","gate_proj"]
)

In [None]:
# peft_config = LoraConfig(
#             lora_alpha=16,
#             lora_dropout=0.05,
#             r=64,
#             bias="none",
#             task_type="CAUSAL_LM",
#             target_modules= ["q_proj","up_proj","o_proj","k_proj","down_proj","gate_proj","v_proj"]
# )

# Training Hyperparameters

In [18]:
training_arguments = TrainingArguments(
        output_dir="working/results/",
        evaluation_strategy="steps",
        optim="paged_adamw_8bit",
        save_steps=100,
        log_level="debug",
        logging_steps=100,
        learning_rate=1e-4,
        eval_steps=100,
        fp16=True,
        do_eval=True,
        per_device_train_batch_size=48,
        per_device_eval_batch_size=48,
        gradient_accumulation_steps=2,
        warmup_steps=50,
        max_steps=500,
        lr_scheduler_type="linear"
)



# Training with TRL

In [14]:
!nvidia-smi

Thu Mar 14 01:45:46 2024       

+---------------------------------------------------------------------------------------+

| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |

|-----------------------------------------+----------------------+----------------------+

| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |

| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |

|                                         |                      |               MIG M. |


|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |

| N/A   36C    P0              32W / 250W |   5362MiB / 16384MiB |      0%      Default |

|                                         |                      |                  N/A |

+-----------------------------------------+----------------------+----------------------+

                                                        

In [19]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset_dict_shuffled['train'],
        eval_dataset=dataset_dict_shuffled['validation'],
        peft_config=peft_config,
        dataset_text_field="translations",
        max_seq_length=48,
        tokenizer=tokenizer,
        args=training_arguments
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 48
***** Running training *****
  Num examples = 30,000
  Num Epochs = 2
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 2
  Total optimization steps = 500
  Number of trainable parameters = 23,199,744
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,2.3256,1.7884
200,1.7032,1.700193
300,1.6539,1.659043
400,1.5923,1.638666
500,1.576,1.627747



***** Running Evaluation *****
  Num examples = 2000
  Batch size = 48
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Saving model checkpoint to working/results/checkpoint-100
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
 

TrainOutput(global_step=500, training_loss=1.7702031860351561, metrics={'train_runtime': 14411.0482, 'train_samples_per_second': 3.331, 'train_steps_per_second': 0.035, 'total_flos': 9.166063140864e+16, 'train_loss': 1.7702031860351561, 'epoch': 1.6})

# Inference: Translate with Llama 2

## Base Model SetUp

In [20]:
base_model = "meta-llama/Llama-2-7b-hf"
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model, device_map={"": 0}, quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.42.3",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-

## Initialize Adapter (Fine-Tuned-Model)

In [21]:
# Fetched from Kaggle Output
model = PeftModel.from_pretrained(model, "working/results/checkpoint-500/")

In [None]:
# Uploaded to Hugging Face Model Hub
# model = PeftModel.from_pretrained(model, "musfiqdehan/Llama-2-7b-ft-mt-Bengali-to-English-sm")

# Testing Manually

In [25]:
my_text = "Kerala, a state on India's tropical Malabar Coast"

prompt = my_text+" #hi#>"

tokenized_input = tokenizer(prompt, return_tensors="pt")
input_ids = tokenized_input["input_ids"].cuda()

generation_output = model.generate(
        input_ids=input_ids,
        num_beams=6,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=130
)
for seq in generation_output.sequences:
    output = tokenizer.decode(seq, skip_special_tokens=True)
    print(output.split("#hi#>")[1].strip()) 

भारत की तटीय मलबार कोष्ठ


In [26]:
my_text = "Kerala, a state on India's tropical Malabar Coast"

prompt = my_text+" #ml#>"

tokenized_input = tokenizer(prompt, return_tensors="pt")
input_ids = tokenized_input["input_ids"].cuda()

generation_output = model.generate(
        input_ids=input_ids,
        num_beams=6,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=130
)
for seq in generation_output.sequences:
    output = tokenizer.decode(seq, skip_special_tokens=True)
    print(output.split("#ml#>")[1].strip()) 

ഇന്ത്യയിലെ മലബാർ കോ


In [34]:
def translator(text, language_code):
    prompt = text + " " + language_code
    tokenized_input = tokenizer(prompt, return_tensors="pt")
    input_ids = tokenized_input["input_ids"].cuda()

    generation_output = model.generate(
        input_ids=input_ids,
        num_beams=6,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=130
    )
    for seq in generation_output.sequences:
        output = tokenizer.decode(seq, skip_special_tokens=True)
        return output.split(language_code)[1].strip()

# Example usage
my_text = "Kerala, a state on India's tropical Malabar Coast"
translated_text_ml = translator(my_text, "#ml#>")
translated_text_hi = translator(my_text, "#hi#>")

In [36]:
translated_text_hi

'भारत की तटीय मलबार कोष्ठ'

In [35]:
translated_text_ml

'ഇന്ത്യയിലെ മലബാർ കോ'

In [37]:
def translate_texts(translator, dataset, language_code):
    tgt_texts, trans_texts = [], []

    for translation in dataset['translations']:
        src_text, tgt_text = translation.split(language_code)
        translated_text = translator(src_text.strip(), language_code)
        tgt_texts.append(tgt_text.strip())
        trans_texts.append(translated_text)

    return tgt_texts, trans_texts

In [40]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [46]:
!pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.0-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.10.0 sacrebleu-2.4.2


In [41]:
import evaluate
import torch

In [60]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Define compute_metrics function
def compute_metrics(decoded_preds, decoded_labels):
    # Load evaluation metric
    metric = evaluate.load("sacrebleu")
    # Post-process the decoded predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # Compile results
    result = {'bleu': result['score']}
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [61]:
metrics = compute_metrics(tgt_texts, trans_texts)

In [62]:
metrics

{'bleu': 1.3788}

In [64]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Define compute_metrics function
def compute_metrics(decoded_preds, decoded_labels):
    # Load evaluation metrics
    sacrebleu_metric = evaluate.load("sacrebleu")
    chrf_metric = evaluate.load("chrf")
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    sacrebleu_result = sacrebleu_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute CHRF score
    chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)

    result = {
        'bleu': sacrebleu_result['score'],
        'chrf': chrf_result['score']
    }
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [56]:
import numpy as np
import sacrebleu
import evaluate
from datasets import DatasetDict

# Load the BLEU metric from the evaluate library
metric = evaluate.load("bleu")

# Define the postprocess_text function
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

# Define the compute_metrics function
def compute_metrics(tgt_texts, trans_texts):
    # Post-process the target and translated texts
    decoded_preds, decoded_labels = postprocess_text(trans_texts, tgt_texts)
    # Compute BLEU score using the metric
    result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    # Create the result dictionary
    result = {'bleu': result['bleu']}
    # Round the results for readability
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [58]:
metrics = compute_metrics(tgt_texts, trans_texts)

In [59]:
metrics

{'bleu': 0.0003}

In [51]:
from nltk.translate.bleu_score import corpus_bleu

def bleu_score2(tgt_texts, trans_texts):
    # Prepare the target and translated texts for BLEU calculation
    tgt_texts2 = [[tgt_text.split()] for tgt_text in tgt_texts]
    trans_text2 = [translated_text.split() for translated_text in trans_texts]

    # Calculate BLEU scores for different n-gram weights
    bleu_dic = {}
    bleu_dic['1-grams'] = corpus_bleu(tgt_texts2, trans_text2, weights=(1.0, 0, 0, 0))
    bleu_dic['1-2-grams'] = corpus_bleu(tgt_texts2, trans_text2, weights=(0.5, 0.5, 0, 0))
    bleu_dic['1-3-grams'] = corpus_bleu(tgt_texts2, trans_text2, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic['1-4-grams'] = corpus_bleu(tgt_texts2, trans_text2, weights=(0.25, 0.25, 0.25, 0.25))

    # Calculate the average BLEU score
    average_bleu = sum(bleu_dic.values()) / len(bleu_dic)
    bleu_dic['average'] = average_bleu

    return bleu_dic

def bleu_score3(tgt_texts, trans_texts):
    # Prepare the target and translated texts for BLEU calculation
    tgt_texts2 = [[tgt_text.split()] for tgt_text in tgt_texts]
    trans_text2 = [translated_text.split() for translated_text in trans_texts]

    # Calculate BLEU scores for different n-gram weights
    bleu_dic = {}
    bleu_dic['BLEU-1'] = corpus_bleu(tgt_texts2, trans_text2, weights=(1.0, 0, 0, 0))
    bleu_dic['BLEU-2'] = corpus_bleu(tgt_texts2, trans_text2, weights=(0.5, 0.5, 0, 0))
    bleu_dic['BLEU-3'] = corpus_bleu(tgt_texts2, trans_text2, weights=(0.33, 0.33, 0.34, 0))
    bleu_dic['BLEU-4'] = corpus_bleu(tgt_texts2, trans_text2, weights=(0.25, 0.25, 0.25, 0.25))

    # Calculate the average BLEU score as per the paper's formula
    average_bleu = (bleu_dic['BLEU-1'] + bleu_dic['BLEU-2'] + bleu_dic['BLEU-3'] + bleu_dic['BLEU-4']) / 4
    bleu_dic['BLEU_Avg'] = average_bleu * 100  # Multiplying by 100 to match the paper's scaling

    return bleu_dic


In [53]:
bleu_scores2 = bleu_score2(tgt_texts, trans_texts)
bleu_scores3 = bleu_score3(tgt_texts, trans_texts)

In [54]:
bleu_scores2

{'1-grams': 0.0008077470718690841,
 '1-2-grams': 0.000541608227878747,
 '1-3-grams': 0.00041137500109731333,
 '1-4-grams': 0.0002060647376368793,
 'average': 0.000491698759620506}

In [55]:
bleu_scores3

{'BLEU-1': 0.0008077470718690841,
 'BLEU-2': 0.000541608227878747,
 'BLEU-3': 0.0003462997304286147,
 'BLEU-4': 0.0002060647376368793,
 'BLEU_Avg': 0.04754299419533313}

In [43]:
tgt_texts, trans_texts = translate_texts(translator, dataset_dict_hi_test['test_hi'], "#hi#>")

In [48]:
metrics = compute_metrics(tgt_texts, trans_texts)

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [49]:
metrics

{'bleu': 1.3788, 'chrf': 23.6405}

In [63]:
tgt_textsML, trans_textsML = translate_texts(translator, dataset_dict_ml_test['test_ml'], "#ml#>")

In [65]:
metrics = compute_metrics(tgt_textsML, trans_textsML)

In [66]:
metrics

{'bleu': 0.2663, 'chrf': 20.0673}

In [67]:
import sacrebleu
import evaluate

# Load evaluation metrics
sacrebleu_metric = evaluate.load("sacrebleu")
chrf_metric = evaluate.load("chrf")
ter_metric = sacrebleu.metrics.TER()

# Define the postprocess_text function
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

# Define the compute_metrics function
def compute_metrics(decoded_preds, decoded_labels):
    # Post-process the decoded predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    sacrebleu_result = sacrebleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = sacrebleu_result['score']

    # Compute CHRF score
    chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)
    chrf_score = chrf_result['score']

    # Compute TER score using sacrebleu
    ter_score = ter_metric.corpus_score(decoded_preds, [decoded_labels]).score

    # Create the result dictionary
    result = {
        'bleu': round(bleu_score, 4),
        'chrf': round(chrf_score, 4),
        'ter': round(ter_score, 4)
    }
    
    return result

In [70]:
metrics = compute_metrics(tgt_texts, trans_texts)

In [71]:
metrics

{'bleu': 1.3788, 'chrf': 23.6405, 'ter': 688.4011}

In [68]:
metrics = compute_metrics(tgt_textsML, trans_textsML)

In [69]:
metrics

{'bleu': 0.2663, 'chrf': 20.0673, 'ter': 502.6419}

In [74]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Assuming you have your model and tokenizer already loaded and fine-tuned
model_name = "meta-llama/Llama-2-7b-hf"
base_model = "meta-llama/Llama-2-7b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model, device_map={"": 0}, quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

# Load the fine-tuned model
model = PeftModel.from_pretrained(model, "working/results/checkpoint-500/")

# Push the model to Hugging Face Hub
model.push_to_hub("FineTuned-Trans-oneTomany-llama-2-7b")
tokenizer.push_to_hub("FineTuned-Trans-oneTomany-llama-2-7b")


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.42.3",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-

adapter_model.safetensors:   0%|          | 0.00/92.8M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmp65moef8m/tokenizer_config.json
Special tokens file saved in /tmp/tmp65moef8m/special_tokens_map.json
Uploading the following files to ABHIiiii1/FineTuned-Trans-oneTomany-llama-2-7b: special_tokens_map.json,tokenizer.json,tokenizer_config.json,tokenizer.model,README.md


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ABHIiiii1/FineTuned-Trans-oneTomany-llama-2-7b/commit/5e4d3d208228f25924352f6a9de5cfdbd25cd12b', commit_message='Upload tokenizer', commit_description='', oid='5e4d3d208228f25924352f6a9de5cfdbd25cd12b', pr_url=None, pr_revision=None, pr_num=None)

In [6]:
import torch

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer from the Hugging Face Hub
model_name = "ABHIiiii1/FineTuned-Trans-oneTomany-llama-2-7b"  # Replace with your model's path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model to evaluation mode
model.eval()

def translator(text, language_code):
    prompt = text + " " + language_code
    tokenized_input = tokenizer(prompt, return_tensors="pt")
    input_ids = tokenized_input["input_ids"].to(device)

    generation_output = model.generate(
        input_ids=input_ids,
        num_beams=6,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=130
    )
    for seq in generation_output.sequences:
        output = tokenizer.decode(seq, skip_special_tokens=True)
        return output.split(language_code)[1].strip()

# Example usage
my_text = "Kerala, a state on India's tropical Malabar Coast"
translated_text_ml = translator(my_text, "#ml#>")
translated_text_hi = translator(my_text, "#hi#>")

tokenizer_config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/92.8M [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 20.12 MiB is free. Process 2159 has 15.87 GiB memory in use. Of the allocated memory 15.62 GiB is allocated by PyTorch, and 4.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Load the model and tokenizer from the Hugging Face Hub
model_name = "ABHIiiii1/FineTuned-Trans-llama-2-7b"  # Replace with your model's path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(model_name), model_name)

# Move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to("cuda")

# Set the model to evaluation mode
model.eval()

# Prepare the input text
my_text = "Kerala, a state on India's tropical Malabar Coast"
prompt = my_text + " ###>"

# Tokenize the input text
tokenized_input = tokenizer(prompt, return_tensors="pt")
input_ids = tokenized_input["input_ids"].to(device)

# Generate the translation
generation_output = model.generate(
    input_ids=input_ids,
    num_beams=6,
    return_dict_in_generate=True,
    output_scores=True,
    max_new_tokens=130
)

# Decode and print the generated text
for seq in generation_output.sequences:
    output = tokenizer.decode(seq, skip_special_tokens=True)
    print(output.split("###>")[1].strip())


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/92.8M [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 192.12 MiB is free. Process 3238 has 15.71 GiB memory in use. Of the allocated memory 15.45 GiB is allocated by PyTorch, and 5.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF