In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wiki-articles-2/cleaned_file.csv
/kaggle/input/cc-matrix/ccmatrix_ur_en_train.csv
/kaggle/input/opus-dataset/val.csv
/kaggle/input/opus-dataset/train.csv
/kaggle/input/opus-dataset/test.csv


In [2]:
import pandas as pd

def csv_to_txt(csv_files, output_txt):
    """
    Convert multiple CSV files into a single TXT file.

    Args:
    csv_files (list of str): List of paths to the input CSV files.
    output_txt (str): Path for the output TXT file.
    """
    try:
        with open(output_txt, 'w', encoding='utf-8') as txt_file:
            for csv_file in csv_files:
                # Read the CSV file
                df = pd.read_csv(csv_file)

                # Check if the DataFrame has exactly two columns
                if df.shape[1] != 2:
                    raise ValueError(f"The CSV file {csv_file} must contain exactly two columns.")

                # Write each row to the TXT file
                for _, row in df.iterrows():
                    urdu_text = row.iloc[0]  # First column (Urdu) using iloc
                    english_text = row.iloc[1]  # Second column (English) using iloc

                    # Write the line to the TXT file
                    # Adjust separator if needed (e.g., '\t' for tab or ' ' for space)
                    txt_file.write(f"{urdu_text}\t{english_text}\n")

        print(f"Successfully created {output_txt} with combined data from the CSV files.")

    except Exception as e:
        print(f"Error: {e}")

# Example usage
csv_files = [
    '/kaggle/input/cc-matrix/ccmatrix_ur_en_train.csv',   # Path to first CSV file
    '/kaggle/input/opus-dataset/train.csv',  # Path to second CSV file
    '/kaggle/input/opus-dataset/val.csv',
    '/kaggle/input/wiki-articles-2/cleaned_file.csv',# Path to third CSV file
    '/kaggle/input/opus-dataset/test.csv'
]
output_txt_path = '/kaggle/working/combined_data.txt'  # Output TXT file path

csv_to_txt(csv_files, output_txt_path)


Successfully created /kaggle/working/combined_data.txt with combined data from the CSV files.


In [3]:
line_count = 0
with open("/kaggle/working/combined_data.txt", 'r') as file:
    for line in file:
        line_count += 1
print(f"Number of lines: {line_count}")


Number of lines: 7036078


## Tokenizer Training

In [4]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import Unigram
from tokenizers.normalizers import Sequence, NFKD, StripAccents, Replace
from tokenizers.pre_tokenizers import Metaspace
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import Metaspace as MetaspaceDecoder
from transformers import PreTrainedTokenizerFast
import torch

class Normalizer:
    def __init__(self):
        self.normalizer = Sequence([
            Replace("``", '"'),
            Replace("''", '"'),
            NFKD(),
            StripAccents(),
            Replace(r" {2,}", " ")
        ])

class PreTokenizer:
    def __init__(self):
        self.pre_tokenizer = Metaspace()

class PostProcessor:
    def __init__(self, tokenizer):
        # The tokenizer instance is passed to access token IDs after training.
        self.tokenizer = tokenizer
        self.post_processor = None  # Will be initialized in the setup method.

    def setup(self):
        # Set up the post-processor after the tokenizer is trained
        self.post_processor = TemplateProcessing(
            single="<s>:0 $A:0 </s>:1",
            pair="<s>:0 $A:0 </s>:1 $B:1 </s>:2",
            special_tokens=[
                ("<s>", self.tokenizer.token_to_id("<s>")),
                ("</s>", self.tokenizer.token_to_id("</s>")),
            ]
        )

class Decoder:
    def __init__(self):
        self.decoder = MetaspaceDecoder()

class TokenizerTrainer:
    def __init__(self, vocab_size=50000, special_tokens=["<unk>", "<pad>", "<s>", "</s>"]):
        self.vocab_size = vocab_size
        self.special_tokens = special_tokens

    def train(self, tokenizer, corpus_file):
        trainer = trainers.UnigramTrainer(
            vocab_size=self.vocab_size,
            special_tokens=self.special_tokens,
            unk_token="<unk>"
        )
        tokenizer.train_from_iterator(self.get_training_corpus(corpus_file), trainer=trainer)

    @staticmethod
    def get_training_corpus(corpus_file):
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()

class UnigramTokenizer:
    def __init__(self, vocab_size=32000, corpus_file='vocab.txt'):
        self.tokenizer = Tokenizer(Unigram())
        self.wrapped_tokenizer = None

        # Initialize components
        self.normalizer = Normalizer()
        self.pre_tokenizer = PreTokenizer()
        self.trainer = TokenizerTrainer(vocab_size)
        self.post_processor = PostProcessor(self.tokenizer)
        self.decoder = Decoder()

        self._initialize_tokenizer(corpus_file)

    def _initialize_tokenizer(self, corpus_file):
        # Set up the tokenizer components
        self.tokenizer.normalizer = self.normalizer.normalizer
        self.tokenizer.pre_tokenizer = self.pre_tokenizer.pre_tokenizer
        self.trainer.train(self.tokenizer, corpus_file)

        # Set up the post-processor after training the tokenizer
        self.post_processor.setup()
        self.tokenizer.post_processor = self.post_processor.post_processor

        # Set up the decoder
        self.tokenizer.decoder = self.decoder.decoder

        # Wrap in PreTrainedTokenizerFast
        self.wrapped_tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=self.tokenizer,
            bos_token="<s>",
            eos_token="</s>",
            unk_token="<unk>",
            pad_token="<pad>",
            padding_side="right",
        )

    def encode(self, text_samples, max_length=512, return_tensors='pt'):
        """
        Encodes a batch of text samples.

        Args:
            text_samples (list or str): A list of sentences or a single sentence to be encoded.
            max_length (int): Maximum length of the output sequences.
            return_tensors (str): The type of tensor to return ('pt' for PyTorch, 'tf' for TensorFlow).

        Returns:
            Tensor: Encoded token IDs as a tensor of specified type.
        """
        if isinstance(text_samples, str):
            text_samples = [text_samples]  # Convert to a list if a single sentence is passed.

        # Encode the batch of sentences
        encoded = self.wrapped_tokenizer(
            text_samples,
            return_tensors=return_tensors,
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )
        
        return encoded['input_ids']  # Return only the input IDs

    def decode(self, encoded_batch, skip_special_tokens=True):
        """
        Decodes a batch of encoded token IDs.

        Args:
            encoded_batch (Tensor): A tensor of encoded token IDs.
            skip_special_tokens (bool): Whether to skip special tokens during decoding.

        Returns:
            list: A list of decoded sentences.
        """
        # Decode each encoded sentence in the batch
        decoded_batch = [self.wrapped_tokenizer.decode(encoded, skip_special_tokens=skip_special_tokens) for encoded in encoded_batch]
        return decoded_batch

    def save(self, directory):
        self.wrapped_tokenizer.save_pretrained(directory)




In [5]:
# Example usage
if __name__ == "__main__":
    tokenizer = UnigramTokenizer(corpus_file='/kaggle/working/combined_data.txt')

    # Test batch of sentences
    test_sentences = [
        "The sun sets in the west, painting the sky orange and pink.",
        "She enjoys long walks on the beach during sunset.",
        "I love to run."
    ]

    # Batch encode
    encoded_batch = tokenizer.encode(test_sentences, max_length=100, return_tensors='tf')
    print(f"Encoded batch: {encoded_batch}")

    # Batch decode
    decoded_batch = tokenizer.decode(encoded_batch, skip_special_tokens=False)
    print(f"Decoded batch: {decoded_batch}")

    # Save the tokenizer
    tokenizer.save("./32k_tokenizer")








Encoded batch: [[    2   295  1275  6528    23     5  8075     9 19789     5  2053 26010
     10     4   965 16970    11     3     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1]
 [    2  4120  2108    33   948 12525    71     5 14399  1305 19812    11
      3     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1     1     1     1     1     1     1   

## Tokenizer

In [None]:
from transformers import PreTrainedTokenizerFast

# Load the tokenizer from the saved directory
tokenizer = PreTrainedTokenizerFast.from_pretrained("/kaggle/input/tokenizer")

# Now you can use the loaded tokenizer to encode/decode text
test_sentences = [
    "The sun sets in the west, painting the sky orange and pink.",
    "She enjoys long walks on the beach during sunset.",
    "I love to run."
]

# Batch encode
encoded_batch = tokenizer(test_sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
print(f"Encoded batch: {encoded_batch['input_ids']}")

# Batch decode
decoded_batch = [tokenizer.decode(encoded, skip_special_tokens=False) for encoded in encoded_batch['input_ids']]
print(f"Decoded batch: {decoded_batch}")


In [None]:
with open("/kaggle/working/combined_data.txt", 'r') as file:
    sentences = file.readlines()

In [None]:
sentences[:-10]