<a href="https://colab.research.google.com/github/saparbayev-azizbek-12/bi-and-ai-talents-dl/blob/main/lesson-16/Copy_of_Conv_For_Sentence_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentence Classification Examples:

    1. Sentiment Analysis
        1.1 Positive
        1.2 Negative
        I really hate this music -> Negative -> 0
        I love this food -> Positive -> 1
    2. Jumla qaysi tilda yozilganini aniqlash
    3. Names dataset classification
    


In [1]:
%%capture
!pip install unidecode

In [2]:
import torch
from google.colab import drive

drive.mount("/drive")

Mounted at /drive


In [3]:
from unidecode import unidecode

In [4]:
!ls /drive/MyDrive/data/names

ls: cannot access '/drive/MyDrive/data/names': No such file or directory


In [5]:
!cat /drive/MyDrive/data/names/Arabic.txt

cat: /drive/MyDrive/data/names/Arabic.txt: No such file or directory


In [6]:
import os
from glob import glob

In [7]:
root_dir = "/drive/MyDrive/names"
file_names = glob("*.txt", root_dir=root_dir)
unique_labels = sorted([os.path.splitext(file_name)[0] for file_name in file_names])
n_labels = len(unique_labels)

idx2label = {idx:label for idx, label in enumerate(unique_labels)}
label2idx = {label:idx for idx, label in idx2label.items()}

In [8]:
def replace(name, chars, target):
    for char in chars:
        name = name.replace(char, target)
    return name

In [9]:
X_names = []
Y_labels = []

for file_name in file_names:
    with open(os.path.join(root_dir, file_name), "rt", encoding='utf-8') as f:
        for line in f:
            name = line.strip().lower()
            name = unidecode(name)

            if name == 'to the first page':
                continue

            name = replace(name, [",", '1', "/b", ":", "\xa0"], '')
            name = replace(name, ['-'], ' ')

            X_names.append(name)
            Y_labels.append(os.path.splitext(file_name)[0])

1. Remove "To The First Page" names from dataset
2. Replace ",", '1', "/B", ":", \xa0 with empty string
3. Replace '-' with ' '
4. Convert all the following from unicode to ascii:
[ 'ß',
 'à',
 'á',
 'ã',
 'ä',
 'ç',
 'è',
 'é',
 'ê',
 'ì',
 'í',
 'ñ',
 'ò',
 'ó',
 'õ',
 'ö',
 'ù',
 'ú',
 'ü',
 'ą',
 'ł',
 'ń',
 'ś',
 'ż']

In [10]:
pad_token = '.'
pad_token_id = 0

unique_chars = [pad_token] + sorted(set(''.join(X_names)))
idx2char = {idx:char for idx, char in enumerate(unique_chars)}
char2idx = {char:idx for idx, char in idx2char.items()}

def encode(name: str) -> list[int]:
    return [char2idx[char] for char in name]

def decode(ids: list[int]) -> str:
    return ''.join(idx2char[i] for i in ids)

In [11]:
Y = [label2idx[label] for label in Y_labels]
X = [encode(name) for name in X_names]

1. Split data into train and test
2. NamesDataset for both train and test
3. Data Loader for both train and test with custom `collate` function

In [12]:
# 1. Split data into train and test
from sklearn.model_selection import train_test_split

X_train, Y_train, X_test, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=4, stratify=Y
)

In [51]:
# 2. NamesDataset for both train and test

import torch
from torch.utils.data import Dataset, DataLoader

class NamesDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {
            "input_id": torch.tensor(self.X[idx]),
            "label": torch.tensor(self.Y[idx])
        }

train_dataset = NamesDataset(X_train, Y_train)
test_dataset = NamesDataset(X_test, Y_test)

In [52]:
# 3. Data Loader for both train and test with custom `collate` function
def collate(batch):
    print("working...")
    input_ids = [item['input_id'] for item in batch]
    labels = torch.stack([item['label'] for item in batch])

    max_len = max([len(input) for input in input_ids])
    print("Max len:", max_len)

    # Pad using pad_sequence function
    # input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)

    # Pad manually
    input_ids_padded = []
    max_len = max(len(row) for row in batch)
    for row in batch:
        if len(row) < max_len:
          row += [pad_token_id]*(max_len-len(row))
          input_ids_padded.append(row)

        return {
            'input_ids': input_ids_padded,
            'labels': labels
        }

batch_size = 4

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate)


{'input_id': tensor([ 3,  4,  4, 20,  3,  5,  5, 11,  3,  4,  7, 16, 11]), 'label': tensor([ 4, 20,  3, 16,  6, 20, 11,  5, 13])}
{'input_id': tensor([21, 22, 17, 20, 20]), 'label': tensor([ 5,  3, 15, 18, 17])}
{'input_id': tensor([ 5, 20, 17, 25,  7]), 'label': tensor([21,  3, 15,  3, 10,  3])}
{'input_id': tensor([ 9, 11, 21, 15, 17, 16,  6, 11]), 'label': tensor([22, 17, 23, 15,  3])}
{'input_id': tensor([27,  3, 15,  4, 23, 20,  9]), 'label': tensor([10,  3, 14,  8, 17, 20,  6])}
{'input_id': tensor([21, 11, 15, 17, 16, 11, 21]), 'label': tensor([21, 11,  3,  9, 27, 17])}
{'input_id': tensor([ 3, 21,  8, 17, 23, 20]), 'label': tensor([10, 17, 20, 13, 11, 16])}
{'input_id': tensor([27,  3, 16,  3, 14, 17, 24]), 'label': tensor([ 5, 10,  3, 15, 23, 21, 10,  7, 24])}
{'input_id': tensor([ 5, 10,  7, 21, 10, 11, 20,  7]), 'label': tensor([21, 10, 11,  9,  3])}
{'input_id': tensor([ 6, 27,  3, 13, 17, 24]), 'label': tensor([21,  3, 20, 20,  3,  8])}
{'input_id': tensor([15,  3, 22, 21,

In [40]:
train_dataloader.batch_size

4

In [19]:
def collate_fn(x):
    x_padded = []
    max_len = max([len(r) for r in x])
    for row in x:
      x_padded.append(row + (max_len - len(row))*[0])

    print(x_padded)


In [20]:
x = (
    [5, 23, 16, 16, 11, 16, 9, 10, 3, 15],
    [3, 9, 3, 18, 17, 8, 8],
    [15, 17, 6, 28, 3, 14, 7, 24, 21, 13, 27],
    [27, 23, 10, 15, 3]
)
print(collate_fn(x))

[[5, 23, 16, 16, 11, 16, 9, 10, 3, 15, 0], [3, 9, 3, 18, 17, 8, 8, 0, 0, 0, 0], [15, 17, 6, 28, 3, 14, 7, 24, 21, 13, 27], [27, 23, 10, 15, 3, 0, 0, 0, 0, 0, 0]]
None
