# ADSP 32017 -  Assignment 3

### Savita K. Gupta
### 23 October 2023

## **Question 1**

### **1.0 Import/Examine Data**

In [1]:
from datasets import load_dataset
import pandas as pd

language = load_dataset("papluca/language-identification")

In [2]:
language

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [3]:
train_ds = language["train"]
train_ds

Dataset({
    features: ['labels', 'text'],
    num_rows: 70000
})

In [4]:
len(train_ds)

70000

In [5]:
train_ds.column_names

['labels', 'text']

### **1.1 Create Label IDs (ints) for labels**

Use Label Encoder to create integer IDs for each language label.
Why: The "labels" for the current dataset are strings. 

**"Train" LabelIDs**

In [6]:
langLabels1 = language["train"]["labels"]

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(langLabels1)

labelIDs1 = le.transform(langLabels1)

print(labelIDs1)

[12  1 19 ... 17  5  7]


In [8]:
new_column1 = labelIDs1
language['train'] = language['train'].add_column("labelID", new_column1)

print(language["train"].features)
print(language['train'][:2])

{'labels': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labelID': Value(dtype='int64', id=None)}
{'labels': ['pt', 'bg'], 'text': ['os chefes de defesa da estónia, letónia, lituânia, alemanha, itália, espanha e eslováquia assinarão o acordo para fornecer pessoal e financiamento para o centro.', 'размерът на хоризонталната мрежа може да бъде по реда на няколко километра ( km ) за на симулация до около 100 km за на симулация .'], 'labelID': [12, 1]}


**"Validation" LabelIDs**

In [9]:
langLabels2 = language["validation"]["labels"]

le.fit(langLabels2)
labelIDs2 = le.transform(langLabels2)

print(labelIDs2)

[10 10  5 ... 18  1 11]


In [10]:
new_column2 = labelIDs2
language['validation'] = language['validation'].add_column("labelID", new_column2)

print(language["validation"].features)
print(language['validation'][:2])

{'labels': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labelID': Value(dtype='int64', id=None)}
{'labels': ['nl', 'nl'], 'text': ['"Ik ken geen druk," zei Mr. Feith, de ondersecretaris van defensie voor beleid.', 'Hier is mijn advies op basis van mijn persoonlijke ervaring met het afkolven van moedermelk voor tweelingjongens voor een heel jaar.'], 'labelID': [10, 10]}


**"Test" LabelIDs**

In [11]:
langLabels3 = language["test"]["labels"]

le.fit(langLabels3)
labelIDs3 = le.transform(langLabels3)

print(labelIDs3)

[10 10  5 ... 18  1 11]


In [12]:
new_column3 = labelIDs3
language['test'] = language['test'].add_column("labelID", new_column3)

print(language['test'].features)
print(language['test'][:2])

{'labels': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labelID': Value(dtype='int64', id=None)}
{'labels': ['nl', 'nl'], 'text': ['Een man zingt en speelt gitaar.', 'De technologisch geplaatste Nasdaq Composite Index .IXIC daalde met 25,36 punten, of 1,53 procent, tot 1.628,26.'], 'labelID': [10, 10]}


### **1.2 Tokenization**

**Setup**

In [13]:
import torch
import torch.nn.functional as F

from transformers import AutoTokenizer

model_lang = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_lang)

In [14]:
tokenizer.vocab_size

250002

In [15]:
tokenizer.model_max_length

512

In [16]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [34]:
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))

data = sorted(tokens2ids, key=lambda x : x[-1])
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
df.T

Unnamed: 0,0,1,2,3,4
Special Token,<s>,<pad>,</s>,<unk>,<mask>
Special Token ID,0,1,2,3,250001


**Tokenization**

In [17]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [19]:
lang_encoded = language.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

In [20]:
print(lang_encoded["train"].column_names)

['labels', 'text', 'labelID', 'input_ids', 'attention_mask']


### **1.3 Encoder Model**

**Import Model**

In [21]:
from transformers import AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_lang).to(device)

**My Processing Function:**

In [26]:
def extract_hidden_states(batch):
    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

**Set Format to Tensor**

In [27]:
lang_encoded.set_format("torch", 
                            columns=["labelID","input_ids","attention_mask"])

### **1.4 Extract Hidden States**

In [28]:
torch.cuda.empty_cache() 

In [29]:
lang_hidden = lang_encoded.map(extract_hidden_states, batched=True, batch_size=100)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [31]:
lang_hidden["train"].column_names

['labels', 'text', 'labelID', 'input_ids', 'attention_mask', 'hidden_state']

### **1.5 Train Text Classifier (Simple)**

In [30]:
import numpy as np

X_train = np.array(lang_hidden["train"]["hidden_state"])
X_valid = np.array(lang_hidden["validation"]["hidden_state"])
y_train = np.array(lang_hidden["train"]["labelID"])
y_valid = np.array(lang_hidden["validation"]["labelID"])
X_train.shape, X_valid.shape

((70000, 768), (10000, 768))

In [32]:
# We increase `max_iter` to guarantee convergence 
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Q1 Performance**

In [33]:
lr_clf.score(X_valid, y_valid)

0.988

## **Question 2**

Instead of using the [CLS] token to represent the whole sentence, I am going to try and use the **last token** in the embedding to represent the sentence.

### **2.1 Tokenization**

In [35]:
#Fresh tokenization, unformated.

lang2_encoded = language.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [36]:
print(lang2_encoded["train"].column_names)

['labels', 'text', 'labelID', 'input_ids', 'attention_mask']


### **2.2 Encoder Model**

In [37]:
def extract_hidden_states2(batch):
    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for last token
    return {"hidden_state": last_hidden_state[:,-1].cpu().numpy()}

In [39]:
lang2_encoded.set_format("torch", 
                            columns=["labelID","input_ids","attention_mask"])

### **2.3 Extract Hidden States**

In [40]:
torch.cuda.empty_cache() 

In [41]:
lang2_hidden = lang2_encoded.map(extract_hidden_states2, batched=True, batch_size=100)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [42]:
lang_hidden["train"].column_names

['labels', 'text', 'labelID', 'input_ids', 'attention_mask', 'hidden_state']

### **2.4 Train Text Classifier (Simple)**

In [43]:
import numpy as np

X_train2 = np.array(lang2_hidden["train"]["hidden_state"])
X_valid2 = np.array(lang2_hidden["validation"]["hidden_state"])
y_train2 = np.array(lang2_hidden["train"]["labelID"])
y_valid2 = np.array(lang2_hidden["validation"]["labelID"])
X_train2.shape, X_valid2.shape

((70000, 768), (10000, 768))

In [44]:
# We increase `max_iter` to guarantee convergence 
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train2, y_train2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Q2 Performance**

In [47]:
lr_clf.score(X_valid2, y_valid2)

0.9902

#### **Results**

#### The Q2 classifier performed better than the Q1 classifier by 0.0022.