In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read in the download loop.
CHUNK_SIZE = 40960
# Comma-separated entries of the form `<directory>:<percent-encoded download URL>`;
# the download loop splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL carries a signed query string (X-Goog-Date / X-Goog-Expires=259200),
# so it presumably stops working once the signature expires — confirm before re-running.
DATA_SOURCE_MAPPING = 'tense-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3846929%2F6692061%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240415%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240415T125043Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D85508354677c701d10fa42644c865eeeb67c80ddb35a8757c9740509721fa829ab4a060e09a84c825571117724038d4d9ffe977de1aef4d039b451a747c75e278da0c2c2fb4b2a0aeff486047e57ea60f478862803a208b10460ec392046ac84eccb6754d32231a8927ac0803e630a13fb819e6fe2864067c812034ea054070c1a6a582a138a5a250bafc4272d217eb53b9863045f5990836bea485248cef878bc4ff717b6d6fb0b367a50751924e4eb91bd50d1ec66fec2c6433f2c0f25329a7b509b21a8617d8389c9315001b99a7b5332978588a3d268d9d3e70cd8422b5bbba411e8919f80cfff7430a73df38ac07a82fb7fe75fd2b8682f660db979e748'

# Directories created by this cell: downloads are unpacked under the input path.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every `<directory>:<url>` entry of DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A failed entry is reported and the loop moves on.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon later in the entry cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the download still works, the
            # progress bar just stays empty instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            chunk = fileres.read(CHUNK_SIZE)
            while chunk:
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # Flush buffered bytes to disk: the tar branch reopens the file by name and
            # would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the archive: binding it to `tarfile` would
                # overwrite the imported module and break every later tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading tense-dataset, 29970 bytes compressed
Downloaded and uncompressed: tense-dataset
Data source import complete.


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from torch.utils.data import DataLoader, TensorDataset

# BERT Pre-trained Model

In [3]:
# Load the tokenizer and the encoder from the same pretrained checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Read the latin-1 encoded tense CSV and name its columns `sentence` and `tense`.
data = pd.read_csv(
    "/kaggle/input/tense-dataset/tense.csv",
    encoding='latin-1',
).set_axis(["sentence", "tense"], axis=1)
data

Unnamed: 0,sentence,tense
0,I am eating breakfast,present
1,She will go to the park,future
2,They played soccer yesterday,past
3,I will be going to the concert,future
4,She is eating lunch now,present
...,...,...
3111,The kids aren't playing in the yard,present continuous
3112,They weren't talking on the phone when I called,past continuous
3113,I won't go to the store after work,future
3114,She doesn't study French every evening,present


# Preprocessing Data

In [5]:
# Map each tense name to its integer class id, i.e. its position in the tuple below.
tense_labels = {
    tense_name: class_id
    for class_id, tense_name in enumerate((
        'present',
        'future',
        'past',
        'present perfect continuous',
        'future perfect',
        'past perfect',
        'future continuous',
        'past perfect continuous',
        'present continuous',
        'past continuous',
        'future perfect continuous',
        'present perfect',
    ))
}

In [6]:
# Report every raw label with no entry in tense_labels (one message per offending row).
unknown_items = (item for item in data["tense"] if item not in tense_labels)
for item in unknown_items:
    print(f"Tense value '{item}' not found in tense_labels dictionary.")

In [7]:
def clean_and_map_tense(tense):
    """Normalise a raw tense label and return it only if it is a known tense.

    Args:
        tense: Raw value from the ``tense`` column. Anything that is not a string
            (NaN, ``None``, numbers) is treated as unrecognised.

    Returns:
        The label lower-cased with surrounding whitespace removed when that form is
        a key of the module-level ``tense_labels`` dict; otherwise ``None``.
    """
    # Guarding on `str` covers missing values and also keeps `.lower()` from raising
    # AttributeError on stray numeric cells.
    if not isinstance(tense, str):
        return None
    # Strip before matching so labels such as "Present " are kept, not silently dropped.
    normalised = tense.strip().lower()
    return normalised if normalised in tense_labels else None

# Normalise the tense column, then discard every row that contains a missing value
# (this includes the rows whose tense was not recognised and became None).
data = data.assign(tense=data["tense"].apply(clean_and_map_tense)).dropna()

data

Unnamed: 0,sentence,tense
0,I am eating breakfast,present
1,She will go to the park,future
2,They played soccer yesterday,past
3,I will be going to the concert,future
4,She is eating lunch now,present
...,...,...
3111,The kids aren't playing in the yard,present continuous
3112,They weren't talking on the phone when I called,past continuous
3113,I won't go to the store after work,future
3114,She doesn't study French every evening,present


In [9]:
# Sanity check: every tense left in the data must be a key of tense_labels.
# The values collected here are by construction NOT keys of the dict, so deleting them
# from tense_labels could only ever raise KeyError; fail with a clear message instead.
unrecognized_tenses = sorted(
    {item for item in data["tense"] if item not in tense_labels},
    key=str,
)
if unrecognized_tenses:
    raise ValueError(f"Tense values without a label id: {unrecognized_tenses}")

In [10]:
# Count the missing values per column and print the counts under a header line.
missing_values = data.isna().sum()
print("Nan Data:", missing_values, sep="\n")

Nan Data:
sentence    0
tense       0
dtype: int64


# Train data split

In [11]:
def prepare_input_data(data):
    """Tokenize the ``sentence`` column of ``data`` with padding and truncation.

    Returns the tokenizer output with ``return_tensors="pt"``.
    """
    sentences = data["sentence"].tolist()
    return tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")


# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
# Tokenize each split, then turn its tense names into a tensor of class ids.
train_inputs, test_inputs = (prepare_input_data(split) for split in (train_data, test_data))

train_labels, test_labels = (
    torch.tensor([tense_labels[tense] for tense in split["tense"]])
    for split in (train_data, test_data)
)


# Define NN model

- **Model architecture**

![image.png](attachment:55688b0d-8c41-46b6-b3f4-923725bff542.png)

In [13]:
class TenseClassifier(nn.Module):
    """Linear classification head on top of an encoder's first-token hidden state."""

    def __init__(self, bert_model, num_classes):
        """Store the encoder and build the two ReLU layers and the output projection.

        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning an object
                with a ``last_hidden_state`` attribute when called.
            num_classes: Number of logits produced per input sequence.
        """
        super().__init__()
        self.bert = bert_model
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of class logits per input sequence."""
        encoder_out = self.bert(input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence serves as the sentence representation.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        activated = self.relu2(self.relu1(first_token))
        return self.fc(activated)

# Size the output layer from the label map instead of repeating a literal 12, so the
# head always has a logit for the largest class id present in tense_labels.
num_classes = max(tense_labels.values()) + 1
model = TenseClassifier(bert_model, num_classes)

# Cross-entropy over the class logits; Adam is given every parameter of `model`
# (the encoder is a submodule, so its weights are included) with lr=2e-5.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [14]:
# Mini-batch size used by the training DataLoader.
batch_size = 32

# Bundle token ids, attention masks and labels row-wise; batches are drawn shuffled.
train_features = (train_inputs[key] for key in ('input_ids', 'attention_mask'))
train_dataset = TensorDataset(*train_features, train_labels)
train_data_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Train model

In [15]:
# Train for a fixed number of epochs and report the mean batch loss after each one.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_data_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(input_ids, attention_mask), labels)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / len(train_data_loader)}')

Epoch 1/10, Average Loss: 1.3814015208910673
Epoch 2/10, Average Loss: 0.31155478209257126
Epoch 3/10, Average Loss: 0.16615669114085344
Epoch 4/10, Average Loss: 0.11926886563499768
Epoch 5/10, Average Loss: 0.08816039972962478
Epoch 6/10, Average Loss: 0.07383102044845238
Epoch 7/10, Average Loss: 0.06926920202871163
Epoch 8/10, Average Loss: 0.06780576369223687
Epoch 9/10, Average Loss: 0.05270191809783379
Epoch 10/10, Average Loss: 0.048702311319991566


In [16]:
# Evaluation: score the whole test split in one forward pass with gradients disabled.
model.eval()
with torch.no_grad():
    logits = model(test_inputs['input_ids'], test_inputs['attention_mask'])
predicted_labels = logits.argmax(dim=1)
accuracy = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

Accuracy on test set: 96.79%


In [19]:
# Persist the trained weights (state_dict only) ...
model_weights = model.state_dict()
torch.save(model_weights, 'tense_classifier_model.pth')

# ... and the tokenizer files; the returned paths are shown as the cell output.
tokenizer.save_pretrained('path_to_save_tokenizer')

('path_to_save_tokenizer/tokenizer_config.json',
 'path_to_save_tokenizer/special_tokens_map.json',
 'path_to_save_tokenizer/vocab.txt',
 'path_to_save_tokenizer/added_tokens.json')

In [21]:
def predict_tense(sentence, model, tokenizer, tense_labels):
    """Predict the tense name of ``sentence``.

    Args:
        sentence: Text to classify.
        model: ``nn.Module`` called as ``model(input_ids, attention_mask)`` and
            returning logits with one row per input.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        tense_labels: Dict mapping tense name to integer class id.

    Returns:
        The first tense name in ``tense_labels`` whose id equals the arg-max logit.

    Raises:
        IndexError: If the predicted class id has no entry in ``tense_labels``.
    """
    encoded_sentence = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    # Run inference in eval mode so dropout and similar layers are disabled, then put
    # the model back in whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(encoded_sentence['input_ids'], encoded_sentence['attention_mask'])
            predicted_label = torch.argmax(logits, dim=1).item()
    finally:
        model.train(was_training)

    # Stop at the first matching name instead of materialising every match.
    for tense_name, class_id in tense_labels.items():
        if class_id == predicted_label:
            return tense_name
    raise IndexError(f"Predicted class id {predicted_label} has no entry in tense_labels")

# Try the classifier on one hand-written sentence and print the predicted tense.
sentence_to_predict = """
 I had walked to school yesterday..
"""
predicted_tense = predict_tense(
    sentence=sentence_to_predict,
    model=model,
    tokenizer=tokenizer,
    tense_labels=tense_labels,
)
print(f"The predicted tense for the sentence is: {predicted_tense}")

The predicted tense for the sentence is: past perfect
