In [None]:
# Install the necessary libraries
!pip install transformers
!pip install torch




In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import required libraries
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the dataset
url = '/content/drive/MyDrive/callrecord/main-transcript.csv'  # Replace 'your_dataset_url.csv' with the URL to your CSV file
df = pd.read_csv(url)

# Display the first few rows of the dataset
df.head()

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4).to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and prepare data
def tokenize_data(texts, labels):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, labels)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Transcription'].values,
    df['Label'].values,
    test_size=0.2,
    random_state=42
)

train_dataset = tokenize_data(train_texts, train_labels)
test_dataset = tokenize_data(test_texts, test_labels)

# Create DataLoader for training and testing sets
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

epochs = 50  # Adjust as needed

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")
# Save the trained model and tokenizer
output_model_path = '/content/drive/MyDrive/callrecord/model'  # Replace with the desired output directory
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

print(f"Model and tokenizer saved to: {output_model_path}")
# Evaluate the model
model.eval()
all_labels, all_preds = [], []

for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    preds = torch.argmax(logits, dim=1).cpu().numpy()
    all_labels.extend(labels.cpu().numpy())
    all_preds.extend(preds)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50, Loss: 1.356176495552063
Epoch 2/50, Loss: 1.2868170675478483
Epoch 3/50, Loss: 1.1458303928375244
Epoch 4/50, Loss: 1.020047454457534
Epoch 5/50, Loss: 0.8361816782700388
Epoch 6/50, Loss: 0.6451117553209004
Epoch 7/50, Loss: 0.4901450245003951
Epoch 8/50, Loss: 0.362275472597072
Epoch 9/50, Loss: 0.2532831720615688
Epoch 10/50, Loss: 0.17825087552007876
Epoch 11/50, Loss: 0.12021908164024353
Epoch 12/50, Loss: 0.1141074800765828
Epoch 13/50, Loss: 0.07997856073473629
Epoch 14/50, Loss: 0.09937445858591482
Epoch 15/50, Loss: 0.05549342724445619
Epoch 16/50, Loss: 0.07370025443991549
Epoch 17/50, Loss: 0.07129222625180294
Epoch 18/50, Loss: 0.06125858166304074
Epoch 19/50, Loss: 0.056535481279225724
Epoch 20/50, Loss: 0.03364257798775246
Epoch 21/50, Loss: 0.03893943508400729
Epoch 22/50, Loss: 0.05100270826369524
Epoch 23/50, Loss: 0.02012464409007838
Epoch 24/50, Loss: 0.02169159924807517
Epoch 25/50, Loss: 0.03336249250232389
Epoch 26/50, Loss: 0.031553611747528375
Epoch 

In [2]:
import torch
from transformers import BertForSequenceClassification

# Load the trained model
model_path = "/content/drive/MyDrive/callrecord/model"
model = BertForSequenceClassification.from_pretrained(model_path)


In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example input text
input_text = "Hey, this is Greg. Hello? Yeah, and I'm Grayling with Roseville Hyundai. Can you hear me?."

# Tokenize the input text
tokens = tokenizer(input_text, return_tensors="pt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Make predictions
with torch.no_grad():
    outputs = model(**tokens)

# Extract the predicted class probabilities or labels
predicted_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
predicted_label = torch.argmax(predicted_probs, dim=-1).item()

print(f"Predicted Label: {predicted_label}")
print(f"Predicted Probabilities: {predicted_probs.tolist()}")


Predicted Label: 3
Predicted Probabilities: [[0.010302656330168247, 0.001422324450686574, 0.0015508470823988318, 0.9867241978645325]]


In [5]:
# Map predicted label to class/category
class_mapping = {0: "Class 0", 1: "Class 1", 2: "Class 2",3: "Class 3"}
predicted_class = class_mapping[predicted_label]

print(f"Predicted Class: {predicted_class}")


Predicted Class: Class 3
