<a href="https://colab.research.google.com/github/sennatitcomb/contact-deeplearning/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers pandas torch



In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from tqdm import tqdm

In [3]:
# Load the dataset
df = pd.read_csv("bigshuffle.csv")

# Assuming your labels are in the "Labels" column
labels = df["Labels"].values

# Tokenize the text data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Concatenate the columns into a single string for each row
text_data = df["First Name"] + " " + df["Last Name"] + " " + df["Email"] + " " + df["Phone"] + " " + df["Title"]

# Tokenize the text data
tokenized_inputs = tokenizer(
    text_data.tolist(),  # Convert to list of strings
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Create a PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.tokenized_inputs = tokenized_inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_inputs["input_ids"][idx],
            "attention_mask": self.tokenized_inputs["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx]),
        }

dataset = CustomDataset(tokenized_inputs, labels)

# Split the dataset into training and testing sets
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

In [4]:
# Model configuration
model_config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=model_config)

# Training parameters
batch_size = 8
epochs = 3
learning_rate = 2e-5

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1} Training"):
        inputs = {key: value.to(device) for key, value in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

# Evaluation loop
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        inputs = {key: value.to(device) for key, value in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)
        outputs = model(**inputs, labels=labels)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy on the test set: {accuracy:.2%}")

# Save the model
model.save_pretrained("your_model_directory")

# Save the tokenizer
tokenizer.save_pretrained("your_model_directory")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 62/62 [05:58<00:00,  5.78s/it]
Epoch 2 Training: 100%|██████████| 62/62 [05:46<00:00,  5.58s/it]
Epoch 3 Training: 100%|██████████| 62/62 [05:37<00:00,  5.44s/it]
Testing: 100%|██████████| 16/16 [00:23<00:00,  1.44s/it]


Accuracy on the test set: 100.00%


('your_model_directory/tokenizer_config.json',
 'your_model_directory/special_tokens_map.json',
 'your_model_directory/vocab.txt',
 'your_model_directory/added_tokens.json')

In [5]:
# Load the model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained("your_model_directory")
loaded_tokenizer = BertTokenizer.from_pretrained("your_model_directory")

# Example of using the model for predictions on new data
new_data = pd.DataFrame({
    "First Name": ["John"],
    "Last Name": ["Doe"],
    "Email": ["john.doe@example.com"],
    "Phone": ["123-456-7890"],
    "Title": ["Data Scientist"]
})

# Concatenate the columns into a single string for each row in the new data
new_text_data = new_data["First Name"] + " " + new_data["Last Name"] + " " + new_data["Email"] + " " + new_data["Phone"] + " " + new_data["Title"]

# Tokenize the new data
tokenized_new_data = loaded_tokenizer(
    new_text_data.tolist(),  # Convert to list of strings
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Make predictions
with torch.no_grad():
    inputs = {key: value.to(device) for key, value in tokenized_new_data.items()}
    outputs = loaded_model(**inputs)
    _, predicted_label = torch.max(outputs.logits, dim=1)


print(f"Predicted Label: {predicted_label.item()}")


Predicted Label: 0


In [6]:
# Load the model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained("your_model_directory")
loaded_tokenizer = BertTokenizer.from_pretrained("your_model_directory")

# Example of using the model for predictions on new data
new_data = pd.DataFrame({
    "First Name": ["@@&839@+7&"],
    "Last Name": ["@@&839@+7&"],
    "Email": ["@@&839@+7&"],
    "Phone": ["@@&839@+7&"],
    "Title": ["@@&839@+7&"]
})

# Tokenize each column separately
tokenized_first_name = loaded_tokenizer(new_data["First Name"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
tokenized_last_name = loaded_tokenizer(new_data["Last Name"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
tokenized_email = loaded_tokenizer(new_data["Email"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
tokenized_phone = loaded_tokenizer(new_data["Phone"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
tokenized_title = loaded_tokenizer(new_data["Title"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

# Make predictions
with torch.no_grad():
    inputs = {
        "input_ids": tokenized_first_name["input_ids"].to(device),
        "attention_mask": tokenized_first_name["attention_mask"].to(device),
        # Add other tokenized tensors similarly
    }
    outputs = loaded_model(**inputs)
    _, predicted_label = torch.max(outputs.logits, dim=1)

print(f"Predicted Label: {predicted_label.item()}")


Predicted Label: 1


In [7]:
import pandas as pd

# Load the model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained("your_model_directory")
loaded_tokenizer = BertTokenizer.from_pretrained("your_model_directory")

# Function to process and predict labels for a CSV file
def process_csv_file(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Tokenize each column separately
    tokenized_first_name = loaded_tokenizer(df["First Name"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    tokenized_last_name = loaded_tokenizer(df["Last Name"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    tokenized_email = loaded_tokenizer(df["Email"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    tokenized_phone = loaded_tokenizer(df["Phone"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    tokenized_title = loaded_tokenizer(df["Title"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Make predictions
    with torch.no_grad():
        inputs = {
            "input_ids": tokenized_first_name["input_ids"].to(device),
            "attention_mask": tokenized_first_name["attention_mask"].to(device),
            # Add other tokenized tensors similarly
        }
        outputs = loaded_model(**inputs)
        _, predicted_labels = torch.max(outputs.logits, dim=1)

    # Display results
    df["Predicted Label"] = predicted_labels.cpu().numpy()
    print(df)

# Example usage
csv_file_path = "testdata.csv"
process_csv_file(csv_file_path)

    First Name   Last Name                          Email               Title  \
0         Axel        Howe        axel.howe@bilearner.com  Area Sales Manager   
1       Milton        Wall      milton.wall@bilearner.com  Area Sales Manager   
2         Cory    Robinson    cory.robinson@bilearner.com  Area Sales Manager   
3       Saniya          Yu        saniya.yu@bilearner.com  Area Sales Manager   
4        Alisa       James      alisa.james@bilearner.com  Area Sales Manager   
5      Lincoln     Compton  lincoln.compton@bilearner.com  Area Sales Manager   
6       Aliana       Nolan     aliana.nolan@bilearner.com  Area Sales Manager   
7         Axel        Howe        axel.howe@bilearner.com  Area Sales Manager   
8       Milton        Wall      milton.wall@bilearner.com  Area Sales Manager   
9         Cory    Robinson    cory.robinson@bilearner.com  Area Sales Manager   
10      Saniya          Yu        saniya.yu@bilearner.com  Area Sales Manager   
11       Alisa       James  