In [2]:
from transformers import BartForSequenceClassification, BartTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load BART model and tokenizer
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

In [4]:
import pandas as pd

df = pd.read_csv("sample_docs.csv")
df =df[["document","category"]]
df.head()

Unnamed: 0,document,category
0,troubled marsh under sec scrutiny the us stock...,paydown
1,us insurer marsh cuts 2 500 jobs up to 2 500 j...,paydown
2,japan bank shares up on link talk shares of su...,paydown_intrest
3,ge sees excellent world economy us behemoth ...,paydown
4,news corp eyes video games market news corp t...,paydown_intrest


In [5]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])

df.head()

Unnamed: 0,document,category
0,troubled marsh under sec scrutiny the us stock...,0
1,us insurer marsh cuts 2 500 jobs up to 2 500 j...,0
2,japan bank shares up on link talk shares of su...,1
3,ge sees excellent world economy us behemoth ...,0
4,news corp eyes video games market news corp t...,1


In [6]:
# Create a dictionary to map encoded numerical values to categories
encoded_to_category = {index: category for index, category in enumerate(label_encoder.classes_)}

print(encoded_to_category)

{0: 'paydown', 1: 'paydown_intrest', 2: 'paydown_only'}


In [7]:
X = df["document"].to_list()
y = df["category"].to_list()

In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Prepare your dataset
# Assuming you have your data in X (text) and y (labels) format
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
num_labels = len(set(y_train))

# Prepare your dataset
# Assuming you have your data in X (text) and y (labels) format
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
num_labels = len(set(y_train))


In [9]:
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")

# Tokenize and encode your training and validation data
train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)

In [10]:
# Convert the tokenized encodings into PyTorch tensors
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(y_train)
)
val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(y_val)
)

In [11]:
# Define training parameters
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Create DataLoader for training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()




In [None]:
# Training loop
for epoch in range(epochs):
    # Training
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_preds = []
    val_labels = []
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        val_preds.extend(logits.argmax(dim=1).cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

    # Calculate validation accuracy
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_accuracy:.4f}")



In [None]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")


# Type2

In [None]:


import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from datasets import Dataset
from torch.utils.data import DataLoader  # Import DataLoader from PyTorch

# Load dataset from CSV
df = pd.read_csv("sample_docs.csv")  # Adjust the file path accordingly

# Convert DataFrame to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Define batch size and number of epochs
batch_size = 8
num_epochs = 4

# Load zero-shot classification pipeline
classifier = pipeline("zero-shot-classification")


labels = dataset.unique("category")

# Fine-tune zero-shot model
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define data loader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs = tokenizer(batch["document"], padding=True, truncation=True, return_tensors="pt")
        labels_ids = torch.tensor([labels.index(label) for label in batch["category"]])  # Convert labels to indices
        outputs = model(**inputs, labels=labels_ids)  # Use label indices instead of tokenized labels
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Fine-tuned model
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)



No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
# Example usage
text = "sample text"
candidate_labels = labels
result = classifier(text, candidate_labels)

print(result)