In [1]:
from docx import Document

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])
    return text


In [2]:
import pytesseract
from PIL import Image

def extract_text_from_image(file_path):
    image = Image.open(file_path)
    text = pytesseract.image_to_string(image)
    return text


In [3]:
from transformers import pipeline

def extract_entities(text):
    nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
    entities = nlp(text)
    return {entity['entity']: entity['value'] for entity in entities}


In [13]:
from datasets import load_dataset
from transformers import BertForTokenClassification, BertTokenizerFast, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset('csv', data_files=r"C:\Users\sanab\Downloads\assignments\assignments\assignment1\data\train.csv")

# Tokenize the dataset
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
dataset = dataset.map(lambda examples: tokenizer(examples['Aggrement Value'] + ' ' + examples['Aggrement Start Date'] + ' ' + examples['Aggrement End Date'], truncation=True, padding=True), batched=True)

# Define the model
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(dataset['train'].features['labels'].feature))

# Setup the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

TypeError: can only concatenate list (not "str") to list

In [16]:
from datasets import load_dataset
from transformers import BertForTokenClassification, BertTokenizerFast, DataCollatorForTokenClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset('csv', data_files=r"C:\Users\sanab\Downloads\assignments\assignments\assignment1\data\train.csv")

# Tokenize the dataset
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenized_datasets = dataset.map(
    lambda examples: tokenizer(examples['text'], truncation=True, padding=True),
    batched=True,
#)

# Ensure you have 'labels' in your dataset containing token-level labels
# If not, preprocess your dataset to include token-level labels

# Define the model
num_labels = 2  # Adjust based on your task
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Setup the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`