In [2]:
import pandas as pd

def load_data(file_path, delimiter=','):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data.append(line.strip().split(delimiter))
            except Exception as e:
                print(f"Error processing line: {line}. Error: {e}")
    return pd.DataFrame(data)

# Load dev and test data
dev_data = load_data('dev-0/in_1.csv', delimiter=',')
dev_labels = load_data('dev-0/expected.tsv', delimiter='\t')
test_data = load_data('test-A/in_1.csv', delimiter=',')


In [5]:
dev_data['combined'] = dev_data.iloc[:, 0] + ' [MASK] ' + dev_data.iloc[:, 1]
test_data['combined'] = test_data.iloc[:, 0] + ' [MASK] ' + test_data.iloc[:, 1]

In [7]:
from transformers import pipeline

# Initialize the fill-mask pipeline
fill_mask = pipeline('fill-mask', model='bert-base-uncased')

# Function to get the top predicted word with error handling
def get_prediction(text):
    try:
        return fill_mask(text)[0]['token_str']
    except Exception as e:
        print(f"Error processing text: {text}\nError message: {e}")
        return '[ERROR]'

# Apply the prediction function to dev and test data
dev_data['predicted_word'] = dev_data['combined'].apply(get_prediction)
test_data['predicted_word'] = test_data['combined'].apply(get_prediction)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


Error processing text: The Poor ye hare with yon Daring the discussion that has been going on for the past two years as to the propriety of introducing labor to such an extent that it will very materially reduce wages it seems as though we bad forgotten the There is a very large number of people in every community who must labor for wages They have not the for handling money only in very moderate sums and are content to earn a living through life and dying leave their families the heritage of a good name and moderate education There are less of this class in the Pacific States than any other part of the country but in old they form a majority of the Now for whose benefit is this Govern being run Are we in our to add to the wealth of those who are already rich enough to forget that the poor are with us that they are in the majority that it is this we should strive to raise up that the rich are too rich the too We were raised in the belief this was a seeking the greatest good of the grea

In [8]:
dev_data[['predicted_word']].to_csv('dev-0/out1.tsv', sep='\t', index=False, header=False)
test_data[['predicted_word']].to_csv('test-A/out1.tsv', sep='\t', index=False, header=False)
