### Import modules

In [1]:
from transformers import RobertaTokenizerFast, AutoModelForSequenceClassification
import numpy as np 
import pandas as pd
import torch

### Load tokenizer

In [2]:
# Directory holding the saved tokenizer files for the fine-tuned model
TOKENIZER_DIR = "./Commonlit-RoBERTa-Base/tokenizer"
# Load the fast RoBERTa tokenizer from that local directory
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_DIR)

file ./Commonlit-RoBERTa-Base/tokenizer\config.json not found


### Load model

In [3]:
# Directory holding the saved sequence-classification checkpoint
MODEL_DIR = "./Commonlit-RoBERTa-Base"
# Load the model weights and config from that local directory
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

### Display sample data from excerpts CSV

In [4]:
# Path of the excerpt CSV to preview; edit this to inspect a different file
sample_csv_path = './Gutenberg-Excerpts/gutenberg-excerpts-3.csv'
gutenberg_df = pd.read_csv(sample_csv_path)
# Bare last expression so the notebook renders the frame as a rich table
gutenberg_df

Unnamed: 0,book,author,url,excerpt
0,Pride and Prejudice,Jane Austen,https://www.gutenberg.org/files/1342/1342-h/13...,"Mr. Gardiner, whose manners were very easy and..."
1,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,https://www.gutenberg.org/files/84/84-h/84-h.htm,"Henry wished to dissuade me, but seeing me ben..."
2,Alice's Adventures in Wonderland,Lewis Carroll,https://www.gutenberg.org/files/11/11-h/11-h.htm,"“I thought it would,” said the Cat, and vanish..."
3,The Adventures of Sherlock Holmes,Arthur Conan Doyle,https://www.gutenberg.org/files/1661/1661-h/16...,“‘Do you desire your name to be kept upon the ...
4,"Moby Dick; Or, The Whale",Herman Melville,https://www.gutenberg.org/files/2701/2701-h/27...,"“Stand not by me, but stand under me, whoever ..."
...,...,...,...,...
95,Persuasion,Jane Austen,https://www.gutenberg.org/files/105/105-h/105-...,"""I never want them, I assure you. They talk an..."
96,Complete Original Short Stories of Guy De Maup...,Guy de Maupassant,https://www.gutenberg.org/files/3090/3090-h/30...,"“If you do not obey, I shall smash the lock. I..."
97,The Jungle,Upton Sinclair,https://www.gutenberg.org/files/140/140-h/140-...,"“Yes, but this don’t wash.” “What is it?” “Fer..."
98,The Elements of Style,William Strunk,https://www.gutenberg.org/files/37134/37134-h/...,The same is true of colloquialisms and slang. ...


### Compute predictions based on a range of excerpt CSV files and save predictions as CSV

In [6]:
# Maximum length for token sequence: longer excerpts are truncated and
# shorter ones are padded up to this many tokens.
MAX_LENGTH = 256

# Put the model in evaluation mode explicitly so the predictions do not depend
# on whatever mode the model object was left in by earlier (re-)executed cells.
model.eval()

# Iterate through each excerpt file (gutenberg-excerpts-1.csv .. -10.csv)
for i in range(1, 11):
    # Load excerpt file into DataFrame
    gutenberg_df = pd.read_csv('./Gutenberg-Excerpts/gutenberg-excerpts-' + str(i) + '.csv')
    # Initialise predictions list
    preds_list = []

    # Inference only: with autograd disabled no computation graph is built for
    # the forward passes, which saves memory and time without changing outputs.
    with torch.no_grad():
        # Score the excerpts one at a time
        for excerpt in gutenberg_df['excerpt'].tolist():
            token_seq = tokenizer(excerpt, padding="max_length", max_length=MAX_LENGTH, truncation=True, return_tensors="pt")
            # The batch holds a single excerpt, so take row 0 of the logits;
            # .item() requires that row to contain exactly one value.
            preds = model(**token_seq).logits[0].item()
            preds_list.append(preds)

    # Create dataframe from the predictions in order to convert to CSV
    gutenberg_preds = pd.DataFrame({'book': gutenberg_df['book'],
                                    'author': gutenberg_df['author'],
                                    'url': gutenberg_df['url'],
                                    'excerpt': gutenberg_df['excerpt'],
                                    'target': preds_list})

    # Save predictions as CSV (utf-8-sig writes a BOM at the start of the file)
    gutenberg_preds.to_csv('./Gutenberg-Predictions/gutenberg-predictions-' + str(i) + '.csv', index=False, encoding='utf-8-sig')