### Import modules

In [3]:
from transformers import RobertaTokenizerFast, AutoModelForSequenceClassification
import numpy as np 
import pandas as pd
import torch

### Load tokenizer

In [4]:
# The tokenizer files sit in a "tokenizer" subfolder of the local model directory
tokenizer = RobertaTokenizerFast.from_pretrained(
    "./Commonlit-RoBERTa-Base/tokenizer"
)

file ./Commonlit-RoBERTa-Base/tokenizer\config.json not found


### Load model

In [5]:
# Load the sequence-classification model from the local model directory
model = AutoModelForSequenceClassification.from_pretrained(
    "./Commonlit-RoBERTa-Base"
)

### Display sample data from excerpts CSV

In [6]:
# Preview one excerpt file; edit the number in the file name to inspect a different CSV
gutenberg_df = pd.read_csv(
    filepath_or_buffer='./Gutenberg-Excerpts/gutenberg-excerpts-3.csv'
)
gutenberg_df

Unnamed: 0,book,author,url,excerpt
0,Pride and Prejudice,Jane Austen,https://www.gutenberg.org/files/1342/1342-h/13...,"“And yours,” he replied with a smile, “is will..."
1,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,https://www.gutenberg.org/files/84/84-h/84-h.htm,My present situation was one in which all volu...
2,Alice's Adventures in Wonderland,Lewis Carroll,https://www.gutenberg.org/files/11/11-h/11-h.htm,“One side of what? The other side of what?” th...
3,The Adventures of Sherlock Holmes,Arthur Conan Doyle,https://www.gutenberg.org/files/1661/1661-h/16...,“And how have you succeeded?” “Well.” “You hav...
4,"Moby Dick; Or, The Whale",Herman Melville,https://www.gutenberg.org/files/2701/2701-h/27...,Now comes the Baling of the Case. But to compr...
...,...,...,...,...
95,Candide,Voltaire,https://www.gutenberg.org/files/19942/19942-h/...,"""Yet again!"" said Cunegonde, ""now there is no ..."
96,The Elements of Style,William Strunk,https://www.gutenberg.org/files/37134/37134-h/...,15. But though they had been victorious in the...
97,A Pickle for the Knowing Ones,Timothy Dexter,https://www.gutenberg.org/files/43453/43453-h/...,The follering peases are not my Riting but ver...
98,The History of the Peloponnesian War,Thucydides,https://www.gutenberg.org/ebooks/7142.html.images,The battle was fought at Tanagra in Boeotia. A...


### Compute predictions based on a range of excerpt CSV files and save predictions as CSV

In [7]:
# Maximum length for token sequence: longer excerpts are truncated to this
# many tokens and shorter ones are padded up to it
MAX_LENGTH = 256

# Iterate through each excerpt file (the end of the range is exclusive, so
# range(6, 7) processes only file 6; widen the range to process more files)
for i in range(6, 7):
    # Load excerpt file into DataFrame
    gutenberg_df = pd.read_csv('./Gutenberg-Excerpts/gutenberg-excerpts-' + str(i) + '.csv')
    # Convert excerpts to list
    excerpt_list = gutenberg_df['excerpt'].tolist()
    # Initialise predictions list
    preds_list = []

    # This is inference only, so disable autograd: without it every forward
    # pass records a computation graph that is never used, costing memory
    # and time for no benefit. The predicted values are unaffected.
    with torch.no_grad():
        # Go through each excerpt and compute its prediction
        for excerpt in excerpt_list:
            token_seq = tokenizer(excerpt, padding="max_length", max_length=MAX_LENGTH,
                                  truncation=True, return_tensors="pt")
            # logits[0] is the output row for the single excerpt in this batch;
            # .item() requires that row to hold exactly one value
            preds = model(**token_seq).logits[0].item()
            preds_list.append(preds)

    # Create dataframe from the predictions in order to convert to CSV
    gutenberg_preds = pd.DataFrame({'book': gutenberg_df['book'],
                                    'author': gutenberg_df['author'],
                                    'url': gutenberg_df['url'],
                                    'excerpt': gutenberg_df['excerpt'],
                                    'target': preds_list})

    # Save predictions as CSV (no index column; UTF-8 with BOM encoding)
    gutenberg_preds.to_csv('./Gutenberg-Predictions/gutenberg-predictions-' + str(i) + '.csv', index=False, encoding='utf-8-sig')