In [None]:
# --- Notebook configuration ---------------------------------------------------
# Version tag of this configuration
VER = 14

# Directory holding the pre-computed targets / tokens / attention arrays
LOAD_TOKENS_FROM = '../input/tf-longformer-v12'

# Directory of the dataset that holds the trained model for this version
LOAD_MODEL_FROM = '../input/tflongformerv14'

# Directory the pretrained tokenizer, config.json and tf_model.h5 are read from
# (it is only read in this notebook, nothing is saved to it)
DOWNLOADED_MODEL_PATH = '../input/tf-longformer-v12'


### Import Libraries

In [2]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import *

In [3]:
# Turn on the graph optimizer's automatic mixed-precision option
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

# Load the tokenizer from the same directory the backbone config and weights
# are read from in build_model()
tokenizer = AutoTokenizer.from_pretrained(DOWNLOADED_MODEL_PATH)

In [4]:
# Sequence length every essay is padded / truncated to
MAX_LEN = 1024

# Load the three pre-computed arrays for this sequence length, in the order
# targets -> tokens -> attention
targets, train_tokens, train_attention = (
    np.load(f'{LOAD_TOKENS_FROM}/{stem}_{MAX_LEN}.npy')
    for stem in ('targets', 'tokens', 'attention')
)

print('Loaded NER tokens')

Loaded NER tokens


In [76]:
def build_model():
    """Build and compile the token-classification model.

    The pretrained backbone (config and weights read from
    ``DOWNLOADED_MODEL_PATH``) is topped with a 256-unit ReLU layer and a
    15-way softmax that is applied at every token position.

    Returns:
        A compiled ``tf.keras.Model`` that takes ``[tokens, attention]`` (two
        int32 inputs of shape ``(batch, MAX_LEN)``) and returns per-token
        class probabilities.
    """
    # Token ids and attention mask, both fixed to MAX_LEN positions
    tokens = tf.keras.layers.Input(shape=(MAX_LEN,), name='tokens', dtype=tf.int32)
    attention = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention', dtype=tf.int32)

    # Backbone configuration and pretrained weights
    config = AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH + '/config.json')
    backbone = TFAutoModel.from_pretrained(DOWNLOADED_MODEL_PATH + '/tf_model.h5', config=config)

    # x[0] is the first output of the backbone
    # (presumably the per-token hidden states -- confirm against the model class)
    x = backbone(tokens, attention_mask=attention)
    x = tf.keras.layers.Dense(256, activation='relu')(x[0])

    # The classification head is pinned to float32 while mixed precision is on
    x = tf.keras.layers.Dense(15, activation='softmax', dtype='float32')(x)

    model = tf.keras.Model(inputs=[tokens, attention], outputs=x)

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss=[tf.keras.losses.CategoricalCrossentropy()],
                  metrics=[tf.keras.metrics.CategoricalAccuracy()])

    return model

# Register the swish activation under the name "swish" in Keras' custom objects
tf.keras.utils.get_custom_objects()["swish"] = tf.keras.activations.swish
# Limit TensorFlow logging to ERROR-level messages
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [77]:
# Build the compiled network; the trained weights are loaded in the next cell
model = build_model()

In [78]:
# Load the trained weights. The path is built from the configuration constants
# (LOAD_MODEL_FROM, VER) instead of repeating a hard-coded absolute path, so
# changing the version or input directory only has to be done in one place.
model.load_weights(f'{LOAD_MODEL_FROM}/long_v{VER}.h5')

In [79]:
# Zero-filled model inputs for a single essay: two separate int32 arrays of
# shape (1, MAX_LEN), one for token ids and one for the attention mask
test_tokens, test_attention = (
    np.zeros((1, MAX_LEN), dtype=np.int32) for _ in range(2)
)


In [80]:
# Essay id: used as the `id` of every span predicted for the essay below
n = "0FB0700DAF44"
# The essay is supplied inline (next statement) rather than read from a file

# Essay text that is tokenized, segmented and classified by the rest of the notebook
text_to_write = '''While it may be true that the Mason City government ought to devote more money to riverside recreational facilities, this author's argument does not make a cogent case for increased resources based on river use. It is easy to understand why city residents would want a cleaner river, but this argument is rife with holes and assumptions, and thus, not strong enough to lead to increased funding.
Citing surveys of city residents, the author reports city resident's love of water sports. It is not clear, however, the scope and validity of that survey. For example, the survey could have asked residents if they prefer using the river for water sports or would like to see a hydroelectric dam built, which may have swayed residents toward river sports. The sample may not have been representative of city residents, asking only those residents who live upon the river. The survey may have been 10 pages long, with 2 questions dedicated to river sports. We just do not know. Unless the survey is fully representative, valid, and reliable, it can not be used to effectively back the author's argument.
Additionally, the author implies that residents do not use the river for swimming, boating, and fishing, despite their professed interest, because the water is polluted and smelly. While a polluted, smelly river would likely cut down on river sports, a concrete connection between the resident's lack of river use and the river's current state is not effectively made. Though there have been complaints, we do not know if there have been numerous complaints from a wide range of people, or perhaps from one or two individuals who made numerous complaints. To strengthen his/her argument, the author would benefit from implementing a normed survey asking a wide range of residents why they do not currently use the river.
Building upon the implication that residents do not use the river due to the quality of the river's water and the smell, the author suggests that a river clean up will result in increased river usage. If the river's water quality and smell result from problems which can be cleaned, this may be true. For example, if the decreased water quality and aroma is caused by pollution by factories along the river, this conceivably could be remedied. But if the quality and
aroma results from the natural mineral deposits in the water or surrounding rock, this may not be true. There are some bodies of water which emit a strong smell of sulphur due to the geography of the area. This is not something likely to be affected by a clean-up. Consequently, a river clean up may have no impact upon river usage. Regardless of whether the river's quality is able to be improved or not, the author does not effectively show a connection between water quality and river usage.
A clean, beautiful, safe river often adds to a city's property values, leads to increased tourism and revenue from those who come to take advantage of the river, and a better overall quality of life for residents. For these reasons, city government may decide to invest in improving riverside recreational facilities.
However, this author's argument is not likely significantly persuade the city goverment to allocate increased funding.
'''
# Encode the essay: pad / truncate to MAX_LEN and keep the character offsets of
# every token so tokens can be mapped back to words later
tokens = tokenizer.encode_plus(
    text_to_write,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_offsets_mapping=True,
)

# Copy token ids and attention mask into row 0 of the pre-allocated model inputs
test_tokens[0, :] = tokens['input_ids']
test_attention[0, :] = tokens['attention_mask']

In [81]:
# Run inference on the prepared essay (per-token class probabilities)
p = model.predict([test_tokens, test_attention], batch_size=16, verbose=2)
print('Test predictions shape:', p.shape)

# Most probable class index for every token position
test_preds = p.argmax(axis=-1)

# Display the predicted label ids
test_preds


1/1 - 28s
Test predictions shape: (1, 1024, 15)


array([[ 2,  3,  3, ..., 14, 14, 14]])

In [82]:
# Class index -> discourse type. Token labels are halved further down, so an
# even label 2k opens a span of type k and the odd label 2k + 1 continues it.
target_map_rev = {
    0: 'Lead', 1: 'Position', 2: 'Evidence', 3: 'Claim', 4: 'Concluding Statement',
    5: 'Counterclaim', 6: 'Rebuttal', 7: 'blank'
}

# Collected (essay id, discourse type, predictionstring) rows
all_predictions = []

txt = text_to_write
tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                               truncation=True, return_offsets_mapping=True)

# (start, end) character offsets of every token
off = tokens['offset_mapping']

# Character index at which each word starts. A word is a maximal run of
# characters that are none of these four separators.
word_separators = (' ', '\n', '\xa0', '\x85')
w = []
blank = True
for i, ch in enumerate(txt):
    if ch in word_separators:
        blank = True
    elif blank:
        w.append(i)
        blank = False

# Sentinel larger than any offset, so the look-ahead w[w_i + 1] below stops at
# the last real word
w.append(1e6)

# Word index of every token; tokens whose end offset is 0 are skipped and keep
# -1 (presumably special / padding tokens)
word_map = -1 * np.ones(MAX_LEN, dtype='int32')
w_i = 0
for i, (token_start, token_end) in enumerate(off):
    if token_end == 0:
        continue
    while token_start >= w[w_i + 1]:
        w_i += 1
    word_map[i] = w_i

# Halve the labels: integers mark the first token of a span, integer + 0.5 the
# tokens that continue it
pred = test_preds[0,] / 2.0

i = 0
while i < MAX_LEN:
    prediction = []
    start = pred[i]
    if start in [0, 1, 2, 3, 4, 5, 6, 7]:
        prediction.append(word_map[i])
        i += 1
        # Extend the span over all directly following continuation tokens
        while i < MAX_LEN and pred[i] == start + 0.5:
            if word_map[i] not in prediction:
                prediction.append(word_map[i])
            i += 1
    else:
        i += 1
    # Drop tokens that do not belong to any word
    prediction = [x for x in prediction if x != -1]
    # Only spans covering more than four words are kept
    if len(prediction) > 4:
        all_predictions.append((n, target_map_rev[int(start)], ' '.join(str(x) for x in prediction)))

# Passing the column names at construction keeps this working when no span was
# found (assigning `df.columns` on an empty frame raises a length mismatch).
df = pd.DataFrame(all_predictions, columns=['id', 'discourse_type', 'predictionstring'])


In [83]:
import re

# Character span (start, end) of every whitespace-delimited word of the essay,
# in the same order as txt.split(). Taking offsets from the real word positions
# keeps them correct when the text has leading or repeated whitespace, which
# the previous "join with single spaces and count" approach did not.
word_spans = [match.span() for match in re.finditer(r'\S+', txt)]
words = [txt[begin:end] for begin, end in word_spans]

discourse_start = []
discourse_end = []
discourse_text = []

for prediction_string in df['predictionstring']:
    # First and last word index of the predicted span
    word_ids = prediction_string.split(' ')
    text_start = int(word_ids[0])
    # Clamp like a slice would, so an index past the last word cannot fail
    text_end = min(int(word_ids[-1]), len(word_spans) - 1)

    # Start of the first word and (exclusive) end of the last word, in characters
    discourse_start.append(word_spans[text_start][0])
    discourse_end.append(word_spans[text_end][1])

    # Span text with its words joined by single spaces
    discourse_text.append(" ".join(words[text_start:text_end + 1]))

# Running number of each span within this DataFrame
discourse_id = list(range(len(df)))

# Adding the calculated values as new columns to the DataFrame
df['discourse_start'] = discourse_start
df['discourse_end'] = discourse_end
df['discourse_text'] = discourse_text
df['discourse_id'] = discourse_id
df

Unnamed: 0,id,discourse_type,predictionstring,discourse_start,discourse_end,discourse_text,discourse_id
0,0FB0700DAF44,Position,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,0,395,While it may be true that the Mason City gover...,0
1,0FB0700DAF44,Claim,67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 8...,396,551,"Citing surveys of city residents, the author r...",1
2,0FB0700DAF44,Claim,186 187 188 189 190 191 192 193 194 195 196 19...,1099,1279,"Additionally, the author implies that resident...",2
3,0FB0700DAF44,Evidence,213 214 215 216 217 218 219 220 221 222 223 22...,1280,1819,"While a polluted, smelly river would likely cu...",3
4,0FB0700DAF44,Claim,303 304 305 306 307 308 309 310 311 312 313 31...,1820,2020,Building upon the implication that residents d...,4
5,0FB0700DAF44,Evidence,339 340 341 342 343 344 345 346 347 348 349 35...,2021,2390,If the river's water quality and smell result ...,5
6,0FB0700DAF44,Evidence,403 404 405 406 407 408 409 410 411 412 413 41...,2391,2630,There are some bodies of water which emit a st...,6


# Classification

In [55]:
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
import datasets
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import log_loss
import torch.nn.functional as F

In [37]:
# Ignore Python warnings and disable log records at WARNING level and below
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [38]:
def score(preds):
    """Compute the log loss of a batch of predictions.

    Args:
        preds: Object exposing ``predictions`` (unnormalised class scores, one
            row per sample) and ``label_ids`` (the true labels); this function
            is passed to the ``Trainer`` as ``compute_metrics``.

    Returns:
        dict: ``{'log loss': <value returned by log_loss>}``.
    """
    # Softmax over the class axis. `dim` is passed explicitly because relying
    # on PyTorch's implicit dimension choice is deprecated and emits a warning.
    probabilities = F.softmax(torch.Tensor(preds.predictions), dim=-1)

    return {'log loss': log_loss(preds.label_ids, probabilities)}

In [39]:
# Directory the sequence-classification model and its tokenizer are loaded from
model_nm = '../input/classification-debert-model'

In [56]:
# Importing necessary libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Load the trained classifier and its tokenizer from model_nm
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_nm)
loaded_tokz = AutoTokenizer.from_pretrained(model_nm)

# Trainer used for prediction only. `score` is the metric function defined
# earlier in the notebook; it is intentionally not redefined here so a single
# definition stays in effect.
loaded_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokz, compute_metrics=score)


In [57]:
# Separator token of the loaded tokenizer; it joins discourse type and
# discourse text when the classifier inputs are built
sep = loaded_tokz.sep_token
sep

'[SEP]'

In [58]:
def tok_func(x):
    """Tokenize the ``"inputs"`` entry of a batch with the loaded tokenizer.

    Args:
        x: Mapping with an ``"inputs"`` entry holding the text(s) to encode.

    Returns:
        Whatever ``loaded_tokz`` returns for those texts with truncation enabled.
    """
    texts = x["inputs"]
    encoded = loaded_tokz(texts, truncation=True)
    return encoded


In [59]:
# Preview the span table. The `inputs` column visible in the saved output is
# created by the `df['inputs'] = ...` assignment in a cell further down.
df

Unnamed: 0,id,discourse_type,predictionstring,discourse_start,discourse_end,discourse_text,discourse_id,inputs
0,0FB0700DAF44,Position,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,0,395,While it may be true that the Mason City gover...,0,Position[SEP]While it may be true that the Mas...
1,0FB0700DAF44,Claim,67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 8...,396,551,"Citing surveys of city residents, the author r...",1,"Claim[SEP]Citing surveys of city residents, th..."
2,0FB0700DAF44,Claim,186 187 188 189 190 191 192 193 194 195 196 19...,1099,1279,"Additionally, the author implies that resident...",2,"Claim[SEP]Additionally, the author implies tha..."
3,0FB0700DAF44,Evidence,213 214 215 216 217 218 219 220 221 222 223 22...,1280,1819,"While a polluted, smelly river would likely cu...",3,"Evidence[SEP]While a polluted, smelly river wo..."
4,0FB0700DAF44,Claim,303 304 305 306 307 308 309 310 311 312 313 31...,1820,2020,Building upon the implication that residents d...,4,Claim[SEP]Building upon the implication that r...
5,0FB0700DAF44,Evidence,339 340 341 342 343 344 345 346 347 348 349 35...,2021,2390,If the river's water quality and smell result ...,5,Evidence[SEP]If the river's water quality and ...
6,0FB0700DAF44,Evidence,403 404 405 406 407 408 409 410 411 412 413 41...,2391,2630,There are some bodies of water which emit a st...,6,Evidence[SEP]There are some bodies of water wh...


In [44]:
# Classifier input text: "<discourse type><sep token><discourse text>"
df['inputs'] = df['discourse_type'] + sep + df['discourse_text']

In [60]:
# Importing necessary libraries
from datasets import Dataset

def get_dds(df, train=True):
    """Turn a span DataFrame into a tokenized ``Dataset``.

    Args:
        df: DataFrame containing the span columns listed below, including the
            ``inputs`` text column that ``tok_func`` tokenizes.
        train: Accepted for call compatibility; not used by this function.

    Returns:
        The dataset produced by mapping ``tok_func`` over ``df`` in batches,
        with the original span columns removed.
    """
    # Original columns that are removed once tokenization has run
    drop_cols = [
        'discourse_text', 'discourse_type', 'inputs', 'discourse_id',
        'id', 'predictionstring', 'discourse_start', 'discourse_end',
    ]
    return Dataset.from_pandas(df).map(tok_func, batched=True, remove_columns=drop_cols)


In [61]:
# Tokenize the predicted spans for the classifier
loaded_test_ds = get_dds(df, train=False)

# Raw classifier outputs for every span
loaded_predictions = loaded_trainer.predict(loaded_test_ds)

# Convert the outputs to class probabilities. `dim=-1` (the class axis) is
# passed explicitly because relying on softmax's implicit dimension choice is
# deprecated in PyTorch and emits a warning.
loaded_softmax_preds = F.softmax(torch.Tensor(loaded_predictions.predictions), dim=-1).numpy().astype(float)

# Storing the softmax predictions
loaded_preds = loaded_softmax_preds


  0%|          | 0/1 [00:00<?, ?ba/s]

In [62]:
# Start the result table from an independent copy of the span-identifying columns
final_df = df[['id', 'discourse_type', 'discourse_start', 'discourse_end']].copy()

# One probability column per class, taken from prediction columns 0, 1 and 2
for class_index, class_name in enumerate(('Ineffective', 'Adequate', 'Effective')):
    final_df[class_name] = loaded_preds[:, class_index]


In [65]:
def get_effectiveness(row):
    """Label a row as ``"<discourse_type> - <most probable effectiveness>"``.

    Args:
        row: Mapping with ``discourse_type`` and the three probability entries
            ``Ineffective``, ``Adequate`` and ``Effective``.

    Returns:
        str: The label; on ties the earlier class in the order
        Ineffective, Adequate, Effective wins.
    """
    top = max(row[name] for name in ('Ineffective', 'Adequate', 'Effective'))

    # The first class that reaches the maximum wins; 'Effective' is the fallback
    winner = next(
        (name for name in ('Ineffective', 'Adequate') if row[name] == top),
        'Effective',
    )
    return f"{row['discourse_type']} - {winner}"

# Label every row with "<discourse_type> - <most probable class>" via get_effectiveness
final_df['Effectiveness'] = final_df.apply(get_effectiveness, axis=1)


In [66]:
# Show the per-span class probabilities together with the chosen label
final_df

Unnamed: 0,id,discourse_type,discourse_start,discourse_end,Ineffective,Adequate,Effective,Effectiveness
0,0FB0700DAF44,Position,0,395,0.275402,0.379183,0.345415,Position - Adequate
1,0FB0700DAF44,Claim,396,551,0.275179,0.378909,0.345913,Claim - Adequate
2,0FB0700DAF44,Claim,1099,1279,0.275024,0.379236,0.34574,Claim - Adequate
3,0FB0700DAF44,Evidence,1280,1819,0.275675,0.378131,0.346194,Evidence - Adequate
4,0FB0700DAF44,Claim,1820,2020,0.275519,0.378799,0.345682,Claim - Adequate
5,0FB0700DAF44,Evidence,2021,2390,0.27552,0.378569,0.34591,Evidence - Adequate
6,0FB0700DAF44,Evidence,2391,2630,0.275595,0.378632,0.345773,Evidence - Adequate


In [70]:
# One highlight colour per discourse type. Every colour is written as a
# '#rrggbb' hex value; the 'Concluding Statement' entries previously lacked the
# leading '#', which made them invalid colours.
_DISCOURSE_COLORS = {
    'Lead': '#8000ff',
    'Position': '#2b7ff6',
    'Evidence': '#2adddd',
    'Claim': '#80ffb4',
    'Concluding Statement': '#d4dd80',
    'Counterclaim': '#ff8042',
    'Rebuttal': '#ff0000',
}

# Label -> colour for every "<discourse type> - <effectiveness>" label; the
# three effectiveness levels of a discourse type share its colour.
colors = {
    f'{discourse_type} - {rating}': hex_color
    for discourse_type, hex_color in _DISCOURSE_COLORS.items()
    for rating in ('Ineffective', 'Adequate', 'Effective')
}

def visualize(example):
    """Render the labelled spans of one essay with displaCy.

    Args:
        example: Essay id; every row of ``final_df`` with this ``id`` becomes
            one highlighted entity.

    Reads the module-level ``final_df``, ``txt`` and ``colors``. The rendering
    is displayed in the notebook; nothing is returned.
    """
    rows = final_df[final_df['id'] == example]

    # One entity per span: character offsets into `txt` plus the label to show
    ents = [
        {'start': int(begin), 'end': int(end), 'label': label}
        for begin, end, label in zip(
            rows['discourse_start'], rows['discourse_end'], rows['Effectiveness']
        )
    ]

    # manual=True: the document is passed as a plain dict
    displacy.render(
        {"text": txt, "ents": ents, "title": 'Argument Essay:'},
        style="ent",
        options={"ents": final_df.Effectiveness.unique().tolist(), "colors": colors},
        manual=True,
        jupyter=True,
    )

In [72]:
from spacy import displacy

# Essay id of every row (one entry per span, so ids repeat)
examples = final_df['id'].values.tolist()

# Ids that have already been rendered
visualized_ids = set()

for ex in examples:
    # Render each essay only once, the first time its id appears
    if ex in visualized_ids:
        continue
    visualize(ex)
    print('\n')  # blank separation between essays
    visualized_ids.add(ex)




