#### Load data from google drive

In [1]:
#This method that import file from google drive is generated by gpt.

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
link = 'https://drive.google.com/file/d/1XKcwl1xDcV3PwErWuA47ZPPnqupayxFo/view?usp=drive_link'

In [3]:
file_id = '1XKcwl1xDcV3PwErWuA47ZPPnqupayxFo'

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('subset_result.csv')

In [4]:
import pandas as pd
df = pd.read_csv('subset_result.csv')

In [5]:
df_2 = df[['item_1A', 'high_7']]

In [6]:
df_2.head()

Unnamed: 0,item_1A,high_7
0,"Item 1A. Risk Factors\nThe following factors, ...",0
1,ITEM 1A.\nRISK FACTORS\nRisks Related to Our L...,1
2,Item 1A. Risk Factors\n.\n” for a discussion o...,1
3,ITEM 1A. RISK FACTORS\nOur forward-looking sta...,0
4,Item 1A. RISK FACTORS\nItem 1A. Risk Factors i...,1


### Randomly shuffle data

In [7]:
df_shuffled = df_2.sample(frac=1.0, random_state=42)

In [8]:
df_shuffled.head()

Unnamed: 0,item_1A,high_7
521,ITEM 1A. RISK FACTORS\nAn investment in our se...,1
737,ITEM 1A.\nRISK FACTORS\nWe caution you that th...,0
740,Item 1A. Risk Factors\nYou should carefully co...,1
660,ITEM 1A. RISK FACTORS\nItem 1A. Risk factors\n...,1
411,ITEM 1A - RISK FACTORS\nYou should carefully c...,0


### Split train and test

In [9]:
df_train = df_2.iloc[:800]
df_test = df_2.iloc[801:1000]

In [10]:
df_train.loc[:, 'item_1A'] = df_train['item_1A'].astype(str)
df_test.loc[:, 'item_1A'] = df_test['item_1A'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[:, 'item_1A'] = df_train['item_1A'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, 'item_1A'] = df_test['item_1A'].astype(str)


### Tokenize data, and import model

In [11]:
import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

#compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

#tokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

#tokenize function
def tokenize_and_format(df):
    tokenized_texts = tokenizer(df['item_1A'].tolist(), padding='max_length', truncation=True, max_length=512, return_tensors="pt")
    labels = torch.tensor(df['high_7'].values)
    return tokenized_texts, labels


#tokenize training and testing data
train_tokenized, train_labels = tokenize_and_format(df_train)
test_tokenized, test_labels = tokenize_and_format(df_test)

#create TensorDataset
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], train_labels)
test_dataset = TensorDataset(test_tokenized['input_ids'], test_tokenized['attention_mask'], test_labels)

#model
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=2)


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install transformers[torch]

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

In [15]:
import accelerate
import transformers
print("Accelerate version:", accelerate.__version__)
print("Transformers version:", transformers.__version__)

Accelerate version: 0.25.0
Transformers version: 4.35.2


### Set parameters

In [16]:
from transformers import TrainingArguments, Trainer
#training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=1e-12
)

def custom_data_collator(features):
    input_ids = torch.stack([f[0] for f in features])
    attention_masks = torch.stack([f[1] for f in features])
    labels = torch.stack([f[2] for f in features])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }


#trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=custom_data_collator
)


### Train model

In [17]:
trainer.train()

Step,Training Loss
10,0.7035
20,0.7073
30,0.6976
40,0.6936
50,0.6892
60,0.6944
70,0.6949
80,0.7017
90,0.6879
100,0.6978


TrainOutput(global_step=600, training_loss=0.6989584334691366, metrics={'train_runtime': 573.4197, 'train_samples_per_second': 4.185, 'train_steps_per_second': 1.046, 'total_flos': 788221933977600.0, 'train_loss': 0.6989584334691366, 'epoch': 3.0})

### Check model accuracy

In [18]:
trainer.evaluate()

{'eval_loss': 0.7011431455612183,
 'eval_accuracy': 0.4020100502512563,
 'eval_runtime': 13.8485,
 'eval_samples_per_second': 14.37,
 'eval_steps_per_second': 3.61,
 'epoch': 3.0}

In [19]:
predictions_output = trainer.predict(test_dataset)

In [20]:
import numpy as np
predictions = predictions_output.predictions

predicted_labels = np.argmax(predictions, axis=1)

### Compare initial labels and prediction

In [21]:
predicted_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0])

In [22]:
actual_labels = test_dataset.tensors[2].numpy()

In [23]:
actual_labels

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0])