### Load dataset

In [1]:
import pandas as pd

In [2]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
link = 'https://drive.google.com/file/d/1XKcwl1xDcV3PwErWuA47ZPPnqupayxFo/view?usp=drive_link'

In [3]:
file_id = '1XKcwl1xDcV3PwErWuA47ZPPnqupayxFo'

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('subset_result.csv')

In [4]:
df = pd.read_csv('subset_result.csv')

In [5]:
df.columns

Index(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report',
       'sic', 'state_of_inc', 'state_location', 'fiscal_year_end',
       'filing_html_index', 'htm_filing_link', 'complete_text_filing_link',
       'filename', 'item_1', 'item_1A', 'item_1B', 'item_2', 'item_3',
       'item_4', 'item_5', 'item_6', 'item_7', 'item_7A', 'item_8', 'item_9',
       'item_9A', 'item_9B', 'item_10', 'item_11', 'item_12', 'item_13',
       'item_14', 'item_15', 'symbol', 'price_0', 'price_7', 'price_30',
       'price_90', 'high_7', 'high_30', 'high_90'],
      dtype='object')

### Only extract data we need

In [6]:
df_2 = df[['item_1A', 'high_7']]

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

#extract the last 4 sentences
def extract_last_4_sentences(text):
    # Check if the value is already a string
    if isinstance(text, str):
        sentences = sent_tokenize(text)
        if len(sentences) >= 4:
            return ' '.join(sentences[-4:])
        else:
            #if there are fewer than 4 sentences, return the original text
            return text
    else:
        #convert non-string values to strings
        return str(text)

df_2['item_1A'] = df_2['item_1A'].apply(extract_last_4_sentences)

print(df_2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                               item_1A  high_7
0    •\nIn May 2020, the Company elected to carryba...       0
1    Our goal is ultimately to realize gains upon o...       1
2    If we\nfail to maintain our existing relations...       1
3    If new or more stringent federal, state or loc...       0
4    Item 1A. RISK FACTORS\nItem 1A. Risk Factors i...       1
..                                                 ...     ...
995  Our certificate of incorporation and bylaws af...       0
996  Stockholders in our company will be deemed to ...       1
997  We do not know whether our securities will be ...       1
998  Opportunities may arise in the area of potenti...       0
999  If securities or industry analysts do not cont...       0

[1000 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['item_1A'] = df_2['item_1A'].apply(extract_last_4_sentences)


In [8]:
df_2.head()

Unnamed: 0,item_1A,high_7
0,"•\nIn May 2020, the Company elected to carryba...",0
1,Our goal is ultimately to realize gains upon o...,1
2,If we\nfail to maintain our existing relations...,1
3,"If new or more stringent federal, state or loc...",0
4,Item 1A. RISK FACTORS\nItem 1A. Risk Factors i...,1


In [9]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

#preclean text data
def clean_text(text):
    if pd.isna(text):
        return "missing"
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    #tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]  # Lemmatization and stopword removal
    return ' '.join(tokens)

# Apply cleaning function to your columns
# List of columns that contain 'item' in their name
item_columns = [col for col in df_2.columns if 'item' in col]

# Apply cleaning function to each item column
for col in item_columns:
    df_2[f'{col}_clean'] = df_2[col].apply(clean_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2[f'{col}_clean'] = df_2[col].apply(clean_text)


In [10]:
df_2.head()

Unnamed: 0,item_1A,high_7,item_1A_clean
0,"•\nIn May 2020, the Company elected to carryba...",0,in may the company elected to carryback its fi...
1,Our goal is ultimately to realize gains upon o...,1,our goal is ultimately to realize gains upon o...
2,If we\nfail to maintain our existing relations...,1,if we fail to maintain our existing relationsh...
3,"If new or more stringent federal, state or loc...",0,if new or more stringent federal state or loca...
4,Item 1A. RISK FACTORS\nItem 1A. Risk Factors i...,1,item a risk factors item a risk factors is not...


In [11]:
df_2.drop(columns=['item_1A'])

Unnamed: 0,high_7,item_1A_clean
0,0,in may the company elected to carryback its fi...
1,1,our goal is ultimately to realize gains upon o...
2,1,if we fail to maintain our existing relationsh...
3,0,if new or more stringent federal state or loca...
4,1,item a risk factors item a risk factors is not...
...,...,...
995,0,our certificate of incorporation and bylaws af...
996,1,stockholders in our company will be deemed to ...
997,1,we do not know whether our securities will be ...
998,0,opportunities may arise in the area of potenti...


### Split dataset

In [12]:
df_train = df_2.iloc[:800]
df_test = df_2.iloc[801:1000]

In [13]:
!pip install -q transformers torch

In [26]:
M = 30000
batch_size = 32
epochs = 5
lr_init = 1e-5
max_len = 128
warmup_steps = 3

In [15]:
from typing import List, Tuple
import torch
from transformers import DistilBertTokenizerFast
from transformers.tokenization_utils_base import BatchEncoding

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [27]:
import pandas as pd

def batch_data(data: pd.DataFrame, bsize: int) -> List[Tuple[BatchEncoding, torch.Tensor, List[str]]]:
    batches = []

    for start_idx in range(0, data.shape[0], bsize):
        #slice the dataframe into mini-batches, theis method is generated by LLM
        batch_data = data.iloc[start_idx:start_idx + bsize]

        #tokenize the content
        X = tokenizer.batch_encode_plus(
            batch_data['item_1A_clean'].tolist(),
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        Y = torch.tensor(batch_data['high_7'].tolist())

        #get the sentences
        sentences = batch_data['item_1A_clean'].tolist()

        #append the batch to our list of batches
        batches.append((X, Y, sentences))

    return batches

In [28]:
train_batches = batch_data(df_train, bsize=batch_size)
test_batches = batch_data(df_test, bsize=batch_size)

In [29]:
from tqdm import tqdm
import torch
from transformers import DistilBertForSequenceClassification, \
  AdamW, get_linear_schedule_with_warmup

# Transformer model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=2,
    output_hidden_states=True)

# The torch `device` on which to execute the model computation
if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
else:
    device = torch.device('cpu') # CPU
model.to(device)

# The gradient descent optimizer used for fine tuning
optimizer = AdamW(model.parameters(), lr=lr_init)

# The gradient descent learning rate
lr = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps,
    num_training_steps=len(train_batches))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [30]:
from datasets import load_metric
from sklearn.metrics import f1_score, accuracy_score
import torch.nn.functional as F
import numpy as np

from tqdm.auto import tqdm

def runner(batches, desc: str, train=True, return_embeddings=False):
    model.train() if train else model.eval()

    total_loss = 0
    predictions, true_labels, all_embeddings = [], [], []

    for batch in tqdm(batches, desc=desc):
        inputs = batch[0]
        labels = batch[1].to(device)

        #move inputs to the appropriate device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        #reset gradients if in training mode
        if train:
            optimizer.zero_grad()

        #return hidden states if we want embeddings
        outputs = model(**inputs, labels=labels, output_hidden_states=return_embeddings)
        loss = outputs.loss
        logits = outputs.logits

        #returning embeddings, extract and collect them
        if return_embeddings:
            embeddings = outputs.hidden_states[-1].detach().cpu().numpy()
            all_embeddings.append(embeddings)

        if train:
            loss.backward()
            optimizer.step()
            lr.step()

        #update tracking variables
        total_loss += loss.item()
        predictions.extend(logits.argmax(dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

    #average loss and performance metrics
    avg_loss = total_loss / len(batches)
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    #concatenate embedings
    all_embeddings = np.concatenate(all_embeddings, axis=0) if return_embeddings else None

    return predictions, avg_loss, accuracy, f1, all_embeddings

In [31]:
#train the model and evaluate it
train_results = runner(train_batches, desc="Training", train=True)
test_results = runner(test_batches, desc="Testing", train=False, return_embeddings=False)

Training:   0%|          | 0/25 [00:00<?, ?it/s]

Testing:   0%|          | 0/7 [00:00<?, ?it/s]

In [32]:
test_results[1:4]
#the first number is avg_loss
#the secone number is accuracy, and the third number is f1 score.
# if all outpus are needed, only run 'test_results' will show all outputs.

(0.6818699496132987, 0.5879396984924623, 0.45716190925300987)

In [33]:
test_results[0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [34]:
test_results[0].count(1)

194