In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertConfig
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder


In [4]:
# Load the data
file1 = pd.read_csv('/content/sample_data/files/all_risk_categories.csv')
file2 = pd.read_csv('/content/sample_data/files/New_Results_QTR1.csv')

# Encode the categories
label_encoder = LabelEncoder()
file1['label'] = label_encoder.fit_transform(file1['category'])

# Split the data for training and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    file1['Item1A'].tolist(), file1['label'].tolist(), test_size=0.2, random_state=42
)


In [6]:
print(type(train_texts))
print(type(val_texts))

# Ensure each element is a string
print(type(train_texts[0]))
print(type(val_texts[0]))


<class 'list'>
<class 'list'>
<class 'str'>
<class 'str'>


In [14]:
# Print a few examples from train_texts and val_texts
print("Sample train_texts:", train_texts[:5])
print("Sample val_texts:", val_texts[:5])


Sample train_texts: ["[O]ur or our customers' sensitive, proprietary, or confidential information could be leaked, disclosed, orrevealed as a result of or in connection with our employees', personnel's, or vendors' use of generative AItechnologies. Any such information that we input into a third-party generative AI or machine learning (“ML”) platform could be revealed to others, including if information is used to train the third party's\nAI/ML models. Additionally, where an AI/ML model ingests personal information\nand makes connections\nusing such data, those technologies may reveal other sensitive, proprietary, or confidential information\ngenerated by the model. Moreover, AI/ML models may create incomplete, inaccurate, or otherwise flawed\noutputs, some of which may\nappear correct. We may use AI/ML outputs to make certain decisions. Due to\nthese potential flaws, the model could lead us to make decisions that could bias certain individuals or classes\nof individuals and adversely 

In [15]:
# Replace non-string values with empty strings, then replace newlines and trim spaces
train_texts_cleaned = [str(text).replace('\n', ' ').strip() if isinstance(text, str) else "" for text in train_texts]
val_texts_cleaned = [str(text).replace('\n', ' ').strip() if isinstance(text, str) else "" for text in val_texts]

# Print cleaned sample data
print("Cleaned Sample train_texts:", train_texts_cleaned[:5])
print("Cleaned Sample val_texts:", val_texts_cleaned[:5])

Cleaned Sample train_texts: ["[O]ur or our customers' sensitive, proprietary, or confidential information could be leaked, disclosed, orrevealed as a result of or in connection with our employees', personnel's, or vendors' use of generative AItechnologies. Any such information that we input into a third-party generative AI or machine learning (“ML”) platform could be revealed to others, including if information is used to train the third party's AI/ML models. Additionally, where an AI/ML model ingests personal information and makes connections using such data, those technologies may reveal other sensitive, proprietary, or confidential information generated by the model. Moreover, AI/ML models may create incomplete, inaccurate, or otherwise flawed outputs, some of which may appear correct. We may use AI/ML outputs to make certain decisions. Due to these potential flaws, the model could lead us to make decisions that could bias certain individuals or classes of individuals and adversely 

In [16]:
import re

def clean_text(text):
    # Replace newline characters with spaces
    text = text.replace('\n', ' ')
    # Remove special characters like brackets
    text = re.sub(r'[^\w\s]', '', text)
    # Remove excessive spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the clean_text function to train_texts and val_texts
train_texts_cleaned = [clean_text(str(text)) if isinstance(text, str) else "" for text in train_texts]
val_texts_cleaned = [clean_text(str(text)) if isinstance(text, str) else "" for text in val_texts]


# Print cleaned sample data
print("Cleaned Sample train_texts:", train_texts_cleaned[:5])
print("Cleaned Sample val_texts:", val_texts_cleaned[:5])

Cleaned Sample train_texts: ['Our or our customers sensitive proprietary or confidential information could be leaked disclosed orrevealed as a result of or in connection with our employees personnels or vendors use of generative AItechnologies Any such information that we input into a thirdparty generative AI or machine learning ML platform could be revealed to others including if information is used to train the third partys AIML models Additionally where an AIML model ingests personal information and makes connections using such data those technologies may reveal other sensitive proprietary or confidential information generated by the model Moreover AIML models may create incomplete inaccurate or otherwise flawed outputs some of which may appear correct We may use AIML outputs to make certain decisions Due to these potential flaws the model could lead us to make decisions that could bias certain individuals or classes of individuals and adversely impact their rights As a result we co

In [17]:
# Apply the clean_text function to the Item1A column of file2
file2['Item1A_cleaned'] = file2['Item1A'].apply(lambda x: clean_text(str(x)) if isinstance(x, str) else "")

print("Cleaned Sample Item1A from file2:", file2['Item1A_cleaned'].head())


Cleaned Sample Item1A from file2: 0    Risk Factors Our operations and financial resu...
1    Risk Factors Our business prospects financial ...
2    Risk Factors RISK FACTORS A description of the...
3    Risk Factors An investment in our securities i...
4    Ri sk Factors FORWARDLOOKING STATEMENTS This A...
Name: Item1A_cleaned, dtype: object


In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts_cleaned, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts_cleaned, truncation=True, padding=True, max_length=256)

# Retry tokenization with cleaned data
try:
    train_encodings = tokenizer(train_texts_cleaned, truncation=True, padding=True, max_length=256)
    val_encodings = tokenizer(val_texts_cleaned, truncation=True, padding=True, max_length=256)
    print("Tokenization successful!")
except ValueError as e:
    print(f"Tokenization failed: {e}")


Tokenization successful!


In [19]:
# Tokenize the cleaned Item1A content in file2
test_encodings = tokenizer(file2['Item1A_cleaned'].tolist(), truncation=True, padding=True, max_length=256)

# Print the first few tokenized examples to verify
print("Tokenized Sample Item1A from file2:", test_encodings['input_ids'][:5])

Tokenized Sample Item1A from file2: [[101, 3891, 5876, 2256, 3136, 1998, 3361, 3463, 2024, 3395, 2000, 3365, 10831, 1998, 9662, 7368, 2164, 2216, 2649, 2917, 2029, 2089, 2031, 1037, 3430, 1998, 15316, 3466, 2006, 2256, 2449, 3463, 1997, 3136, 5356, 6223, 3361, 3785, 1998, 1996, 6202, 3976, 1997, 2256, 2691, 4518, 1996, 10831, 1998, 9662, 7368, 2649, 2917, 2024, 2025, 1996, 2069, 3924, 5307, 2149, 3176, 10831, 1998, 9662, 7368, 2025, 12825, 2124, 2000, 2149, 2030, 2008, 2057, 2747, 9266, 2213, 10047, 8585, 14482, 2036, 2089, 17727, 11215, 2256, 2449, 3136, 2017, 2323, 5136, 2122, 10831, 1998, 9662, 7368, 5362, 2362, 2007, 2035, 1997, 1996, 2060, 2592, 2443, 2030, 5100, 2011, 4431, 1999, 2023, 3296, 3189, 2006, 2433, 1047, 2065, 2151, 1997, 1996, 2206, 10831, 2941, 5258, 2256, 2449, 3361, 4650, 3463, 1997, 3136, 1998, 2925, 16746, 2071, 2022, 3430, 2135, 1998, 15316, 2135, 5360, 2017, 2323, 2025, 17841, 2256, 19380, 1997, 2151, 1997, 1996, 2206, 10831, 2000, 19515, 2008, 2107, 10831, 203

In [20]:
class RiskDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = RiskDataset(train_encodings, train_labels)
val_dataset = RiskDataset(val_encodings, val_labels)


In [22]:
test_dataset = RiskDataset(test_encodings, [0]*len(test_encodings['input_ids']))

In [23]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps",     # Evaluate every `logging_steps`
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()




Step,Training Loss,Validation Loss


TrainOutput(global_step=6, training_loss=1.9402179718017578, metrics={'train_runtime': 117.7615, 'train_samples_per_second': 0.306, 'train_steps_per_second': 0.051, 'total_flos': 3811170279600.0, 'train_loss': 1.9402179718017578, 'epoch': 3.0})

In [26]:
import torch
import numpy as np

# Define the batch size
batch_size = 32  # You can adjust this depending on your memory capacity
predictions = []

# Iterate over the dataset in batches
for i in range(0, len(test_dataset), batch_size):
    batch = test_dataset[i:i+batch_size]
    batch_predictions = trainer.predict(batch)
    predictions.append(batch_predictions.predictions)

# Combine all batches into a single tensor and find the predicted labels
predictions_tensor = torch.tensor(np.concatenate(predictions, axis=0))
predicted_labels = torch.argmax(predictions_tensor, dim=1)


KeyError: 0

In [27]:
from torch.utils.data import DataLoader

# Create a DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [28]:
import torch
import numpy as np

predictions = []

# Iterate over the dataloader in batches
for batch in test_dataloader:
    # Move the batch data to the correct device (e.g., GPU if available)
    for key in batch:
        batch[key] = batch[key].to(trainer.args.device)

    # Get predictions for the current batch
    with torch.no_grad():  # Disable gradient calculation for inference
        batch_predictions = trainer.model(**batch)

    # Append the logits (model outputs) to predictions list
    predictions.append(batch_predictions.logits.cpu().numpy())

# Combine all batches into a single array and find the predicted labels
predictions_tensor = torch.tensor(np.concatenate(predictions, axis=0))
predicted_labels = torch.argmax(predictions_tensor, dim=1)


In [29]:
# Decode the predicted labels
predicted_categories = label_encoder.inverse_transform(predicted_labels.numpy())

# Save the output to a new CSV
output = pd.DataFrame({
    'filename': file2['filename'],
    'Item1A': file2['Item1A'],
    'category': predicted_categories,
    'KeywordMentioned': file2['KeywordMentioned']
})
output.to_csv('output_QTR1.csv', index=False)


In [30]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [31]:
df1 = pd.read_csv('/content/output_QTR1.csv')
df1.head()

Unnamed: 0,filename,Item1A,category,KeywordMentioned
0,20230301_10-K_edgar_data_1831915_0000950170-23...,. Risk Factors Our operations and financial re...,Third-Party Risk,artificial intelligence
1,20230221_10-K_edgar_data_1410384_0001410384-23...,". Risk Factors. Our business, prospects, finan...",Third-Party Risk,"machine learning, machine learning"
2,20230228_10-K_edgar_data_1315098_0001315098-23...,. Risk Factors RISK FACTORS A description of t...,Third-Party Risk,"generative ai, artificial intelligence, machin..."
3,20230113_10-K_edgar_data_1829966_0001683168-23...,. Risk Factors. An investment in our securitie...,Third-Party Risk,business intelligence
4,20230217_10-K_edgar_data_871763_0000950170-23-...,. Ri sk Factors FORWARD-LOOKING STATEMENTS Thi...,Third-Party Risk,"artificial intelligence, machine learning, art..."
