<a href="https://colab.research.google.com/github/skywalker290/Financial-News-Analyser/blob/main/Bloomberg_Single_Label_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U accelerate --quiet
!pip install -U transformers --quiet

In [None]:

import pandas as pd

# Labeled news dataset; the code below relies on its `title`, `text` and `label` columns.
df = pd.read_csv("https://raw.githubusercontent.com/skywalker290/Financial-News-Analyser/main/labeled_News_dataset.csv")

# Build the model input from title + body.
# - A separating space keeps the last word of the title from fusing with the
#   first word of the body.
# - Missing values are treated as empty strings; otherwise a single NaN in
#   either column would turn the whole combined text into NaN.
title_part = df['title'].fillna('').astype(str)
body_part = df['text'].fillna('').astype(str)
df['Text'] = (title_part + ' ' + body_part).str.strip()

# Last expression of the cell so the preview is actually rendered.
df.head()

In [None]:
# Down-sample label 0: remove the first MAX_LABEL_ZERO_DROPS rows (in index
# order) whose label is 0. This is the vectorized equivalent of dropping rows
# one at a time in a Python loop, which is quadratic and raises KeyError when
# the cell is re-run on an already-filtered frame.
# The index is deliberately not reset; downstream code only uses
# `.values` / `.tolist()`.
# NOTE: re-running this cell removes a further batch of label-0 rows; re-run
# the data-loading cell first to start from the full dataset.
MAX_LABEL_ZERO_DROPS = 2000

label_zero_index = df.index[df['label'] == 0][:MAX_LABEL_ZERO_DROPS]
df = df.drop(index=label_zero_index)


## Label Encoder

## Model Building

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the label column; the encoder is kept so predictions can be
# mapped back to the original label values with `inverse_transform`.
label_column = df['label'].values.reshape(-1, 1)
try:
    # scikit-learn >= 1.2: `sparse` was renamed to `sparse_output`
    # (the old keyword was removed in 1.4).
    one_hot_encoder = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    one_hot_encoder = OneHotEncoder(categories='auto', sparse=False)
labels = one_hot_encoder.fit_transform(label_column)
texts = df['Text'].tolist()

import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, BloomForSequenceClassification

# Hold out 20% of the samples for validation (fixed seed for reproducibility).
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels,test_size=0.2, random_state=42)

# Tokenizer and model MUST come from the same checkpoint: token ids produced
# by a DistilBERT vocabulary are meaningless to BLOOM's embedding table, and
# the pad token id would not match the one BLOOM's classification head uses
# to locate the last real token.
# NOTE(review): a checkpoint fine-tuned with a different tokenizer needs to be
# re-trained after this change.
checkpoint = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Size the classification head from the one-hot label matrix instead of a
# hard-coded class count.
model = BloomForSequenceClassification.from_pretrained(checkpoint, num_labels=labels.shape[1])



# Let's build a custom dataset
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one text per item and pairs it with its label.

  Args:
    texts: indexable collection of texts (each item is passed through ``str``).
    labels: indexable collection of label vectors, aligned with ``texts``.
    tokenizer: callable accepting ``(text, truncation, padding, max_length,
      return_tensors)`` and returning a mapping with ``'input_ids'`` and
      ``'attention_mask'`` tensors.
    max_len: length every sequence is truncated / padded to.

  Raises:
    ValueError: if ``texts`` and ``labels`` have different lengths.
  """

  def __init__(self, texts, labels, tokenizer, max_len=128):
    if len(texts) != len(labels):
      raise ValueError(
          f"texts and labels must have the same length, got {len(texts)} and {len(labels)}")
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of samples."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for sample ``idx``."""
    text = str(self.texts[idx])
    # Cast to float32: NumPy one-hot rows are float64 by default, whereas the
    # model's logits are float32, and the loss expects matching dtypes.
    label = torch.as_tensor(self.labels[idx], dtype=torch.float32)

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    # The tokenizer returns tensors with a leading batch axis of 1; flatten
    # them so the DataLoader can add its own batch axis.
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': label
    }


# Wrap both splits with the shared tokenizer (default max_len).
train_dataset = CustomDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = CustomDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Single-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch
from sklearn.metrics import accuracy_score



def single_label_metrics(predictions, labels):
    """Compute evaluation metrics for single-label classification.

    Args:
        predictions: array-like of raw logits, one row per sample and one
            column per class.
        labels: array-like of the same shape holding the reference labels;
            expected to be one-hot rows (as produced by the notebook's
            ``OneHotEncoder``).

    Returns:
        dict with:
          - ``"roc_auc"``: ROC AUC of the softmax probabilities against the
            flattened one-hot targets (micro-averaged over classes),
          - ``"hamming_loss"``: fraction of mismatching one-hot entries,
          - ``"f1"``: F1 of the positive one-hot entries,
          - ``"accuracy"``: fraction of samples whose predicted class equals
            the true class.
    """
    # Softmax turns the logits into per-class probabilities.
    logits = torch.as_tensor(np.asarray(predictions), dtype=torch.float32)
    probs = torch.softmax(logits, dim=1).numpy()

    y_true_onehot = np.asarray(labels)
    pred_classes = probs.argmax(axis=1)
    true_classes = y_true_onehot.argmax(axis=1)

    # One-hot encode the predicted class of every sample (vectorized).
    y_pred_onehot = np.zeros_like(probs)
    y_pred_onehot[np.arange(len(pred_classes)), pred_classes] = 1

    # Flatten to 1-D binary vectors for the element-wise metrics.
    y_true = y_true_onehot.ravel()
    y_pred = y_pred_onehot.ravel()

    # ROC AUC is a ranking metric: score it with probabilities, not with the
    # thresholded 0/1 predictions (which discard the ranking information).
    roc_auc = roc_auc_score(y_true, probs.ravel())
    hamming = hamming_loss(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Accuracy is measured per sample on class indices. Measuring it on the
    # flattened one-hot entries inflates it, because a wrong prediction still
    # agrees with the target on every class that is neither predicted nor true.
    accuracy = accuracy_score(true_classes, pred_classes)

    metrics = {
        "roc_auc": roc_auc,
        "hamming_loss": hamming,
        "f1": f1,
        "accuracy": accuracy,
    }

    return metrics

def compute_metrics(p:EvalPrediction):
  """Trainer callback: extract the logits from ``p`` and score them against ``p.label_ids``.

  When ``p.predictions`` is a tuple, only its first element is used.
  """
  raw_predictions = p.predictions
  if isinstance(raw_predictions, tuple):
    raw_predictions = raw_predictions[0]

  return single_label_metrics(predictions=raw_predictions, labels=p.label_ids)

# Training Arguments
from transformers import TrainingArguments, Trainer

# Training configuration: checkpoints are written to ./results every 1000
# steps and only the two most recent ones are kept.
args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the current model on the validation split.
trainer.evaluate()


# Mount Google Drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0. 0. 0. ... 0. 0. 0.] [1. 0. 0. ... 0. 0. 0.]
Mounted at /content/drive


# Predictions

In [None]:
# Sample news article (multi-paragraph string) classified by the prediction code below.
text = """Nine Google
 workers were arrested on trespassing charges Tuesday night after staging a sit-in at the company’s offices in New York and Sunnyvale, California, including a protest in Google Cloud CEO Thomas Kurian’s office.

The arrests, which were livestreamed on Twitch by participants, follow rallies outside Google offices in New York, Sunnyvale and Seattle, which attracted hundreds of attendees, according to workers involved. The protests, led by the “No Tech for Apartheid” organization, focused on Project Nimbus — Google and Amazon’s joint $1.2 billion contract to provide the Israeli government and military with cloud computing services, including artificial intelligence tools, data centers and other cloud infrastructure.

Protesters in Sunnyvale sat in Kurian’s office for more than nine hours until their arrests, writing demands on Kurian’s whiteboard and wearing shirts that read “Googler against genocide.” In New York, protesters sat in a three-floor common space. Five workers from Sunnyvale and four from New York were arrested.

“On a personal level, I am opposed to Google taking any military contracts — no matter which government they’re with or what exactly the contract is about,” Cheyne Anderson, a Google Cloud software engineer based in Washington, told CNBC. “And I hold that opinion because Google is an international company and no matter which military it’s with, there are always going to be people on the receiving end... represented in Google’s employee base and also our user base.” Anderson had flown to Sunnyvale for the protest in Kurian’s office and was one of the workers arrested Tuesday.

“Google Cloud supports numerous governments around the world in countries where we operate, including the Israeli government, with our generally available cloud computing services,” a Google spokesperson told CNBC, adding, “This work is not directed at highly sensitive, classified, or military workloads relevant to weapons or intelligence services.”

The demonstrations show Google’s increased pressure from workers who oppose military use of its AI and cloud technology. Last month, Google Cloud engineer Eddie Hatfield interrupted a keynote speech from the managing director of Google’s Israel business stating, “I refuse to build technology that powers genocide.” Hatfield was subsequently fired. That same week, an internal Google employee message board was shut down after staffers posted comments about the company’s Israeli military contracts. A spokesperson at the time described the posts as “divisive content that is disruptive to our workplace.”

On Oct. 7, Hamas carried out deadly attacks on Israel, killing 1,200 and taking more than 240 hostages.  The following day, Israel declared war and began implementing a siege of Gaza, cutting off access to power, food, water and fuel. At least 33,899 people have been killed in the Gaza Strip since that date, the enclave’s Health Ministry said Wednesday in a statement on Telegram. In January at the U.N.’s top court, Israel rejected genocide charges brought by South Africa.

The Israeli Ministry of Defense reportedly sought consulting services from Google to expand its access to Google Cloud services. Google Photos is one platform used by the Israeli government to conduct surveillance in Gaza, according to The New York Times.

“I think what happened yesterday is evidence that Google’s attempts to suppress all of the voices of opposition to this contract are not only not working but actually having the opposite effect,” Ariel Koren, a former Google employee who resigned in 2022 after leading efforts to oppose the Project Nimbus contract, told CNBC. “It’s really just creating more agitation, more anger and more commitment.”

The New York sit-in started at noon ET and ended around 9:30 p.m. ET. Security asked workers to remove their banner, which spanned two floors, about an hour into the protest, according to Hasan Ibraheem, a Google software engineer based in New York City and one of the arrested workers.

“I realized, ’Oh, the place that I work at is very complicit and aiding in this genocide — I have a responsibility to act against it,″” Hasan Ibraheem, a Google software engineer based in New York City, told CNBC. Ibraheem added, “The fact that I am receiving money from Google and Israel is paying Google -- I am receiving part of that money, and that weighed very heavily on me.”

The New York workers were released from the police station after about four hours.

The nine arrested workers in New York and Sunnyvale told CNBC that, during the protest, they were locked out of their work accounts and offices, placed on administrative leave, and told to wait to return to work until being contacted by HR.

The workers were also protesting their labor conditions — namely “that the company stop the harassment, intimidation, bullying, silencing, and censorship of Palestinian, Arab, Muslim Googlers — and that the company address the health and safety crisis workers, especially those in Google Cloud, are facing due to the potential impacts of their work,” according to a release by the campaign.

“A small number of employee protesters entered and disrupted a couple of our locations,” a Google spokesperson told CNBC. “Physically impeding other employees’ work and preventing them from accessing our facilities is a clear violation of our policies, and we will investigate and take action. These employees were put on administrative leave and their access to our systems was cut. After refusing multiple requests to leave the premises, law enforcement was engaged to remove them to ensure office safety.”
"""


from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

# Load the fine-tuned model saved on Google Drive
loaded_model = BloomForSequenceClassification.from_pretrained("/content/drive/MyDrive/Bloomberg-Model", num_labels=5)

# Create TrainingArguments
args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="./results",
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2
)

# Create Trainer with the loaded model
new_trainer = Trainer(
    model=loaded_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Run inference with the *loaded* model. Using `trainer.model` here would
# classify with the earlier `trainer`'s model instead of the checkpoint that
# was just loaded from Drive.
inference_model = new_trainer.model
inference_model.eval()  # disable dropout for deterministic predictions

encoding = tokenizer(text, return_tensors='pt')
encoding = encoding.to(inference_model.device)

with torch.no_grad():  # no gradients needed for inference
    predictions = inference_model(**encoding)

# Logits -> class probabilities
probs = torch.softmax(predictions.logits, dim=1)
probs_numpy = probs.detach().cpu().numpy()

# Index of the most probable class for each input row
max_prob_indices = np.argmax(probs_numpy, axis=1)
max_index = int(max_prob_indices[0])

# One-hot encode the predicted class so the fitted encoder can map it back
# to the original label value.
one_hot_vector = np.zeros_like(probs_numpy)
one_hot_vector[np.arange(len(max_prob_indices)), max_prob_indices] = 1

print(one_hot_vector)

decoded_labels = one_hot_encoder.inverse_transform(one_hot_vector)
decoded_labels






[[1. 0. 0. 0. 0.]]


array([[-2]])

In [None]:
# (empty scratch cell: the stray `\\` was a SyntaxError on Restart & Run All)