In [None]:
# Attach Google Drive to the Colab runtime so that files under
# /content/drive/MyDrive can be read and written like local files.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Loading the Pre-Trained BERT Model and Tokenizer

* Loads the fine-tuned BERT model from a directory in Google Drive.
* Loads the tokenizer for text preprocessing (using bert-base-uncased).
* Retrieves the maximum token limit for the model to ensure inputs are properly truncated or padded.
* Reads the input data from a CSV file on Google Drive into the DataFrame `df`.

In [None]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Google Drive locations: the fine-tuned checkpoint directory and the input CSV.
model_directory = '/content/drive/MyDrive/bert_model'
csv_file_path = '/content/drive/MyDrive/merged_data.csv'

# Classifier weights come from the Drive checkpoint; the tokenizer is the
# stock 'bert-base-uncased' one fetched from the Hugging Face Hub.
model = BertForSequenceClassification.from_pretrained(model_directory)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Number of positions the model's position-embedding table covers, i.e. the
# longest token sequence it can accept.
max_length = model.config.max_position_embeddings

# Read the input rows; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Filtering Rows by Date

* Keeps at most the first 100 rows for each unique date, preserving the original row order.

* This is done using the groupby and head methods.

In [None]:
def filter_by_date(df, n=100, date_col='date'):
    """Keep at most the first ``n`` rows of every date group.

    Args:
        df: DataFrame that contains the grouping column.
        n: Maximum number of rows to keep per distinct value of the
            grouping column (default 100).
        date_col: Name of the column to group by (default ``'date'``).

    Returns:
        A new DataFrame holding the selected rows, in their original order
        and with their original index labels. ``df`` itself is not modified.

    Raises:
        KeyError: If ``date_col`` is not a column of ``df``.
    """
    return df.groupby(date_col).head(n)

# Cap on how many rows are kept per date; change it here rather than inline.
ROWS_PER_DATE = 100

# groupby().head() returns a new frame and leaves `df` untouched, so passing
# a defensive copy of the whole DataFrame is unnecessary.
result_df = filter_by_date(df, n=ROWS_PER_DATE)

In [None]:
# From here on `df` refers to an independent copy of the date-filtered rows.
df = result_df.copy()

# Defining the Prediction Function

* Processes the input text using the tokenizer.
* Performs a forward pass through the model to compute logits and predicts the sentiment class (0 or 1).
* Handles any exceptions and returns None in case of an error during prediction.

In [None]:
def predict(text):
    """Classify one piece of text with the fine-tuned BERT model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``max_length``
    objects created in the model-loading cell.

    Args:
        text: The text to classify. Values that are not strings (for example
            the float NaN pandas uses for a missing cell) are skipped.

    Returns:
        The index of the highest-scoring logit for ``text``, or ``None`` when
        the input is not a string or tokenization/inference raises.
    """
    import warnings

    # Non-string cells cannot be tokenized; skip them explicitly instead of
    # relying on the tokenizer raising.
    if not isinstance(text, str):
        return None

    try:
        inputs = tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=max_length,  # truncate to what the model can accept
            return_tensors="pt",
        )
        # Inference only: no gradients are needed, so skip autograd tracking.
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        return predictions[0]
    except Exception as e:
        # Stay best-effort (one bad row must not abort the whole column),
        # but surface the failure instead of hiding it.
        warnings.warn(f"predict() failed for input {text[:50]!r}: {e!r}")
        return None


# Applying the Prediction Function

* Applies the `predict` function to each entry of the `news` column of the DataFrame and stores the results in a new `Sentiment` column (`None` where prediction failed).

In [None]:
# Run the classifier on every entry of the `news` column, one row at a time,
# then attach the resulting labels as the `Sentiment` column.
sentiment_labels = df['news'].apply(predict)
df['Sentiment'] = sentiment_labels

# Saving the Updated DataFrame

In [None]:
# Write the DataFrame to a new CSV on Google Drive, leaving the row index
# out of the file.
output_csv_path = '/content/drive/MyDrive/data_with_predictions.csv'
df.to_csv(path_or_buf=output_csv_path, index=False)

# Confirm where the file was written.
print(f"Predictions saved to {output_csv_path}")


Predictions saved to /content/drive/MyDrive/data_with_predictions.csv
