<a href="https://colab.research.google.com/github/rishabhshah13/DCP_Capstone/blob/main/model_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch import cuda
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from google.colab import drive


# Select the compute device: the GPU when PyTorch reports one, the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Bare last expression so the notebook cell displays the selected device.
device

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_path = "des-classification-model"

# Tokenizer and classifier are loaded from the same checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Text-classification pipeline wrapping the tokenizer and model loaded above.
nlp = pipeline(task="text-classification", tokenizer=tokenizer, model=model)

In [None]:
def predict(text):
    """
    Predicts the class label for a given input text.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. The model is put
    in evaluation mode and run on the GPU when PyTorch reports one, otherwise on
    the CPU.

    Args:
        text (str): The input text for which the class label needs to be predicted.

    Returns:
        pred_label (str): The predicted class label, looked up in
            ``model.config.id2label``.
    """
    # Resolve the device at call time instead of hard-coding "cuda", so the
    # function also works on a CPU-only runtime.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenize the input text (truncated to at most 512 tokens) and move the
    # resulting tensors to the selected device in a single step.
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors="pt"
    ).to(target_device)

    # Ensure the model is in evaluation mode and on the same device as the inputs.
    # Both calls are effectively no-ops once the model is already set up.
    model.eval()
    model.to(target_device)

    # Inference only: disable autograd so no computation graph is built and kept
    # alive for every call (this function is applied row by row to a DataFrame).
    with torch.no_grad():
        outputs = model(**inputs)

    # The first element of the model output holds the raw logits, indexed as
    # (batch, class). Softmax is monotonic, so the arg-max of the logits is the
    # same class as the arg-max of the softmax probabilities; the softmax is
    # therefore not needed just to pick the winning class.
    logits = outputs[0]

    # Take the arg-max along the class dimension (dim=1) rather than over the
    # flattened tensor. For a single text this yields a one-element tensor; if a
    # batch were passed by mistake, `.item()` below fails loudly instead of
    # silently returning an index into the flattened logits.
    pred_label_idx = logits.argmax(dim=1)

    # Map the predicted class index (extracted as a Python scalar) to its label.
    pred_label = model.config.id2label[pred_label_idx.item()]

    return pred_label


In [None]:
# Read the input CSV from the Colab filesystem into a DataFrame.
final_df = pd.read_csv(
    filepath_or_buffer='/content/all_labelled_data_for_training.csv',
)

In [None]:
# Build a null-free copy of the description column: missing values become the
# literal string 'null' so that every row passed to `predict` is a string.
# `fillna` returns a new Series, so the source column is left untouched.
final_df['Company Li Description with null'] = (
    final_df['Company Li Description'].fillna('null')
)

# Run the classifier on each description and store the predicted label.
final_df['Company Des Relevant Score'] = (
    final_df['Company Li Description with null'].apply(predict)
)

# Persist the scored frame without the index column.
final_df.to_csv(path_or_buf='/content/final_df.csv', index=False)