# Predict Glassdoor Reviews
Use trained model to predict sentiment of Glassdoor Reviews.

In [1]:
# @title Environment running
running_local = False  # @param {type:"boolean"}
if running_local:
    running_colab = running_kaggle = False
else:
    running_colab = False  # @param {type:"boolean"}
    running_kaggle = True  # @param {type:"boolean"}

In [2]:
if running_colab:
    from google.colab import drive

    drive.mount("/content/drive")

## Loading the model

In [3]:
import logging
import numpy as np
import pandas as pd
import platform
import random
import torch
import torch.nn as nn

from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [4]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.FileHandler("report.log"), logging.StreamHandler()],
)

In [5]:
RANDOM_SEED = 103
MODEL_PATH = "neuralmind/bert-base-portuguese-cased"
TOKEN_MAX_LENGTH = 512

PREDICTIONS_PATH = "."

if running_local:
    GLASSDOOR_MODEL_PATH = "../train_model/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5.bin"
    OVERSAMPLED_GLASSDOOR_MODEL_PATH = "../train_model/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5.bin"

    if platform.system() == "Windows":
        MODEL_PATH = "C:\\bert-base-portuguese-cased"
    else:
        MODEL_PATH = "/home/stevillis/bert-base-portuguese-cased"

if running_colab:
    GLASSDOOR_MODEL_PATH = "/content/drive/MyDrive/UFMT/Gestão e Ciência de Dados/Disciplinas/14 - Seminário e Metodologia da Pesquisa/Projetos/glassdoor-reviews-analysis-nlp/train_model/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5.bin"
    OVERSAMPLED_GLASSDOOR_MODEL_PATH = "/content/drive/MyDrive/UFMT/Gestão e Ciência de Dados/Disciplinas/14 - Seminário e Metodologia da Pesquisa/Projetos/glassdoor-reviews-analysis-nlp/train_model/bertimbau-glassdoor-reviews-oversampled-epoch_5.bin"
    PREDICTIONS_PATH = "/content/drive/MyDrive/UFMT/Gestão e Ciência de Dados/Disciplinas/14 - Seminário e Metodologia da Pesquisa/Projetos/glassdoor-reviews-analysis-nlp/report"
if running_kaggle:
    GLASSDOOR_MODEL_PATH = "/kaggle/input/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5.bin/pytorch/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5/1/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5.bin"
    OVERSAMPLED_GLASSDOOR_MODEL_PATH = "/kaggle/input/bertimbau-glassdoor-reviews-oversampled-freezing/pytorch/default/1/bertimbau-glassdoor-reviews-oversampled-freezing-epoch_5.bin"

In [6]:
MODEL_PATH

'neuralmind/bert-base-portuguese-cased'

In [7]:
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x78f8361fed90>

In [8]:
random.seed(RANDOM_SEED)

In [9]:
np.random.seed(RANDOM_SEED)

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")

There are 2 GPU(s) available.
Device name: Tesla T4


In [11]:
if running_colab:
    dataset = pd.read_csv(
        "/content/drive/MyDrive/UFMT/Gestão e Ciência de Dados/Disciplinas/14 - Seminário e Metodologia da Pesquisa/Projetos/glassdoor-reviews-analysis-nlp/data_preparation/glassdoor_reviews_annotated.csv"
    )
else:
    if running_kaggle:
        dataset = pd.read_csv(
            "/kaggle/input/glassdoor-reviews-predicted/glassdoor_reviews_predicted.csv"
        )
    else:
        dataset = pd.read_csv("../data_preparation/glassdoor_reviews_annotated.csv")

In [12]:
dataset.shape

(2532, 10)

In [13]:
dataset["sentiment"].value_counts()

sentiment
 1    1269
-1    1021
 0     242
Name: count, dtype: int64

In [14]:
dataset.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated,predicted_sentiment
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",2023-12-15,5.0,1,0,1
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,2023-12-15,5.0,0,1,0


In [15]:
num_labels = len(dataset["sentiment"].value_counts())

In [16]:
num_labels

3

In [17]:
class_names = ["neutral", "positive", "negative"]

## Creating a PyTorch Model

In [18]:
class GlassdoorReviewsClassifier(nn.Module):
    def __init__(self, num_labels):
        super(GlassdoorReviewsClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(MODEL_PATH)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 300),
            nn.ReLU(),
            nn.Linear(300, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs["last_hidden_state"][:, 0, :]
        x = self.classifier(x)
        return x

In [19]:
model = GlassdoorReviewsClassifier(num_labels).to(device)
model.load_state_dict(torch.load(OVERSAMPLED_GLASSDOOR_MODEL_PATH, map_location=device))
model.eval()

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  model.load_state_dict(torch.load(OVERSAMPLED_GLASSDOOR_MODEL_PATH, map_location=device))


GlassdoorReviewsClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

## Prediction over annotated dataset

In [20]:
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [21]:
def convert_to_str(input_value):
    if isinstance(input_value, np.ndarray):
        input_str = " ".join(input_value)
    else:
        input_str = input_value

    return input_str

In [22]:
def predict_sentiment(texts):
    outputs = []
    for txt in texts:
        encoded_texts = tokenizer(
            convert_to_str(texts),
            max_length=TOKEN_MAX_LENGTH,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        input_ids = encoded_texts["input_ids"].to(device)
        attention_mask = encoded_texts["attention_mask"].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask)
            probabilities = torch.nn.functional.softmax(output, dim=1)
            outputs.append(probabilities.cpu().numpy())

    return np.concatenate(outputs, axis=0)

### Predict reviews sentiment

In [23]:
dataset["predicted_sentiment"] = pd.Series(dtype="int")

In [24]:
total_iterations = len(dataset)
total_iterations

2532

In [25]:
for index, row in tqdm(dataset.iterrows(), total=total_iterations, desc="Processing"):
    output_probabilities = predict_sentiment(row["review_text"])
    predicted_sentiment = np.argmax(output_probabilities)
    # predicted_sentiment_label = class_names[predicted_sentiment]
    dataset.loc[index, "predicted_sentiment"] = predicted_sentiment

    if index > 0 and index % 100 == 0:
        logging.info(f"Predicted rows: {index}/{len(dataset)}")
        logging.info(
            f"Review Text: {row['review_text']}; Predicted Sentiment: {predicted_sentiment}"
        )

Processing: 100%|██████████| 2532/2532 [2:17:01<00:00,  3.25s/it]  


In [26]:
dataset["predicted_sentiment"] = dataset["predicted_sentiment"].astype(int)

In [27]:
dataset.head(3)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated,predicted_sentiment
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",2023-12-15,5.0,1,0,1
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,2023-12-15,5.0,0,1,0
2,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Equipe bem prestativa e ótima de se trabalhar.,2023-03-11,4.0,1,0,1


In [28]:
dataset["predicted_sentiment"].value_counts()

predicted_sentiment
1    1274
2    1004
0     254
Name: count, dtype: int64

In [29]:
dataset.to_csv(
    f"{PREDICTIONS_PATH}/glassdoor_reviews_predicted.csv",
    index=False,
)

In [30]:
if running_kaggle:
    %cd /kaggle/working
    from IPython.display import FileLink
    FileLink('/kaggle/working/glassdoor_reviews_predicted.csv')

/kaggle/working
