# Merge Annotated Data

In [1]:
import logging
import pandas as pd

In [2]:
# Log to both a file and the notebook output. The file handler is opened as
# UTF-8 explicitly: the messages logged by this notebook contain accented
# Portuguese text, which the platform default encoding may not represent.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("data_merge.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)

In [3]:
# Load the extracted Glassdoor reviews from the working directory.
glassdoor_reviews_df = pd.read_csv(filepath_or_buffer="./glassdoor_reviews.csv")

In [4]:
# Preview the first five rows of the loaded reviews.
glassdoor_reviews_df.head(n=5)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",15 de dez. de 2023,5.0,1
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,15 de dez. de 2023,5.0,-1
2,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Equipe bem prestativa e ótima de se trabalhar.,11 de mar. de 2023,4.0,1
3,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Modo home office ainda tem que ser melhorado.,11 de mar. de 2023,4.0,-1
4,60212043,Tecnomapas,Funcionário confidencial,"Ex-funcionário(a), menos de um ano",Única vantagem era o trabalho ser home office,24 de fev. de 2022,1.0,1


In [5]:
# Flag every review as not annotated (0) before any merge is applied.
glassdoor_reviews_df = glassdoor_reviews_df.assign(annotated=0)

In [6]:
# Preview the first five rows again, now including the `annotated` column.
glassdoor_reviews_df.head(n=5)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",15 de dez. de 2023,5.0,1,0
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,15 de dez. de 2023,5.0,-1,0
2,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Equipe bem prestativa e ótima de se trabalhar.,11 de mar. de 2023,4.0,1,0
3,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Modo home office ainda tem que ser melhorado.,11 de mar. de 2023,4.0,-1,0
4,60212043,Tecnomapas,Funcionário confidencial,"Ex-funcionário(a), menos de um ano",Única vantagem era o trabalho ser home office,24 de fev. de 2022,1.0,1,0


In [7]:
def merge_annotated_reviews(annotated_df, reviews_df=None):
    """Apply manually annotated sentiments onto the reviews frame, in place.

    For every row of ``annotated_df``, the rows of the target frame that have
    the same ``review_id`` and ``review_text`` but a different ``sentiment``
    receive the annotated sentiment and get ``annotated`` set to 1. Rows whose
    sentiment already equals the annotation are not touched, so their
    ``annotated`` value is left as it was. Each correction is logged at INFO
    level before it is applied.

    Args:
        annotated_df: DataFrame providing the ``review_id``, ``review_text``
            and ``sentiment`` columns.
        reviews_df: DataFrame to update in place; it is read through its
            ``review_id``, ``review_text`` and ``sentiment`` columns and
            written through ``sentiment`` and ``annotated``. Defaults to the
            notebook-level ``glassdoor_reviews_df``.

    Returns:
        None. The target frame is mutated.
    """
    if reviews_df is None:
        # Backward-compatible default: operate on the notebook's global frame.
        reviews_df = glassdoor_reviews_df

    annotated_columns = zip(
        annotated_df["review_id"],
        annotated_df["review_text"],
        annotated_df["sentiment"],
    )
    for annotated_review_id, annotated_review_text, annotated_sentiment in annotated_columns:
        # Build the filter once per annotated row and reuse it for the check,
        # the log message and both assignments.
        mismatch_mask = (
            (reviews_df["review_id"] == annotated_review_id)
            & (reviews_df["review_text"] == annotated_review_text)
            & (reviews_df["sentiment"] != annotated_sentiment)
        )
        if not mismatch_mask.any():
            continue

        # Only the first matching row is reported in the log, even though
        # every matching row is updated below.
        logging.info(
            f"sentiment anotado: {annotated_sentiment}; "
            f"sentiment extraído: {reviews_df.loc[mismatch_mask, 'sentiment'].iloc[0]}; "
            f"review_id: {reviews_df.loc[mismatch_mask, 'review_id'].iloc[0]}; "
            f"review_text: >>>{reviews_df.loc[mismatch_mask, 'review_text'].iloc[0]}<<< "
        )

        reviews_df.loc[mismatch_mask, "sentiment"] = annotated_sentiment
        reviews_df.loc[mismatch_mask, "annotated"] = 1

### Merge annotated positive reviews

In [8]:
# Show every row belonging to review 68952889.
glassdoor_reviews_df.loc[glassdoor_reviews_df["review_id"].eq(68952889)]

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated
90,68952889,Abaco Tecnologia de Informação Ltda,Analista De Sistemas,Funcionário(a) atual,Pensando alguma coisa boa sobre prós.,12 de set. de 2022,4.0,1,0
91,68952889,Abaco Tecnologia de Informação Ltda,Analista De Sistemas,Funcionário(a) atual,Sem comentários sobre coisas contra.,12 de set. de 2022,4.0,-1,0


In [9]:
# Load the reviewed annotations stored in reviewed_predicted_non_positive_reviews.csv.
reviewed_predicted_non_positive_reviews_df = pd.read_csv(
    filepath_or_buffer="./reviewed_predicted_non_positive_reviews.csv",
)

In [None]:
# Merge the non-positive annotations into the reviews frame.
merge_annotated_reviews(annotated_df=reviewed_predicted_non_positive_reviews_df)

In [11]:
# Shape of the subset of reviews currently flagged as annotated.
glassdoor_reviews_df.loc[glassdoor_reviews_df["annotated"].eq(1)].shape

(48, 9)

In [12]:
# Sentiment distribution among the reviews currently flagged as annotated.
glassdoor_reviews_df.loc[glassdoor_reviews_df["annotated"].eq(1), "sentiment"].value_counts()

sentiment
 0    33
-1    15
Name: count, dtype: int64

### Merge annotated negative reviews

In [13]:
# Load the reviewed annotations stored in reviewed_predicted_non_negative_reviews.csv.
reviewed_predicted_non_negative_reviews_df = pd.read_csv(
    filepath_or_buffer="./reviewed_predicted_non_negative_reviews.csv",
)

In [None]:
# Merge the non-negative annotations into the reviews frame.
merge_annotated_reviews(annotated_df=reviewed_predicted_non_negative_reviews_df)

In [15]:
# Shape of the subset of reviews flagged as annotated at this point.
glassdoor_reviews_df.loc[glassdoor_reviews_df["annotated"].eq(1)].shape

(313, 9)

In [16]:
# Sentiment distribution among the reviews flagged as annotated at this point.
glassdoor_reviews_df.loc[glassdoor_reviews_df["annotated"].eq(1), "sentiment"].value_counts()

sentiment
 0    247
 1     49
-1     17
Name: count, dtype: int64

In [17]:
# Write the reviews frame to CSV, leaving the index out of the file.
glassdoor_reviews_df.to_csv(
    path_or_buf="glassdoor_reviews_annotated.csv",
    index=False,
)

### Annotation Evaluation

#### Non Positive predicted:  
  - 1283 positive reviews extracted
  - 650 reviews predicted as non-positive

  Annotation results for the reviews predicted as non-positive:
  | Negative | Neutral | Positive |
  |----------|---------|----------|
  |    15    |   33    |          |

#### Non Negative predicted:  
  - 1283 negative reviews extracted
  - 821 reviews predicted as non-negative

  Annotation results for the reviews predicted as non-negative:
  | Negative | Neutral | Positive |
  |----------|---------|----------|
  |    2     |   214   |    49    |