# Clean duplicated rows of Glassdoor reviews from Tecnomapas, which were caused by duplicated extracted .html files.

In [1]:
import pandas as pd

from datetime import datetime

In [2]:
# Portuguese month abbreviations, keyed without the trailing period so that
# "dez." and "dez" resolve to the same month number.
_PT_BR_MONTH_NUMBERS = {
    "jan": 1,
    "fev": 2,
    "mar": 3,
    "abr": 4,
    "mai": 5,
    "jun": 6,
    "jul": 7,
    "ago": 8,
    "set": 9,
    "out": 10,
    "nov": 11,
    "dez": 12,
}


def date_str_to_date(date_str):
    """Convert a pt-BR review date string into a ``datetime``.

    The expected input has five whitespace-separated tokens, e.g.
    ``"15 de dez. de 2023"``: day, "de", abbreviated month, "de", year.
    The month token is matched case-insensitively and its trailing period
    is optional.

    Strings containing a hyphen (e.g. ``"2023-12-15"``) are treated as
    already converted and parsed with ``datetime.fromisoformat``, so the
    function can be applied again to a CSV that an earlier run overwrote.

    Args:
        date_str: The date text to convert.

    Returns:
        A ``datetime`` at midnight for the "D de mmm. de YYYY" form, or the
        value parsed by ``datetime.fromisoformat`` for ISO input.

    Raises:
        IndexError: If the text has fewer than five tokens.
        KeyError: If the month token is not a known abbreviation.
        ValueError: If the day or year is not an integer, the date is out of
            range, or a hyphenated string is not valid ISO format.
    """
    text = date_str.strip()

    # The "D de mmm. de YYYY" form has no hyphen, so a hyphen marks a value
    # that was already written out as an ISO date.
    if "-" in text:
        return datetime.fromisoformat(text)

    # Token layout: [day, "de", month, "de", year].
    tokens = text.split()
    day = int(tokens[0])
    month_key = tokens[2].lower().rstrip(".")
    year = int(tokens[4])

    return datetime(year, _PT_BR_MONTH_NUMBERS[month_key], day)

## Convert glassdoor_reviews_predicted review_date

In [9]:
# Load the predicted reviews; review_date is still free text at this point.
glassdoor_reviews_predicted_df = pd.read_csv(
    filepath_or_buffer="./glassdoor_reviews_predicted.csv"
)

In [10]:
# Preview two rows; review_date is still raw text such as "15 de dez. de 2023".
glassdoor_reviews_predicted_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated,predicted_sentiment
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",15 de dez. de 2023,5.0,1,0,1
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,15 de dez. de 2023,5.0,0,1,0


In [11]:
# Record the shape before converting, to compare with the shape afterwards.
glassdoor_reviews_predicted_df.shape

(2532, 10)

In [12]:
# Parse every review_date string and replace the column with the result.
glassdoor_reviews_predicted_df["review_date"] = (
    glassdoor_reviews_predicted_df["review_date"]
    .apply(date_str_to_date)
)

In [13]:
# Shape after the conversion; it should match the shape recorded before it.
glassdoor_reviews_predicted_df.shape

(2532, 10)

In [14]:
# Preview two rows again to confirm review_date now displays as a date.
glassdoor_reviews_predicted_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated,predicted_sentiment
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",2023-12-15,5.0,1,0,1
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,2023-12-15,5.0,0,1,0


In [15]:
# Write the frame back over the CSV it was loaded from; index=False keeps the
# row index out of the file.
glassdoor_reviews_predicted_df.to_csv(
    "./glassdoor_reviews_predicted.csv",
    index=False,
)

## Convert glassdoor_reviews_annotated review_date

In [16]:
# Load the annotated reviews from the data_preparation folder.
glassdoor_reviews_annotated_df = pd.read_csv(
    filepath_or_buffer="../data_preparation/glassdoor_reviews_annotated.csv"
)

In [17]:
# Record the shape before converting, to compare with the shape afterwards.
glassdoor_reviews_annotated_df.shape

(2532, 9)

In [18]:
# Parse every review_date string and replace the column with the result.
glassdoor_reviews_annotated_df["review_date"] = (
    glassdoor_reviews_annotated_df["review_date"]
    .apply(date_str_to_date)
)

In [19]:
# Shape after the conversion; it should match the shape recorded before it.
glassdoor_reviews_annotated_df.shape

(2532, 9)

In [20]:
# Preview two rows to confirm review_date now displays as a date.
glassdoor_reviews_annotated_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,annotated
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",2023-12-15,5.0,1,0
1,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,2023-12-15,5.0,0,1


In [21]:
# Write the frame back over the CSV it was loaded from; index=False keeps the
# row index out of the file.
glassdoor_reviews_annotated_df.to_csv(
    "../data_preparation/glassdoor_reviews_annotated.csv",
    index=False,
)

## Convert predicted_non_negative_reviews review_date

In [22]:
# Load the predicted non-negative reviews from the data_preparation folder.
predicted_non_negative_reviews_df = pd.read_csv(
    filepath_or_buffer="../data_preparation/predicted_non_negative_reviews.csv"
)

In [23]:
# Record the shape before converting, to compare with the shape afterwards.
predicted_non_negative_reviews_df.shape

(809, 10)

In [24]:
# Parse every review_date string and replace the column with the result.
predicted_non_negative_reviews_df["review_date"] = (
    predicted_non_negative_reviews_df["review_date"]
    .apply(date_str_to_date)
)

In [25]:
# Shape after the conversion; it should match the shape recorded before it.
predicted_non_negative_reviews_df.shape

(809, 10)

In [26]:
# Preview two rows to confirm review_date now displays as a date.
predicted_non_negative_reviews_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,predicted_label,predicted_score
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,2023-12-15,5.0,-1,Neutral,0.879178
1,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Modo home office ainda tem que ser melhorado.,2023-03-11,4.0,-1,Neutral,0.932563


In [27]:
# Write the frame back over the CSV it was loaded from; index=False keeps the
# row index out of the file.
predicted_non_negative_reviews_df.to_csv(
    "../data_preparation/predicted_non_negative_reviews.csv",
    index=False,
)

## Convert predicted_non_positive_reviews review_date

In [28]:
# Load the predicted non-positive reviews from the data_preparation folder.
predicted_non_positive_reviews_df = pd.read_csv(
    filepath_or_buffer="../data_preparation/predicted_non_positive_reviews.csv"
)

In [29]:
# Record the shape before converting, to compare with the shape afterwards.
predicted_non_positive_reviews_df.shape

(642, 10)

In [30]:
# Parse every review_date string and replace the column with the result.
predicted_non_positive_reviews_df["review_date"] = (
    predicted_non_positive_reviews_df["review_date"]
    .apply(date_str_to_date)
)

In [31]:
# Shape after the conversion; it should match the shape recorded before it.
predicted_non_positive_reviews_df.shape

(642, 10)

In [32]:
# Preview two rows to confirm review_date now displays as a date.
predicted_non_positive_reviews_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,predicted_label,predicted_score
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",2023-12-15,5.0,1,Neutral,0.814863
1,60212043,Tecnomapas,Funcionário confidencial,"Ex-funcionário(a), menos de um ano",Única vantagem era o trabalho ser home office,2022-02-24,1.0,1,Neutral,0.980318


In [33]:
# Write the frame back over the CSV it was loaded from; index=False keeps the
# row index out of the file.
predicted_non_positive_reviews_df.to_csv(
    "../data_preparation/predicted_non_positive_reviews.csv",
    index=False,
)

## Convert reviewed_predicted_non_negative_reviews review_date

In [34]:
# Load the reviewed non-negative predictions from the data_preparation folder.
reviewed_predicted_non_negative_reviews_df = pd.read_csv(
    filepath_or_buffer="../data_preparation/reviewed_predicted_non_negative_reviews.csv"
)

In [27]:
# Record the shape before converting, to compare with the shape afterwards.
# NOTE(review): the saved output of this cell (821 rows, execution count 27)
# is older than the load above and disagrees with the 809 rows shown after
# the conversion; re-run this cell to refresh it.
reviewed_predicted_non_negative_reviews_df.shape

(821, 11)

In [35]:
# Parse every review_date string and replace the column with the result.
reviewed_predicted_non_negative_reviews_df["review_date"] = (
    reviewed_predicted_non_negative_reviews_df["review_date"]
    .apply(date_str_to_date)
)

In [36]:
# Shape after the conversion; the element-wise apply does not add or drop rows.
reviewed_predicted_non_negative_reviews_df.shape

(809, 11)

In [37]:
# Preview two rows to confirm review_date now displays as a date.
reviewed_predicted_non_negative_reviews_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,predicted_label,predicted_score,reviewed
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano",Não tive nenhum ponto negativo,2023-12-15,5.0,0,Neutral,0.879178,1.0
1,74420027,Tecnomapas,Analista Desenvolvedor,Ex-freelancer,Modo home office ainda tem que ser melhorado.,2023-03-11,4.0,-1,Neutral,0.932563,


In [38]:
# Write the frame back over the CSV it was loaded from; index=False keeps the
# row index out of the file.
reviewed_predicted_non_negative_reviews_df.to_csv(
    "../data_preparation/reviewed_predicted_non_negative_reviews.csv",
    index=False,
)

## Convert reviewed_predicted_non_positive_reviews review_date

In [39]:
# Load the reviewed non-positive predictions from the data_preparation folder.
reviewed_predicted_non_positive_reviews_df = pd.read_csv(
    filepath_or_buffer="../data_preparation/reviewed_predicted_non_positive_reviews.csv"
)

In [40]:
# Record the shape before converting, to compare with the shape afterwards.
reviewed_predicted_non_positive_reviews_df.shape

(642, 11)

In [41]:
# Parse every review_date string and replace the column with the result.
reviewed_predicted_non_positive_reviews_df["review_date"] = (
    reviewed_predicted_non_positive_reviews_df["review_date"]
    .apply(date_str_to_date)
)

In [42]:
# Shape after the conversion; it should match the shape recorded before it.
reviewed_predicted_non_positive_reviews_df.shape

(642, 11)

In [43]:
# Preview two rows to confirm review_date now displays as a date.
reviewed_predicted_non_positive_reviews_df.head(2)

Unnamed: 0,review_id,company,employee_role,employee_detail,review_text,review_date,star_rating,sentiment,predicted_label,predicted_score,reviewed
0,82630669,Tecnomapas,Recepcionista,"Ex-funcionário(a), mais de um ano","Companheirismo entre os colegas, oportunidade ...",2023-12-15,5.0,1,Neutral,0.814863,
1,60212043,Tecnomapas,Funcionário confidencial,"Ex-funcionário(a), menos de um ano",Única vantagem era o trabalho ser home office,2022-02-24,1.0,1,Neutral,0.980318,


In [44]:
# Write the frame back over the CSV it was loaded from; index=False keeps the
# row index out of the file.
reviewed_predicted_non_positive_reviews_df.to_csv(
    "../data_preparation/reviewed_predicted_non_positive_reviews.csv",
    index=False,
)