## Employee Reviews da Microsoft 
### Objetivo:
Desenvolver um sistema que analisa avaliações de funcionários da Microsoft, processa os textos e classifica automaticamente cada review como positivo ou negativo, com base no conteúdo textual.

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

### Coleta e Organização dos Dados

In [11]:
#Importando e filtrando pra puxar somente os dados da Microsoft

import pandas as pd

df = pd.read_csv("employee_reviews.csv")

# Filtrando as avaliações da Microsoft
df = df[df['company'].str.lower() == 'microsoft'].reset_index(drop=True)

df.columns


Index(['Unnamed: 0', 'company', 'location', 'dates', 'job-title', 'summary',
       'pros', 'cons', 'advice-to-mgmt', 'overall-ratings',
       'work-balance-stars', 'culture-values-stars',
       'carrer-opportunities-stars', 'comp-benefit-stars',
       'senior-mangemnet-stars', 'helpful-count', 'link'],
      dtype='object')

### Pré-processamento de Texto

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# 1. Criar coluna 'text'
df['text'] = df['pros'].fillna('') + ' ' + df['cons'].fillna('')

# 2. Verificar se 'text' foi criada corretamente
print(df[['pros', 'cons', 'text']].head())

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(preprocess)


                                                pros  \
0  Culture, role impact, mission driven, collabor...   
1  1. If you love tech, this is a great place. No...   
2                     Great company and Great people   
3  Benefits, work-life balance, tons of internal ...   
4  Smart people, work life balance, growth mindse...   

                                                cons  \
0          Volume of work is sometimes unmanageable,   
1  Brand on Your Resume: After many years of losi...   
2                         I see no cons at this time   
3                       Can't think of any right now   
4                 Can be hard to transfer internally   

                                                text  
0  Culture, role impact, mission driven, collabor...  
1  1. If you love tech, this is a great place. No...  
2  Great company and Great people I see no cons a...  
3  Benefits, work-life balance, tons of internal ...  
4  Smart people, work life balance, growth mindse..

### Criação dos Rótulos (Positivo/Negativo)

In [14]:
# Rótulo binário: 1 = positivo (rating >= 4), 0 = negativo (rating <= 2)
df = df[df['overall-ratings'].isin([1.0, 2.0, 4.0, 5.0])]
df['label'] = df['overall-ratings'].apply(lambda x: 1 if x >= 4 else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['overall-ratings'].apply(lambda x: 1 if x >= 4 else 0)


### Extração de Features

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [16]:
def predict_sentiment(text, threshold=0.6):
    processed = preprocess(text)
    vectorized = vectorizer.transform([processed])
    prob = model.predict_proba(vectorized)[0][1]
    return int(prob >= threshold), prob

# Exemplo de teste
text = "I loved the leadership and career opportunities."
label, prob = predict_sentiment(text)
print(f"Label: {label}, Confidence: {prob:.2f}")


Label: 1, Confidence: 0.83


### Modelagem e Classificação

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Criação e treinamento do modelo
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predição e avaliação
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2417
           1       0.89      0.84      0.86      2449

    accuracy                           0.87      4866
   macro avg       0.87      0.87      0.87      4866
weighted avg       0.87      0.87      0.87      4866



### Simular Novos Reviews

In [29]:
def predict_review(text, threshold=0.5):
    clean = preprocess(text)
    print(f"Texto limpo: {clean}")

    vec = vectorizer.transform([clean])
    print(f"Vetorizado: {vec.toarray()}")

    prob = model.predict_proba(vec)[0][1]  # Probabilidade de ser classe 1
    print(f"Probabilidade da classe positiva: {prob:.4f}")

    prediction = 1 if prob >= threshold else 0
    print(f"Predição (com threshold={threshold}): {prediction}")

    return "Positivo" if prediction == 1 else "Negativo"

# Exemplo de uso
print(predict_review("The team is very collaborative, and the work-life balance is great."))
print()
print(predict_review("It is a bad place to work"))


Texto limpo: team collaborative worklife balance great
Vetorizado: [[0. 0. 0. ... 0. 0. 0.]]
Probabilidade da classe positiva: 0.9723
Predição (com threshold=0.5): 1
Positivo

Texto limpo: bad place work
Vetorizado: [[0. 0. 0. ... 0. 0. 0.]]
Probabilidade da classe positiva: 0.4407
Predição (com threshold=0.5): 0
Negativo


In [30]:
#Salvando os documentos

import joblib

# Supondo que model e vectorizer (tfidf) já estejam treinados
joblib.dump(model, "modelo_sentimento.pkl")
joblib.dump(vectorizer, "vetorizador_tfidf.pkl")


['vetorizador_tfidf.pkl']

### Visualização e Interface

In [31]:
import gradio as gr
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# Certifique-se de ter feito isso antes (no seu outro arquivo):
# joblib.dump(model, "modelo_sentimento.pkl")
# joblib.dump(tfidf, "vetorizador_tfidf.pkl")

# Carregando o modelo e o vetor TF-IDF
model = joblib.load("modelo_sentimento.pkl")
tfidf = joblib.load("vetorizador_tfidf.pkl")

# Baixar recursos necessários
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Função de pré-processamento
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Função para a interface
def classificar_review(review_text):
    texto_limpo = preprocess(review_text)
    vetor = tfidf.transform([texto_limpo])
    pred = model.predict(vetor)[0]
    prob = model.predict_proba(vetor).max()
    return f"Sentimento: {'Positivo' if pred == 1 else 'Negativo'} (confiança: {prob:.2f})"

# Interface com Gradio
demo = gr.Interface(
    fn=classificar_review,
    inputs=gr.Textbox(lines=5, label="Digite um review da Microsoft"),
    outputs=gr.Textbox(label="Resultado da Classificação"),
    title="Classificador de Reviews - Microsoft",
    description="Este modelo classifica automaticamente se um review é positivo ou negativo com base em PLN."
)

# Executa o app
if __name__ == "__main__":
    demo.launch()


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rrs4_cesar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rrs4_cesar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rrs4_cesar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
