In [1]:
# https://linogaliana-teaching.netlify.app/lda/
# import numpy as np
import re
from collections import Counter

import pandas as pd
import spacy
from nltk import ngrams
from nltk.stem.snowball import SnowballStemmer
# from spacy.lang.fr import French
from spacy.lang.fr.stop_words import STOP_WORDS
from unidecode import unidecode

import position_mapping
import sentiment_classification

# Small French spaCy pipeline, used for tokenisation, POS tagging and lemmas.
nlp = spacy.load("fr_core_news_sm")

# French Snowball stemmer, used by text_processing.
stemmer = SnowballStemmer(language="french")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Read the reviews workbook; later cells use its "pros" and "cons" columns.
REVIEWS_PATH = "data/reviews_data.xlsx"
df_reviews = pd.read_excel(REVIEWS_PATH)


In [3]:
def text_processing(text):
    """Reduce a French text to a space-separated string of content-word stems.

    The text is run through the module-level spaCy pipeline ``nlp``. A token is
    kept when it is not a French stop word and its part-of-speech tag is one of
    NOUN, ADJ, ADV or VERB; each kept token is then stemmed with the
    module-level Snowball ``stemmer``.

    Args:
        text: Raw text to process.

    Returns:
        The stems of the retained tokens joined by single spaces (an empty
        string when no token is retained).
    """
    kept_pos = {"NOUN", "ADJ", "ADV", "VERB"}
    doc = nlp(text)
    stems = [
        stemmer.stem(tok.text)
        for tok in doc
        # spaCy's stop-word list is lowercase, so compare the lowercased token;
        # otherwise capitalised stop words (e.g. sentence-initial) slip through.
        if tok.text.lower() not in STOP_WORDS and tok.pos_ in kept_pos
    ]
    return " ".join(stems)

In [9]:
# Widen pandas' display limits so long review texts and long tables are shown.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 200)

# Preview the first entries of the "cons" column.
df_reviews["cons"].head()

0                            - rien de spécifique (comme n'importe quel cabinet de conseil)
1    Organisation hiérarchique - orienté résultat au détriment de l’évolution / progression
2                                                  Peu d'intéressement et de participation.
3                            Travail prenant, attention à bien respecter la vie perso / pro
4         La charge de travail peut augmenter lorsqu'on nous demande des activités internes
Name: cons, dtype: object

In [10]:
## all pros
# Split every review into fragments on newlines / commas, then tidy each one
# (drop "- " bullet markers, turn "/" into a space, trim whitespace).
# Missing reviews (NaN) are skipped so re.split never receives a non-string.
all_pros = [
    fragment.replace("- ", "").replace("/", " ").strip()
    for review in df_reviews["pros"].dropna().astype(str)
    for fragment in re.split(r"\n|\r|,", review)
]

# Keep only the lowercased lemmas of nouns, adjectives, adverbs and proper
# nouns, then transliterate the result to ASCII (removes accents).
KEPT_POS = {"NOUN", "ADJ", "ADV", "PROPN"}
all_pros_processed = [
    unidecode(" ".join(tok.lemma_.lower() for tok in nlp(fragment) if tok.pos_ in KEPT_POS))
    for fragment in all_pros
]

# Turn each processed fragment into n-grams: trigrams when it has at least
# three words, otherwise the fragment itself (as a bigram or a single word).
MAX_NGRAM_SIZE = 3
all_pros_processed_with_ngrams = []
for phrase in all_pros_processed:
    if phrase == "":
        # nothing survived the POS filter for this fragment
        continue
    words = phrase.split(" ")
    ngram_size = min(len(words), MAX_NGRAM_SIZE)
    # extend() appends in place; the previous `list = list + [...]` copied the
    # whole accumulated list on every iteration.
    all_pros_processed_with_ngrams.extend(" ".join(gram) for gram in ngrams(words, ngram_size))

# show the 20 most frequent n-grams
Counter(all_pros_processed_with_ngrams).most_common(20)

[('mission interessant', 9),
 ('bon ambiance', 8),
 ('salaire', 6),
 ('autonomie', 6),
 ('diversite mission', 5),
 ('management proximite', 4),
 ('ambiance', 4),
 ('tres bon ambiance', 4),
 ('mission', 4),
 ('equilibre vie pro', 3),
 ('proximite management', 3),
 ('afterwork', 3),
 ('cabinet fort croissance', 3),
 ('cabinet taille humain', 2),
 ('taille humain', 2),
 ('dynamisme', 2),
 ('proximite management direction', 2),
 ('apprentissage rapide', 2),
 ('sein equipe mission', 2),
 ('entreprise taille humain', 2)]

In [11]:
## all cons
# Split every review into fragments on newlines / commas, then tidy each one
# (drop "- " bullet markers, turn "/" into a space, trim whitespace).
# Missing reviews (NaN) are skipped so re.split never receives a non-string.
all_cons = [
    fragment.replace("- ", "").replace("/", " ").strip()
    for review in df_reviews["cons"].dropna().astype(str)
    for fragment in re.split(r"\n|\r|,", review)
]

# Keep only the lowercased lemmas of nouns, adjectives, adverbs and proper
# nouns, then transliterate the result to ASCII (removes accents).
KEPT_POS = {"NOUN", "ADJ", "ADV", "PROPN"}
all_cons_processed = [
    unidecode(" ".join(tok.lemma_.lower() for tok in nlp(fragment) if tok.pos_ in KEPT_POS))
    for fragment in all_cons
]

# Turn each processed fragment into n-grams: trigrams when it has at least
# three words, otherwise the fragment itself (as a bigram or a single word).
# The former separate ">= 4 words" branch also produced trigrams, so it is
# covered by the same rule.
MAX_NGRAM_SIZE = 3
all_cons_processed_with_ngrams = []
for phrase in all_cons_processed:
    if phrase == "":
        # nothing survived the POS filter for this fragment
        continue
    words = phrase.split(" ")
    ngram_size = min(len(words), MAX_NGRAM_SIZE)
    # extend() appends in place; the previous `list = list + [...]` copied the
    # whole accumulated list on every iteration.
    all_cons_processed_with_ngrams.extend(" ".join(gram) for gram in ngrams(words, ngram_size))

# show the 20 most frequent n-grams
Counter(all_cons_processed_with_ngrams).most_common(20)

[('management', 5),
 ('equilibre vie pro', 3),
 ('vie pro vie', 3),
 ('type mission', 3),
 ('equilibre vie professionnel', 3),
 ('mission ne pas', 2),
 ('ne pas tout', 2),
 ('travail', 2),
 ('vie professionnel vie', 2),
 ('pas', 2),
 ('directeur manager', 2),
 ("specifique n' cabinet", 1),
 ("n' cabinet conseil", 1),
 ('organisation hierarchique resultat', 1),
 ('hierarchique resultat detriment', 1),
 ('resultat detriment evolution', 1),
 ('detriment evolution progression', 1),
 ('peu interessement participation', 1),
 ('travail prenant', 1),
 ('attention bien vie', 1)]

In [12]:
# Frequency tables: one row per distinct n-gram, most frequent first.
pros_counts = Counter(all_pros_processed_with_ngrams)
cons_counts = Counter(all_cons_processed_with_ngrams)

pros_df = pd.DataFrame(pros_counts.most_common(), columns=["pros", "number"])
cons_df = pd.DataFrame(cons_counts.most_common(), columns=["cons", "number"])

In [18]:
# save
# The raw n-gram lists (one row per occurrence) are exported here, not the
# aggregated count tables pros_df / cons_df.
pros_export = pd.DataFrame(all_pros_processed_with_ngrams, columns=["pros"])
cons_export = pd.DataFrame(all_cons_processed_with_ngrams, columns=["cons"])

pros_export.to_excel("data/pros_df.xlsx", index=False)
cons_export.to_excel("data/cons_df.xlsx", index=False)