# Analyse des sentiements liées aux entitées
On souhaite ici compter le nombre de messages décrit comme "positif" et "négatif" comptenant une entitée


In [1]:
import tensorflow
import pandas as pd
import numpy as np
import json
from transformers import TFCamembertForSequenceClassification
import transformers.models.camembert.tokenization_camembert as tk

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


### Chargement des messages et des entitées

In [84]:
with open('./messages/20-04-2021_02-04-26__8225.json', encoding="utf8") as f:
  messages = json.load(f)

with open('./entities/entities_4.json', encoding="utf8") as f:
  entities = json.load(f)

### Récuparation de l'encodeur pour notre modèle

In [85]:
tokenizer = tk.CamembertTokenizer.from_pretrained("jplu/tf-camembert-base",do_lower_case=True)
assert tokenizer != None

def encode_msg(messages, tokenizer = tokenizer, max_length=80):
    token_ids = np.zeros(shape=(len(messages), max_length),
                         dtype=np.int32)
    for i, msg in enumerate(messages):
        encoded = tokenizer.encode(msg, max_length=max_length)
        token_ids[i, 0:len(encoded)] = encoded
    attention_mask = (token_ids != 0).astype(np.int32)
    return {"input_ids": token_ids, "attention_mask": attention_mask}

### Chargement de notre modèle fine-tuned

In [86]:
model = TFCamembertForSequenceClassification.from_pretrained("jplu/tf-camembert-base")
model.load_weights("./models_weights/f193_count8000_epo7_batch4_allo.h5")

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Pré-traitement de notre jeu de donnée
On écarte les messages comptenant des liens, des gif et des enregistrement vocaux. On ne garde pas non plus les messages de moins de 2 caractères ainsi que les messages à plus de 120 caractères car le modèle a été entrainé sur des messages courts.

On format aussi les données en ne gardant que le contenu et l'auteur du messages

In [87]:
# entrée (messages) { who:.., what:.., when:.., feedback:.., whatType:..},{..},{..},..
# sortie (messages_keep) [[what, who],[],..]

def estMsgValide(message):
    return (len(message) > 2
    and len(message) < 120 
    and "http" not in message 
    and "GIPHY" not in message 
    and "Tenor GIF Keyboard" not in message
    and "Écouter 0 :00 / 0 :00" not in message)

messages_keep = [[m['what'],m['who']] for m in messages if estMsgValide(m['what'])]

df = pd.DataFrame(messages_keep,columns=['messages','auteur'])
print(f"{len(df)} messages gardés")

6405 messages gardés


In [None]:
### Prédiction des sentiements avec notre modèle

In [88]:
# sortie [[what,who,sentiment],..]

messages_array = df.iloc[:,0].values
encoded_messages = encode_msg(messages_array)

scores = model.predict(encoded_messages)
sent_pred = np.argmax(scores['logits'], axis=1)
df['sentiment'] = sent_pred

df.head()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1    4511
0    1894
Name: sentiment, dtype: int64

In [107]:
mneg = df.iloc[:,2].value_counts(0)[0]
mpos = df.iloc[:,2].value_counts(0)[1]
tot = mpos+mneg
print(f"{round(100 * mpos/tot,1)}% messages positifs ({mpos})")
print(f"{round(100 * mneg/tot,1)}% messages négatifs ({mneg})")

70.4% messages positifs (4511)
29.6% messages négatifs (1894)


### Coloration des entitées
Association pour chaque entitées(groupe de mot) du nombre de messagse positif et négatif comptenant un des mots liées à l'entitée

In [90]:
# sortie [{entities: [n_positif,n_negatif,diff] }]

def containOneOf(message,elements):
    for e in elements:
        if e in message.lower():
            return True
    return False

entities_with_sentiment = {}

n_sans_entity = 0
for row in df.to_numpy():
    sans_entity = True
    for ent in entities:
        if containOneOf(row[0],entities[ent]):
            arr = np.array([0,0,0])
            arr[row[2]] = 1
            if ent not in entities_with_sentiment:
                entities_with_sentiment[ent] = arr.tolist()
            else:
                entities_with_sentiment[ent] = (np.array(arr)+np.array(entities_with_sentiment[ent])).tolist()
            sans_entity = False
    if sans_entity:
        n_sans_entity += 1

print(f"Nombre de message contenant une entitée: {len(messages_keep)-n_sans_entity}")

# ajout de la différence sentiement pos - neg
for ent in entities_with_sentiment:
    entities_with_sentiment[ent][2] = entities_with_sentiment[ent][1] - entities_with_sentiment[ent][0] 

# sauvegarde de l'analyse
with open('analyse.json', 'w', encoding="utf8") as fout:
    json.dump(entities_with_sentiment, fout, ensure_ascii=False)

Nombre de message contenant des entitées: 1656


### Convertion du dict en DataFrame pour visualiser le résultat
Le résultat est visualisable dans df_senti

In [101]:
# conversion dictionnaire en list
entities_with_sentiment_list = []
for key in entities_with_sentiment:
    temp = [key,entities_with_sentiment[key][0],entities_with_sentiment[key][1],entities_with_sentiment[key][2]]
    entities_with_sentiment_list.append(temp)

# conversion list en dataframe + trie
df_senti = pd.DataFrame(entities_with_sentiment_list,columns=["entity","negatif","positif","diff"])

df_senti.sort_values(by=["diff"],ascending=False)
index = df_senti[df_senti["positif"]+df_senti["negatif"] <= 2].index
df_senti = df_senti.drop(index)
print(f"{len(df_senti)} entitées apparaissent au moins 3 fois dans le corpus")
df_senti

117


Unnamed: 0,entity,negatif,positif,diff
0,projet,8,15,7
1,google,7,22,15
2,albane,4,20,16
3,alban,6,24,18
4,virus,6,12,6
...,...,...,...,...
152,ricardo,1,3,2
153,airbnb,3,5,2
165,lespinet,1,3,2
168,atelier,0,4,4


In [61]:
def trie_fusion(L):
    if len(L) <= 1:
        return L
    else:
        n = int(len(L)/2)
        return fusion(trie_fusion(L[:n]),trie_fusion(L[n:]))

def fusion(A,B):
    if len(A) == 0:
        return B
    elif len(B) == 0:
        return A
    elif A[0] <= B[0]:
        return [A[0]] + fusion(A[1:],B)
    else:
        return [B[0]] + fusion(A,B[1:])

trie_fusion(L)


[0, 1, 1, 2, 4, 4, 7, 9]