# Sentiment basé sur les reviews

J'ai testé différent Dataset avec différents modèle de text mining pour prédire les sentiments à partir des reviews.
Une première approche en utilisant le Dataset twitter.
Une deuxième approche en utilisant nos propres jeux de données Trustpilot
Le modèle qui fonctionne le mieux pour le moment est le modèle bag CountVectorizer. Le modèle prédit assez bien pour les avis positifs et un peu moins bien pour les avis négatifs et neutre. 
Des solutions pour améliorer le modèle:
* Ajout de données pour l'entrainement
* Utiliser un modèle de text Mining Pré-entrainer sur un gros volume de données tel que Vader
* Utiliser d'autre Dataset ou d'autres modèles de text mining

# Import des librairies

In [2]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import PorterStemmer
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
from pprint import pprint
from datetime import datetime

if nltk.download('punkt') == True:
    pass
else:
    nltk.download('punkt')

if nltk.download('stopwords') == True:
    pass
else:
    nltk.download('stopwords')

if nltk.download('wordnet') == True:
    pass
else:
    nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Nettoyage et traitement jeu de données

In [3]:
tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]{4,}")


In [4]:
stop_words = set(stopwords.words('english'))
stop_words.update([".",",","?","@"])

def stop_words_filtering(l):
    for element in l:
        if element in stop_words:
            l.remove(element)
    return l

In [5]:
def lemmatisation(mots):
    wordnet_lemmatizer = WordNetLemmatizer()
    result = []
    for element in mots:
        radical = wordnet_lemmatizer.lemmatize(element, pos='v')
        if (radical not in result):
            result.append(radical)
    return result

In [6]:
def stemming(mots) :
    stemmer = PorterStemmer()
    sortie = []
    for string in mots :
        radical = stemmer.stem(string)
        if (radical not in sortie) : sortie.append(radical)
    return sortie

# Twitter Dataset

L'objectif est de créer un modèle capable de prédire si un texte est positif neutre ou négatif. Pour cela on va prendre un jeu de donnée contenant des reviews avec leur label. Après cet entrainement, on va tester le modèle avec nos propres données

In [7]:
path_twitter_data = f"{os.getcwd()}/train_data/Twitter_Data.csv"

In [8]:
data_twitter = pd.read_csv(path_twitter_data, encoding='ISO-8859-1')

In [9]:
data_twitter["category"] = data_twitter["category"].replace(to_replace = -1.0, value = "negatif")
data_twitter["category"] = data_twitter["category"].replace(to_replace = 1.0, value = "positif")
data_twitter["category"] = data_twitter["category"].replace(to_replace = 0.0, value = "neutre")

In [10]:
data_twitter = data_twitter.dropna()

In [11]:
data_twitter["category"].value_counts()

category
positif    72249
neutre     55211
negatif    35509
Name: count, dtype: int64

In [12]:
data_twitter_positif = data_twitter[data_twitter['category'] == 'positif'].sample(n=35509, random_state=42)
data_twitter_neutre = data_twitter[data_twitter['category'] == 'neutre'].sample(n=35509, random_state=42)
data_twitter_negatif = data_twitter[data_twitter['category'] == 'negatif']

In [13]:
data_twitter = pd.concat([data_twitter_positif, data_twitter_negatif,data_twitter_neutre])

In [14]:
data_twitter["category"].value_counts()

category
positif    35509
negatif    35509
neutre     35509
Name: count, dtype: int64

In [15]:
print(f"On a en tout {len(data_twitter)} lignes pour notre modèle")

On a en tout 106527 lignes pour notre modèle


# Algo BAG avec CountVectorizer

On va prendre importer nos données d'entrainements et on va ensuite diviser en jeu d'entrainement et de test

In [16]:
model_name = "modelCV_twitter.pkl"
if os.path.exists(f"{os.getcwd()}/model/{model_name}"):

    vectorizer, model_clf_CV_twitter = joblib.load(f"{os.getcwd()}/model/{model_name}")
    X_test, y_test = joblib.load(f"{os.getcwd()}/model/test_data_twitter_CV.pkl")

else:
    X = data_twitter["clean_text"]
    y = data_twitter["category"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 30)

    X_train = X_train.str.lower().apply(tokenizer.tokenize)
    X_test = X_test.str.lower().apply(tokenizer.tokenize)
    
    X_train = X_train.apply(stop_words_filtering)
    X_test = X_test.apply(stop_words_filtering)
    
    X_train = X_train.apply(lemmatisation)
    X_test = X_test.apply(lemmatisation)
    
    X_train = X_train.apply(str)
    X_test = X_test.apply(str)

    #application du CountVectorizer sur nos jeux d'entrainements et de tests
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model_clf_CV_twitter = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42).fit(X_train, y_train)
    joblib.dump((vectorizer,model_clf_CV_twitter), f"{os.getcwd()}/model/{model_name}")
    joblib.dump((X_test, y_test), f"{os.getcwd()}/model/test_data_twitter_CV.pkl")
    

In [17]:
y_pred = model_clf_CV_twitter.predict(X_test)

In [18]:
print( classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     negatif       0.85      0.72      0.78      7021
      neutre       0.70      0.95      0.81      7200
     positif       0.87      0.69      0.77      7085

    accuracy                           0.79     21306
   macro avg       0.81      0.79      0.79     21306
weighted avg       0.81      0.79      0.79     21306



Cette fonction prends en entrée un pandas Serie de texte et le modèle et renvoie en sortie une liste avec nos prédictions

In [19]:
def extract_sentiment_cv(reviews,model_CV):
    tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]{4,}")
    result = []
    reviews = reviews.str.lower().apply(tokenizer.tokenize)
    reviews = reviews.apply(stop_words_filtering)
    reviews = reviews.apply(lemmatisation)
    reviews = reviews.apply(str)
    for text in reviews:
        new_text_vectorized = vectorizer.transform([text])

        prediction = model_CV.predict(new_text_vectorized)
        result.append(prediction[0])
    return result


# Algo BAG avec TFIDF

La même chose avec tfidf

In [20]:
model_name = "modeltfidf_twitter.pkl"
if os.path.exists(f"{os.getcwd()}/model/{model_name}"):

    vec_tfidf, model_CLF_tfidf_twitter = joblib.load(f"{os.getcwd()}/model/{model_name}")
    X_test_tfidf, y_test_tfidf = joblib.load(f"{os.getcwd()}/model/test_data_twitter_tfidf.pkl")

else:
    X = data_twitter["clean_text"]
    y = data_twitter["category"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 30)

    X_train = X_train.str.lower().apply(tokenizer.tokenize)
    X_test = X_test.str.lower().apply(tokenizer.tokenize)
    
    X_train = X_train.apply(stop_words_filtering)
    X_test = X_test.apply(stop_words_filtering)
    
    X_train = X_train.apply(lemmatisation)
    X_test = X_test.apply(lemmatisation)
    
    X_train = X_train.apply(str)
    X_test = X_test.apply(str)

    #application du TfidfVectorizer sur nos jeux d'entrainements et de tests
    vec_tfidf = TfidfVectorizer()
    X_train_tfidf = vec_tfidf.fit_transform(X_train)
    X_test_tfidf = vec_tfidf.transform(X_test)

    model_CLF_tfidf_twitter = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42).fit(X_train_tfidf, y_train)
    joblib.dump((vec_tfidf, model_CLF_tfidf_twitter), f"{os.getcwd()}/model/{model_name}")
    joblib.dump((X_test_tfidf, y_test), f"{os.getcwd()}/model/test_data_twitter_tfidf.pkl")
    

In [21]:
y_pred_tfidf = model_CLF_tfidf_twitter.predict(X_test_tfidf)

In [22]:
print(classification_report(y_test, y_pred_tfidf))

              precision    recall  f1-score   support

     negatif       0.83      0.71      0.76      7021
      neutre       0.70      0.95      0.80      7200
     positif       0.87      0.67      0.75      7085

    accuracy                           0.78     21306
   macro avg       0.80      0.78      0.77     21306
weighted avg       0.80      0.78      0.77     21306



In [23]:
def extract_sentiment_tfidf(reviews,model_tfidf):
    tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]{4,}")
    result = []
    reviews = reviews.str.lower().apply(tokenizer.tokenize)
    reviews = reviews.apply(stop_words_filtering)
    reviews = reviews.apply(lemmatisation)
    reviews = reviews.apply(str)
    for text in reviews:

        new_text_vectorized = vec_tfidf.transform([text])

        prediction = model_tfidf.predict(new_text_vectorized)
        result.append(prediction[0])
    return result

# Sentiments Analysis avec Trustpilot

On va se servir de nos propres jeux de données truspilot dans ce cas

In [24]:
path_trustpilot_data = f"{os.getcwd()}/train_data/firms_with_reviews.csv"

In [25]:
df = pd.read_csv(path_trustpilot_data, index_col=0 ,encoding='ISO-8859-1')

Ici nous avons entrainé notre modèle avec nos propres données que l'on a scrappé sur truspilot

Un petit aperçu de nos données

In [26]:
df.head()

Unnamed: 0_level_0,review_title,author_id,experience_date,extract_date,firm_id,review_date,review_note,review_text,review_url,author_localisation,author_name,author_url,firm_name,reponse,note,firm_info
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
66ccad2d63a995b7aa02bc03,Alvaro made my experience veryâ¦,66ccad2d63a995b7aa02bc01,2023-05-22T00:00:00,2024-08-23T00:00:00,turbodebt.com,2023-05-23T00:00:00,5.0,Alvaro made my experience very satisfactory! I...,/reviews/646cba0dc423446286686604,US,Keontra Reid,/users/646cba0ba8905b00124cdbfb,TurboDebt,True,5.0,"{'_id': '66db33a2bd030051b2456e59', 'page_url'..."
66ccad2d63a995b7aa02bc08,Great company to work with they reallyâ¦,66ccad2d63a995b7aa02bc06,2023-05-22T00:00:00,2024-08-23T00:00:00,turbodebt.com,2023-05-23T00:00:00,5.0,Great company to work with they really underst...,/reviews/646cad6cc423446286685c19,US,Ismael Luciano,/users/646cad6b05330f0014134602,TurboDebt,True,5.0,"{'_id': '66db33a2bd030051b2456e59', 'page_url'..."
66ccad2d63a995b7aa02bc0d,HELPFUL..,66ccad2d63a995b7aa02bc0b,2023-05-22T00:00:00,2024-08-23T00:00:00,turbodebt.com,2023-05-23T00:00:00,5.0,HELPFUL... HONEST...TRUTHUL!!!!!!!!!!!!!!!!!!!,/reviews/646c8127706f837cb1eff2f6,US,Tony RODRIGUEZ,/users/646c8126a8905b00124cad7c,TurboDebt,True,5.0,"{'_id': '66db33a2bd030051b2456e59', 'page_url'..."
66ccad2d63a995b7aa02bc12,Making it very easy to understand andâ¦,66ccad2d63a995b7aa02bc10,2023-05-22T00:00:00,2024-08-23T00:00:00,turbodebt.com,2023-05-23T00:00:00,5.0,Making it very easy to understand and answerin...,/reviews/646c6c27706f837cb1efe4ad,US,Melinda Hall,/users/646c6c254be4ac0013350e3c,TurboDebt,True,5.0,"{'_id': '66db33a2bd030051b2456e59', 'page_url'..."
66ccad2d63a995b7aa02bc17,Leif was awesome,66ccad2d63a995b7aa02bc15,2023-05-22T00:00:00,2024-08-23T00:00:00,turbodebt.com,2023-05-23T00:00:00,5.0,"Leif was awesome. Hes so friendly, easy to tal...",/reviews/646c4e46706f837cb1efd615,US,SDS,/users/646c4e454be4ac001334fb5f,TurboDebt,True,5.0,"{'_id': '66db33a2bd030051b2456e59', 'page_url'..."


In [27]:
print(f"Nous avons en tout {len(df)} lignes")

Nous avons en tout 214647 lignes


In [28]:
df = df.dropna()
print(f"Après avoir supprimé les NaN nous avons {len(df)} lignes")

Après avoir supprimé les NaN nous avons 188096 lignes


In [29]:
df["sentiments"] = df["note"]

In [30]:
df["sentiments"] = df["sentiments"].replace(to_replace = [2.0,1.0], value = "negatif")
df["sentiments"] = df["sentiments"].replace(to_replace = [4.0,5.0], value = "positif")
df["sentiments"] = df["sentiments"].replace(to_replace = [3.0], value = "neutre")

In [31]:
df["sentiments"].value_counts()

sentiments
positif    176457
negatif      7323
neutre       4316
Name: count, dtype: int64

In [32]:
df_positif = df[df['sentiments'] == 'positif'].sample(n=8000, random_state=42)
df_negatif = df[df['sentiments'] == 'negatif']
df_neutre = df[df['sentiments'] == 'neutre']

In [33]:
df = pd.concat([df_positif, df_negatif, df_neutre])

Notre jeux de données est déjà plus équilibré

In [34]:
df["sentiments"].value_counts()

sentiments
positif    8000
negatif    7323
neutre     4316
Name: count, dtype: int64

In [35]:
model_name = "modelCV_trustpilot.pkl"
if os.path.exists(f"{os.getcwd()}/model/{model_name}"):

    vectorizer, model_clf_CV_trustpilot = joblib.load(f"{os.getcwd()}/model/{model_name}")
    X_test, y_test = joblib.load(f"{os.getcwd()}/model/test_data_trustpilot_CV.pkl")

else:
    X = df["review_text"]
    y = df["sentiments"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 30)

    X_train = X_train.str.lower().apply(tokenizer.tokenize)
    X_test = X_test.str.lower().apply(tokenizer.tokenize)
    
    X_train = X_train.apply(stop_words_filtering)
    X_test = X_test.apply(stop_words_filtering)
        
    X_train = X_train.apply(lemmatisation)
    X_test = X_test.apply(lemmatisation)
        
    X_train = X_train.apply(str)
    X_test = X_test.apply(str)
    
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model_clf_CV_trustpilot = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42).fit(X_train, y_train)
    joblib.dump((vectorizer,model_clf_CV_trustpilot), f"{os.getcwd()}/model/{model_name}")
    joblib.dump((X_test, y_test), f"{os.getcwd()}/model/test_data_trustpilot_CV.pkl")
    


In [36]:
y_pred = model_clf_CV_trustpilot.predict(X_test)

In [37]:
print( classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     negatif       0.73      0.76      0.74      1435
      neutre       0.55      0.36      0.44       888
     positif       0.79      0.91      0.84      1605

    accuracy                           0.73      3928
   macro avg       0.69      0.68      0.68      3928
weighted avg       0.71      0.73      0.72      3928



In [38]:
classification_report(y_test, y_pred,output_dict=True)["accuracy"]

0.7309063136456212

In [39]:
report_metric_train = classification_report(y_test, y_pred,output_dict=True)["weighted avg"]
report_metric_train.pop("support")

3928.0

In [40]:
report_metric_train

{'precision': 0.7135863961667934,
 'recall': 0.7309063136456212,
 'f1-score': 0.7159440454624767}

In [41]:

len(df)

19639

In [42]:

current_date  = '2024-10-01'

In [43]:
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
l = [current_date, accuracy, precision, recall, f1]
columns = ['date', 'accuracy', 'precision', 'recall', 'f1_score']

In [44]:
metrics_df = pd.DataFrame([l], columns=columns)
metrics_df.to_csv('metrics.csv', index = False)

# Test du modèle avec nos jeux de données sur mongoDB

Une fois qu'on a notre modèle on l'utiliser sur nos données scrappés qui se trouvent dans la base mongodb

In [45]:
client = MongoClient(
    host = "my_mongo_server",
    port = 27017,
    username = "datascientest",
    password = "dst123")

In [46]:
pprint(client["Projet"].list_collection_names())

['Reviews', 'Firms']


In [47]:
pprint(client["Projet"]['Reviews'].find_one())

{'_id': ObjectId('66fea989b80f255a7a991a18'),
 'author_localisation': 'US',
 'author_name': 'Keontra Reid',
 'author_url': '/users/646cba0ba8905b00124cdbfb',
 'experience_date': datetime.datetime(2023, 5, 22, 0, 0),
 'extract_date': datetime.datetime(2024, 8, 23, 0, 0),
 'firm_id': 'turbodebt.com',
 'firm_name': 'TurboDebt',
 'note': 5.0,
 'reponse': 'True',
 'review_date': datetime.datetime(2023, 5, 23, 0, 0),
 'review_text': 'Alvaro made my experience very satisfactory! I felt like I '
                'could breathe again after speaking with him.',
 'review_title': 'Alvaro made my experience very…',
 'review_url': '/reviews/646cba0dc423446286686604'}


In [48]:
print(client.list_database_names())

['Projet', 'admin', 'config', 'local']


Liste des colonnes de notre jeux de données

In [49]:
client["Projet"]["Reviews"].find_one().keys()

dict_keys(['_id', 'review_url', 'author_localisation', 'author_name', 'author_url', 'experience_date', 'extract_date', 'firm_id', 'firm_name', 'note', 'reponse', 'review_date', 'review_text', 'review_title'])

In [50]:
def extract_sentiment_cv(reviews,model_CV):
    result = []
    reviews = reviews.str.lower().apply(tokenizer.tokenize)
    reviews = reviews.apply(stop_words_filtering)
    reviews = reviews.apply(lemmatisation)
    reviews = reviews.apply(str)
    for text in reviews:
        new_text_vectorized = vectorizer.transform([text])

        prediction =  model_clf_CV_trustpilot.predict(new_text_vectorized)
        result.append(prediction[0])
    return result

In [51]:
test = pd.read_csv("data_final.csv")

In [52]:
test[:5000].to_csv('data_final.csv', index = False)

In [55]:
data_existant = pd.read_csv('data_final.csv') 

In [56]:
existant_ids = set(data_existant['review_url'])

In [57]:
new_data_metric = client["Projet"]["Reviews"].find({"review_url": {"$nin": list(existant_ids)}},{"_id": 0, 'Unnamed: 0': 0})

In [58]:
df_new_data_metric = pd.DataFrame(list(new_data_metric))

In [59]:
df_new_data_metric["sentiments"] = df_new_data_metric["note"]
df_new_data_metric["sentiments"] = df_new_data_metric["sentiments"].replace(to_replace = [2.0,1.0], value = "negatif")
df_new_data_metric["sentiments"] = df_new_data_metric["sentiments"].replace(to_replace = [4.0,5.0], value = "positif")
df_new_data_metric["sentiments"] = df_new_data_metric["sentiments"].replace(to_replace = [3.0], value = "neutre")

In [60]:
y_test = df_new_data_metric["sentiments"]

In [61]:
y_pred = extract_sentiment_cv(df_new_data_metric["review_text"], model_clf_CV_trustpilot)

In [62]:
from datetime import datetime
current_date = datetime.now().strftime('%Y-%m-%d')

In [63]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [64]:
new_data_metrics = {
    'date': current_date,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}

In [65]:
new_data_metrics = pd.DataFrame([new_data_metrics]) 

In [66]:
metrics_df = pd.concat([metrics_df, new_data_metrics], ignore_index=True)

In [53]:
metrics_df

Unnamed: 0,date,accuracy,precision,recall,f1_score
0,2024-10-01,0.730906,0.713586,0.730906,0.715944
1,2024-10-03,0.899154,0.946096,0.899154,0.918364


In [53]:
if os.path.isfile(f"{os.getcwd()}/data_final.csv"):
    data_existing = pd.read_csv('data_final.csv')   
    existing_ids = set(data_existing['review_url'])
    new_data = client["Projet"]["Reviews"].find({"review_url": {"$nin": list(existing_ids)}},{"_id": 0, 'Unnamed: 0': 0})
    df_new_data = pd.DataFrame(list(new_data))
    df_new_data_metric = df_new_data.copy()
    if len(df_new_data) == 0:
        print("Aucune nouvelles données")
    else:
        print("le fichier existe et il y a de nouvelles données")
        df_new_data["sentiments"] = extract_sentiment_cv(df_new_data["review_text"], model_clf_CV_trustpilot)
        y_pred = df_new_data["sentiments"].copy()
        df_new_data_metric["sentiments"] = df_new_data_metric["note"]
        df_new_data_metric["sentiments"] = df_new_data_metric["sentiments"].replace(to_replace = [2.0,1.0], value = "negatif")
        df_new_data_metric["sentiments"] = df_new_data_metric["sentiments"].replace(to_replace = [4.0,5.0], value = "positif")
        df_new_data_metric["sentiments"] = df_new_data_metric["sentiments"].replace(to_replace = [3.0], value = "neutre")
        y_test = df_new_data_metric["sentiments"]
        #y_pred = extract_sentiment_cv(df_new_data_metric["review_text"], model_clf_CV_trustpilot)
        current_date = datetime.now().strftime('%Y-%m-%d')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        current_date = datetime.now().strftime('%Y-%m-%d')
        new_data_metrics = {
            'date': current_date,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
        new_data_metrics = pd.DataFrame([new_data_metrics]) 
        metrics_df = pd.concat([metrics_df, new_data_metrics], ignore_index=True)
        df_new_data['review_text'] = df_new_data['review_text'].replace('\r', ' ', regex=True)
        df_new_data["reponse"] = df_new_data["reponse"].astype(bool)
        df_new_data["reponse"] = df_new_data["reponse"].replace({True: "true", False: "false"})
        data = pd.concat([data_existing, df_new_data], ignore_index=True)
        data["reponse"] = data["reponse"].replace({True: "true", False: "false"})
        data['experience_date'] = pd.to_datetime(data['experience_date']).dt.strftime('%Y-%m-%d')
        data['extract_date'] = pd.to_datetime(data['extract_date']).dt.strftime('%Y-%m-%d')
        data['review_date'] = pd.to_datetime(data['review_date']).dt.strftime('%Y-%m-%d')
        data.to_csv('data_final.csv', index = False)
        metrics_df.to_csv('metrics.csv', index = False)
else:
    print("data_final.csv n'existe pas ")
    full_data = []
    for element in client["Projet"]["Reviews"].find({},{ "_id":0, 'Unnamed: 0':0 }):
        full_data.append(element)
    data = pd.DataFrame(full_data)
    data = data.dropna(subset=['review_text'])
    data["sentiments"] = extract_sentiment_cv(data["review_text"], model_clf_CV_trustpilot)
    data['review_text'] = data['review_text'].replace('\r', ' ', regex=True)
    data["reponse"] = data["reponse"].astype(bool)
    #data["reponse"] = data["reponse"].replace(to_replace = True, value = "true")
    #data["reponse"] = data["reponse"].replace(to_replace = False, value = "false")
    data["reponse"] = data["reponse"].replace({True: "true", False: "false"})
    data['experience_date'] = pd.to_datetime(data['experience_date']).dt.strftime('%Y-%m-%d')
    data['extract_date'] = pd.to_datetime(data['extract_date']).dt.strftime('%Y-%m-%d')
    data['review_date'] = pd.to_datetime(data['review_date']).dt.strftime('%Y-%m-%d')

    data.to_csv('data_final.csv', index = False)
    metrics_df.to_csv('metrics.csv', index = False)

le fichier existe et il y a de nouvelles données


if os.path.isfile(f"{os.getcwd()}/data_final.csv"):
    data_existing = pd.read_csv('data_final.csv')   
    existing_ids = set(data_existing['review_url'])
    new_data = client["Projet"]["Reviews"].find({"review_url": {"$nin": list(existing_ids)}},{"_id": 0, 'Unnamed: 0': 0})
    df_new_data = pd.DataFrame(list(new_data))
    if len(df_new_data) == 0:
        print("Aucune nouvelles données")
    else:
        print("le fichier existe et il y a de nouvelles données")
        df_new_data["sentiments"] = extract_sentiment_cv(df_new_data["review_text"], model_clf_CV_trustpilot)
        df_new_data['review_text'] = df_new_data['review_text'].replace('\r', ' ', regex=True)
        df_new_data["reponse"] = df_new_data["reponse"].astype(bool)
        df_new_data["reponse"] = df_new_data["reponse"].replace({True: "true", False: "false"})
        data = pd.concat([data_existing, df_new_data], ignore_index=True)
        data["reponse"] = data["reponse"].replace({True: "true", False: "false"})
        data['experience_date'] = pd.to_datetime(data['experience_date']).dt.strftime('%Y-%m-%d')
        data['extract_date'] = pd.to_datetime(data['extract_date']).dt.strftime('%Y-%m-%d')
        data['review_date'] = pd.to_datetime(data['review_date']).dt.strftime('%Y-%m-%d')
        data.to_csv('data_final.csv', index = False)
else:
    print("data_final.csv n'existe pas ")
    full_data = []
    for element in client["Projet"]["Reviews"].find({},{ "_id":0, 'Unnamed: 0':0 }):
        full_data.append(element)
    data = pd.DataFrame(full_data)
    data = data.dropna(subset=['review_text'])
    data["sentiments"] = extract_sentiment_cv(data["review_text"], model_clf_CV_trustpilot)
    data['review_text'] = data['review_text'].replace('\r', ' ', regex=True)
    data["reponse"] = data["reponse"].astype(bool)
    #data["reponse"] = data["reponse"].replace(to_replace = True, value = "true")
    #data["reponse"] = data["reponse"].replace(to_replace = False, value = "false")
    data["reponse"] = data["reponse"].replace({True: "true", False: "false"})
    data['experience_date'] = pd.to_datetime(data['experience_date']).dt.strftime('%Y-%m-%d')
    data['extract_date'] = pd.to_datetime(data['extract_date']).dt.strftime('%Y-%m-%d')
    data['review_date'] = pd.to_datetime(data['review_date']).dt.strftime('%Y-%m-%d')

    data.to_csv('data_final.csv', index = False)
        

In [80]:
data.head()

Unnamed: 0,review_url,author_localisation,author_name,author_url,experience_date,extract_date,firm_id,firm_name,note,reponse,review_date,review_text,review_title,sentiments
0,/reviews/646cba0dc423446286686604,US,Keontra Reid,/users/646cba0ba8905b00124cdbfb,2023-05-22,2024-08-23,turbodebt.com,TurboDebt,5.0,True,2023-05-23,Alvaro made my experience very satisfactory! I...,Alvaro made my experience very…,positif
1,/reviews/646cad6cc423446286685c19,US,Ismael Luciano,/users/646cad6b05330f0014134602,2023-05-22,2024-08-23,turbodebt.com,TurboDebt,5.0,True,2023-05-23,Great company to work with they really underst...,Great company to work with they really…,positif
2,/reviews/646c8127706f837cb1eff2f6,US,Tony RODRIGUEZ,/users/646c8126a8905b00124cad7c,2023-05-22,2024-08-23,turbodebt.com,TurboDebt,5.0,True,2023-05-23,HELPFUL... HONEST...TRUTHUL!!!!!!!!!!!!!!!!!!!,HELPFUL..,positif
3,/reviews/646c6c27706f837cb1efe4ad,US,Melinda Hall,/users/646c6c254be4ac0013350e3c,2023-05-22,2024-08-23,turbodebt.com,TurboDebt,5.0,True,2023-05-23,Making it very easy to understand and answerin...,Making it very easy to understand and…,positif
4,/reviews/646c4e46706f837cb1efd615,US,SDS,/users/646c4e454be4ac001334fb5f,2023-05-22,2024-08-23,turbodebt.com,TurboDebt,5.0,True,2023-05-23,"Leif was awesome. Hes so friendly, easy to tal...",Leif was awesome,positif


In [54]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import bulk, BulkIndexError 
import csv
import os 

es = Elasticsearch(hosts = "http://es-container:9200", timeout=3000000)

# Définir le nom de l'index
index_final = "test_final"
index_metrics = "test_metrics"
data_file = "data_final.csv"
metrics_file = "metrics.csv"
# Définir le mapping pour l'index
mapping_data = {
    "mappings": {
        "properties": {
            "author_localisation": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "author_name": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "author_url": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "experience_date": {
                "type": "date",
                "format": "yyyy-MM-dd"
            },
            "extract_date": {
                "type": "date",
                "format": "yyyy-MM-dd"
            },
            "firm_id": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "firm_name": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "note": {
                "type": "float"
            },
            "reponse": {
                "type": "boolean"
            },
            "review_date": {
                "type": "date"
            },
            "review_text": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "review_title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "review_url": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "sentiments": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            }
        }
    }
}

mapping_metrics = {
    "mappings": {
        "properties": {
            "experience_date": {
                "type": "date",
                "format": "yyyy-MM-dd"
            },
            "accuracy": {
                "type": "float"
            },
            "precision": {
                "type": "float"
            },
            "recall": {
                "type": "float"
            },
            "f1_score": {
                "type": "float"
            }
        }
    }
}
mapping = [mapping_data, mapping_metrics]
index_name = [index_final, index_metrics]
file_name = [data_file, metrics_file]

for mapp, index, file in zip(mapping,index_name, file_name):
    # Créer l'index
    if not es.indices.exists(index=index):
        es.indices.create(index=index, body=mapp)
        print(f"L'index '{index}' a été créé avec succès.")
    else:
        print(f"L'index '{index}' existe déjà.")
    
    # Supprimer tous les documents existants dans l'index
    es.delete_by_query(index=index, body={"query": {"match_all": {}}})
    
    # Lire le fichier CSV et indexer les nouveaux documents
    with open(f"{os.path.dirname(os.getcwd())}/ML/{file}") as f:
        reader = csv.DictReader(f)
        
        try:
            helpers.bulk(es, reader, index=index)
            print("Documents ajoutés avec succès.")
        except BulkIndexError as e:
            print(f"Erreur lors de l'indexation : {e.errors}")






  es = Elasticsearch(hosts = "http://es-container:9200", timeout=3000000)


L'index 'test_final' existe déjà.
Documents ajoutés avec succès.
L'index 'test_metrics' existe déjà.
Documents ajoutés avec succès.
