In [1]:
import pandas as pd
import numpy as np
import warnings 
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from gensim.corpora import Dictionary
from datetime import datetime

In [2]:
import pyLDAvis
import pyLDAvis.gensim_models

In [3]:
warnings.filterwarnings('ignore')

In [4]:
#data =  pd.read_csv('data_processed.csv',  engine = 'python',  on_bad_lines = 'skip', index_col=[0])

In [5]:
data =  pd.read_csv('data_processed.csv',  engine = 'python',  on_bad_lines = 'skip')

In [6]:
data.columns

Index(['Id', 'Title', 'Title_tokens', 'Body', 'Body_tokens', 'Tags',
       'Tag_token', 'Score', 'AnswerCount'],
      dtype='object')

In [7]:
data.shape

(50000, 9)

In [8]:
X = data[['Title_tokens', 'Body_tokens','Tag_token','Score', 'AnswerCount']]
y = data[['Tag_token']]

# re tokenisations des variables tockens suite à la lecture csv qui lit champe en string,  mise à jour aussi dans dataframe data. 
X['Title_tokens'] = X['Title_tokens'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Title_tokens'] = X['Title_tokens'].apply(nltk.word_tokenize)
X['Body_tokens'] = X['Body_tokens'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Body_tokens'] = X['Body_tokens'].apply(nltk.word_tokenize)
X['Tag_token'] = X['Tag_token'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Tag_token'] = X['Tag_token'].apply(nltk.word_tokenize)

# Split the Title_tokens into train and test sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

X_train_title = X_train['Title_tokens']
X_test_title = X_test['Title_tokens']
X_train_body = X_train['Body_tokens']
X_test_body = X_test['Body_tokens']
X_train_tag = X_train['Tag_token']
X_test_tag = X_test['Tag_token']

In [9]:
X_train_title.head()

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653                                    [validation, yii]
Name: Title_tokens, dtype: object

In [10]:
X_test_tag.head()

33553         [javascript, jquery, html, css, tablesorter]
9427               [php, mysql, file, codeigniter, upload]
199      [python, python, x, csv, dictionary, multidime...
12447               [php, net, frameworks, ldap, openldap]
39489    [vue, js, vuejs, datepicker, vue, component, v...
Name: Tag_token, dtype: object

# 3. Modèles non supervisés

Nous utilisons LDA comme le modèle non supervisé.  LDA entrainé est utilisée pour découvrir des sujets cachés dans les nouveaux documents. 
Les mots les plus probables dans ces sujets découverts sont généralement considérés comme les mots-clés les plus pertinents pour le document.

In [11]:
performance_list = []
pyLDAvis.enable_notebook()

# 3.1 Entrainement et nombre de topics

In [12]:
def lda_train(num_topics, X_train):
    # Create a dictionary and bag-of-words representation of the training data
    dictionary = Dictionary(X_train)
    bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary)

    topics = lda_model.get_document_topics(bow_corpus)
    
    #  Visulisation des topics 
#    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='mmds', R=30)
    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='pcoa', sort_topics=True)
    
    # Calculer la perplexité
    perplexity = lda_model.log_perplexity(bow_corpus)

    # Calculer la cohérence des topics
    coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=bow_corpus, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    return perplexity, coherence_score, lda_model, vis

### 3.1.1 Modèle 1: LDA entrainement avec titre

In [13]:
num_topics = 10
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [14]:
num_topics = 20
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [15]:
num_topics = 30
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [16]:
num_topics = 50
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

### 3.1.2 Modèle 2: LDA entrainement avec titre + body

In [17]:
X_train_merged = X_train_title + X_train_body

In [18]:
X_train_merged.head()

39087    [stored, procedure, function, calling, stored,...
30893    [run, fly, tomcat, netbeans, maven, web, proje...
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653    [validation, yii, validation, yii, advanced, p...
dtype: object

In [19]:
X_train_title.head()

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653                                    [validation, yii]
Name: Title_tokens, dtype: object

In [20]:
X_train_body.head()

39087    [function, calling, stored, procedure, execute...
30893    [maven, web, project, netbeans, eclipse, refer...
45278    [written, test, ipad, app, contains, split, vi...
16398    [php, developer, almost, nothing, net, asked, ...
13653    [validation, yii, advanced, parent_id, creatin...
Name: Body_tokens, dtype: object

In [21]:
num_topics = 10
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
vis_merged

In [22]:
num_topics = 20
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

In [23]:
num_topics = 30
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

In [24]:
num_topics = 50
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

Nous sélectionnons le model avec 20 topics avec titre

### 3.1.3  Modèle 3: LDA entrainement avec titre & tag et body

In [25]:
def lda_train_tag(num_topics, dictionary, X_train):
    # Create a dictionary and bag-of-words representation of the training data
    # dictionary = Dictionary(X_train)
    bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary)

    topics = lda_model.get_document_topics(bow_corpus)
    
    #  Visulisation des topics 
#    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='mmds', R=30)
    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='pcoa', sort_topics=True)
    
    # Calculer la perplexité
    perplexity = lda_model.log_perplexity(bow_corpus)

    # Calculer la cohérence des topics
    coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=bow_corpus, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    return perplexity, coherence_score, lda_model, vis

In [26]:
# Create a set of tokens from X_train_title and X_train_tag
title_tokens_set = set([token for tokens in X_train_title for token in tokens])
tag_tokens_set = set([token for tokens in X_train_tag for token in tokens])

# Get the common tokens that appear in both sets
X_train_union = title_tokens_set.intersection(tag_tokens_set)

train_dictionary_tag = Dictionary([list(X_train_union)])

In [27]:
num_topics = 10
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [28]:
num_topics = 20
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [29]:
num_topics = 30
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [30]:
num_topics = 50
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [31]:
train_result = pd.DataFrame(performance_list, columns=['Date', 'Modele', 'nb topics', 'perplexity', 'coherence_score'])
train_result.to_csv('lda_train_result.csv')

### 3.1.4 LDA entrainement avec titre pour fit et titre+body pour transform

In Gensim, the topic modeling models such as LdaModel do not have separate fit and transform methods like in some other machine learning libraries. Instead, the training and transformation steps are combined into a single process.

### 3.1.5 Example des topics découverts 

In [32]:
num_topics = 10

In [33]:
# Create a dictionary and bag-of-words representation of the training data, modèle 1: 
train_dictionary = Dictionary(X_train_title)
train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train_title]

# Train LDA model with 20 topics
lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)

# topics distribution pour chaque docuement dans X_train_title
train_topics_distribution = lda_model.get_document_topics(train_bow_corpus)

In [34]:
# les 10 topics découverts avec les 8 top words
topics = lda_model.print_topics(num_words=8)
for topic in topics:
    words = topic[1].split('+')
    words = [word.split('*')[1].replace('"', '').strip() for word in words]
    print(words)

['object', 'database', 'class', 'property', 'method', 'key', 'mysql', 'attribute']
['type', 'java', 'function', 'node', 'test', 'xml', 'performance', 'exception']
['view', 'change', 'text', 'io', 'best', 'way', 'android', 'swift']
['custom', 'table', 'spring', 'jquery', 'working', 'adding', 'content', 'header']
['php', 'html', 'ajax', 'line', 'form', 'script', 'command', 'jquery']
['array', 'string', 'column', 'python', 'list', 'one', 'row', 'loop']
['image', 'app', 'window', 'android', 'wpf', 'application', 'javascript', 'event']
['api', 'request', 'google', 'http', 'android', 'post', 'model', 'date']
['server', 'web', 'sql', 'button', 'service', 'different', 'application', 'window']
['variable', 'studio', 'visual', 'project', 'azure', 'set', 'used', 'many']


In [35]:
# afficher les probabilité des topics des documents
for i in range(2):
    print(f"Document {i+1} topic distribution:")
    for topic, prob in train_topics_distribution[i]:
        print(f"Topic {topic}: {prob}")
    print()

Document 1 topic distribution:
Topic 0: 0.033334117382764816
Topic 1: 0.033334117382764816
Topic 2: 0.033334117382764816
Topic 3: 0.03333425521850586
Topic 4: 0.033334117382764816
Topic 5: 0.6999927759170532
Topic 6: 0.033334117382764816
Topic 7: 0.033334117382764816
Topic 8: 0.033334117382764816
Topic 9: 0.033334143459796906

Document 2 topic distribution:
Topic 0: 0.22518278658390045
Topic 1: 0.020010432228446007
Topic 2: 0.020007874816656113
Topic 3: 0.020009726285934448
Topic 4: 0.020008238032460213
Topic 5: 0.020008906722068787
Topic 6: 0.37547746300697327
Topic 7: 0.2592694163322449
Topic 8: 0.02001725509762764
Topic 9: 0.020007874816656113



## 3.2 prédiction de topics et les mots

In [36]:
def predit_word(num_topics, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords

In [37]:
def predit_word_full(num_topics, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 

    #print("test_bow_corpus = ", test_bow_corpus)
    #print("test_topics_distributions = ", test_topics_distributions)
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords, train_dictionary, lda_model

### 3.2.1 Prediction avec modèle 1 

In [38]:
num_topics = 20

In [39]:
test_keywords_m1 = predit_word(num_topics, X_train_title, X_test_title)

In [40]:
print(test_keywords_m1[:5])

[['uisplitviewcontroller', 'php', 'form', 'procedure', 'stored', 'fly', 'netbeans', 'run', 'tomcat', 'bar'], ['show', 'form', 'navigation', 'view', 'bar'], ['view', 'uisplitviewcontroller', 'tomcat', 'navigation'], ['possible', 'color', 'fly', 'procedure', 'stored', 'netbeans', 'run', 'tomcat', 'bar', 'gray'], ['view', 'authentication', 'color', 'form']]


In [41]:
test_keywords_m1, train_dictionary, lda_model = predit_word_full(num_topics, X_train_title, X_test_title)

In [42]:
print(test_keywords_m1[:5])

[['stored', 'decryption', 'php', 'procedure', 'fly', 'netbeans', 'run', 'tomcat', 'bar', 'color'], ['navigation', 'php', 'ticket', 'uisplitviewcontroller', 'run'], ['php', 'form', 'ticket'], ['uisplitviewcontroller', 'color', 'fly', 'procedure', 'stored', 'netbeans', 'run', 'tomcat', 'bar', 'gray'], ['uisplitviewcontroller', 'bar', 'color']]


In [43]:
import pickle

with open('lda_model.pkl', 'wb') as model_file:
    pickle.dump(lda_model, model_file)
with open('train_dictionary.pkl', 'wb') as dictionary_file:
    pickle.dump(train_dictionary, dictionary_file)    

### 3.2.2 Prediction avec modèle 2 

In [44]:
num_topics = 30

In [45]:
test_keywords_m2 = predit_word(num_topics, X_train_title + X_train_body,  X_test_title + X_test_body)

In [46]:
print(test_keywords_m2[:5])

[['net', 'copy', 'directly', 'fly', 'sql', 'function', 'passing', 'chose'], ['execute', 'getting', 'calling', 'return', 'directly'], ['net', 'calling', 'eclipse', 'maven', 'clean', 'stored', 'advance'], ['maven', 'call', 'bam', 'eclipse', 'procedure', 'clean'], ['getting', 'sql', 'chose', 'go']]


### 3.2.2 Prediction avec modèle 3

In [47]:
def predit_word_tag(num_topics, train_dictionary, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    # train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords

In [48]:
# Create a set of tokens from X_train_title and X_train_tag
title_tokens_set = set([token for tokens in X_train_title for token in tokens])
tag_tokens_set = set([token for tokens in X_train_tag for token in tokens])

# Get the common tokens that appear in both sets
X_train_union = title_tokens_set.intersection(tag_tokens_set)

train_dictionary_tag = Dictionary([list(X_train_union)])

In [49]:
num_topics = 10

In [50]:
test_keywords_m3 = predit_word_tag(num_topics, train_dictionary_tag, X_train_title, X_test_title)

In [51]:
print(test_keywords_m3[:5])

[['abstract', 'aar', 'acceptance', 'abstraction', 'acceleration', 'accelerometer', 'abide', 'abi', 'abort', 'absolute'], ['abstraction', 'abstract', 'accelerometer', 'abort', 'acceleration', 'abi', 'absolute', 'abide', 'acceptance', 'aar'], ['absolute', 'accelerometer', 'abstraction', 'abstract', 'aar', 'abort', 'abi', 'acceptance', 'acceleration', 'abide'], ['abi', 'abide', 'abstraction', 'absolute', 'accelerometer', 'abort', 'aar', 'abstract', 'acceptance', 'acceleration'], ['aar', 'accelerometer', 'acceleration', 'abstraction', 'abstract', 'absolute', 'abi', 'abide', 'abort', 'acceptance']]


## 3.3 Mesure de prediction

In [52]:
def prediction_couverture(predits, reels): 

    # Convertir les colonnes des dataframes en listes de tokens
    mots_cles_reels = reels.tolist()  # convert token to list de strings
    mots_cles_predits = predits

    # Compteur de mots clés réels correctement prédits
    mots_cles_corrects = 0

    # Parcourir les mots clés réels
    for mots_reels in mots_cles_reels:  # pour chaque document
        for mot_reel in mots_reels:     # chaque mot reel du document
        # si un mot réel est parmi les mots prédits
            if any(mot_reel in mots_predits for mots_predits in mots_cles_predits):
                mots_cles_corrects += 1
                break

    # Calculer le taux de couverture des tags réels
    taux_couverture = mots_cles_corrects / len(mots_cles_reels) * 100

    # Afficher le taux de couverture des tags réels
    print("Taux de couverture des tags réels :", taux_couverture, "%")
    return taux_couverture

In [53]:
couverture_m1 = prediction_couverture(test_keywords_m1,X_test['Tag_token'])

Taux de couverture des tags réels : 10.93 %


In [54]:
couverture_m2 = prediction_couverture(test_keywords_m2,X_test['Tag_token'])

Taux de couverture des tags réels : 18.89 %


In [55]:
couverture_m3 = prediction_couverture(test_keywords_m3,X_test['Tag_token'])

Taux de couverture des tags réels : 0.2 %
