In [1]:
import pandas as pd
import numpy as np
import warnings 
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from gensim.corpora import Dictionary
from datetime import datetime

In [2]:
import pyLDAvis
import pyLDAvis.gensim_models

In [3]:
warnings.filterwarnings('ignore')

In [4]:
#data =  pd.read_csv('data_processed.csv',  engine = 'python',  on_bad_lines = 'skip', index_col=[0])

In [5]:
data =  pd.read_csv('data_processed.csv',  engine = 'python',  on_bad_lines = 'skip')

In [6]:
data.columns

Index(['Id', 'Title', 'Title_tokens', 'Body', 'Body_tokens', 'Tags',
       'Tag_token', 'Score', 'AnswerCount'],
      dtype='object')

In [7]:
data.shape

(50000, 9)

In [8]:
X = data[['Title_tokens', 'Body_tokens','Tag_token','Score', 'AnswerCount']]
y = data[['Tag_token']]

# re tokenisations des variables tockens suite à la lecture csv qui lit champe en string,  mise à jour aussi dans dataframe data. 
X['Title_tokens'] = X['Title_tokens'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Title_tokens'] = X['Title_tokens'].apply(nltk.word_tokenize)
X['Body_tokens'] = X['Body_tokens'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Body_tokens'] = X['Body_tokens'].apply(nltk.word_tokenize)
X['Tag_token'] = X['Tag_token'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Tag_token'] = X['Tag_token'].apply(nltk.word_tokenize)

# Split the Title_tokens into train and test sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

X_train_title = X_train['Title_tokens']
X_test_title = X_test['Title_tokens']
X_train_body = X_train['Body_tokens']
X_test_body = X_test['Body_tokens']
X_train_tag = X_train['Tag_token']
X_test_tag = X_test['Tag_token']

In [9]:
X_train_title.head()

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, navigation, bar, uis...
16398       [authentication, ticket, decryption, possible]
13653                                    [validation, yii]
Name: Title_tokens, dtype: object

In [10]:
X_test_tag.head()

33553         [javascript, jquery, html, css, tablesorter]
9427               [php, mysql, file, codeigniter, upload]
199      [python, python, x, csv, dictionary, multidime...
12447               [php, net, frameworks, ldap, openldap]
39489    [vue, js, vuejs, datepicker, vue, component, v...
Name: Tag_token, dtype: object

# 3. Modèles non supervisés

Nous utilisons LDA comme le modèle non supervisé.  LDA entrainé est utilisée pour découvrir des sujets cachés dans les nouveaux documents. 
Les mots les plus probables dans ces sujets découverts sont généralement considérés comme les mots-clés les plus pertinents pour le document.

In [11]:
performance_list = []
pyLDAvis.enable_notebook()

# 3.1 Entrainement et nombre de topics

In [12]:
def lda_train(num_topics, X_train):
    # Create a dictionary and bag-of-words representation of the training data
    dictionary = Dictionary(X_train)
    bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary)

    topics = lda_model.get_document_topics(bow_corpus)
    
    #  Visulisation des topics 
#    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='mmds', R=30)
    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='pcoa', sort_topics=True)
    
    # Calculer la perplexité
    perplexity = lda_model.log_perplexity(bow_corpus)

    # Calculer la cohérence des topics
    coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=bow_corpus, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    return perplexity, coherence_score, lda_model, vis

### 3.1.1 Modèle 1: LDA entrainement avec titre

In [13]:
num_topics = 10
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [14]:
num_topics = 15
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [15]:
num_topics = 20
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [16]:
num_topics = 30
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [17]:
num_topics = 50
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

### 3.1.2 Modèle 2: LDA entrainement avec titre + body

In [18]:
X_train_merged = X_train_title + X_train_body

In [19]:
X_train_merged.head()

39087    [stored, procedure, calling, stored, procedure...
30893    [run, fly, tomcat, netbeans, maven, web, proje...
45278    [show, gray, color, view, navigation, bar, uis...
16398    [authentication, ticket, decryption, possible,...
13653    [validation, yii, validation, yii, advanced, p...
dtype: object

In [20]:
X_train_title.head()

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, navigation, bar, uis...
16398       [authentication, ticket, decryption, possible]
13653                                    [validation, yii]
Name: Title_tokens, dtype: object

In [21]:
X_train_body.head()

39087    [calling, stored, procedure, execute, procedur...
30893    [maven, web, project, netbeans, eclipse, refer...
45278    [written, test, ipad, contains, split, view, u...
16398    [php, developer, almost, nothing, net, asked, ...
13653    [validation, yii, advanced, parent_id, creatin...
Name: Body_tokens, dtype: object

In [22]:
num_topics = 10
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
vis_merged

In [23]:
num_topics = 20
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

In [24]:
num_topics = 30
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

In [25]:
num_topics = 50
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

Nous sélectionnons le model avec 20 topics avec titre

### 3.1.3  Modèle 3: LDA entrainement avec titre & tag et body

In [26]:
def lda_train_tag(num_topics, dictionary, X_train):
    # Create a dictionary and bag-of-words representation of the training data
    # dictionary = Dictionary(X_train)
    bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary)

    topics = lda_model.get_document_topics(bow_corpus)
    
    #  Visulisation des topics 
#    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='mmds', R=30)
    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='pcoa', sort_topics=True)
    
    # Calculer la perplexité
    perplexity = lda_model.log_perplexity(bow_corpus)

    # Calculer la cohérence des topics
    coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=bow_corpus, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    return perplexity, coherence_score, lda_model, vis

In [27]:
# Create a set of tokens from X_train_title and X_train_tag
title_tokens_set = set([token for tokens in X_train_title for token in tokens])
tag_tokens_set = set([token for tokens in X_train_tag for token in tokens])

# Get the common tokens that appear in both sets
X_train_union = title_tokens_set.intersection(tag_tokens_set)

train_dictionary_tag = Dictionary([list(X_train_union)])

In [28]:
num_topics = 10
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [29]:
num_topics = 20
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [30]:
num_topics = 30
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [31]:
num_topics = 50
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [32]:
train_result = pd.DataFrame(performance_list, columns=['Date', 'Modele', 'nb topics', 'perplexity', 'coherence_score'])
train_result.to_csv('lda_train_result.csv')

### 3.1.4 LDA entrainement avec titre pour fit et titre+body pour transform

In Gensim, the topic modeling models such as LdaModel do not have separate fit and transform methods like in some other machine learning libraries. Instead, the training and transformation steps are combined into a single process.

### 3.1.5 Example des topics découverts 

In [33]:
num_topics = 10

In [34]:
# Create a dictionary and bag-of-words representation of the training data, modèle 1: 
train_dictionary = Dictionary(X_train_title)
train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train_title]

# Train LDA model with 20 topics
lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)

# topics distribution pour chaque docuement dans X_train_title
train_topics_distribution = lda_model.get_document_topics(train_bow_corpus)

In [35]:
# les 10 topics découverts avec les 8 top words
topics = lda_model.print_topics(num_words=8)
for topic in topics:
    words = topic[1].split('+')
    words = [word.split('*')[1].replace('"', '').strip() for word in words]
    print(words)

['issue', 'rail', 'module', 'import', 'performance', 'template', 'validation', 'side']
['column', 'show', 'row', 'property', 'another', 'null', 'panda', 'dataframe']
['loop', 'type', 'select', 'specific', 'return', 'parameter', 'plugin', 'used']
['date', 'django', 'convert', 'test', 'output', 'trying', 'insert', 'project']
['variable', 'getting', 'memory', 'read', 'http', 'line', 'found', 'find']
['different', 'spring', 'query', 'studio', 'visual', 'core', 'single', 'two']
['view', 'call', 'framework', 'mvc', 'react', 'system', 'controller', 'default']
['service', 'inside', 'post', 'request', 'map', 'send', 'model', 'azure']
['button', 'display', 'xml', 'click', 'element', 'event', 'attribute', 'remove']
['url', 'ajax', 'best', 'load', 'selenium', 'folder', 'token', 'mobile']


In [36]:
# afficher les probailité des topics des documents
for i in range(2):
    print(f"Document {i+1} topic distribution:")
    for topic, prob in train_topics_distribution[i]:
        print(f"Topic {topic}: {prob}")
    print()

Document 1 topic distribution:
Topic 0: 0.033333923667669296
Topic 1: 0.6999945640563965
Topic 2: 0.033333923667669296
Topic 3: 0.03333396837115288
Topic 4: 0.033333923667669296
Topic 5: 0.03333395719528198
Topic 6: 0.033333923667669296
Topic 7: 0.033333923667669296
Topic 8: 0.033333923667669296
Topic 9: 0.033333923667669296

Document 2 topic distribution:
Topic 0: 0.02000763826072216
Topic 1: 0.2802756726741791
Topic 2: 0.020004427060484886
Topic 3: 0.5596908330917358
Topic 4: 0.020003993064165115
Topic 5: 0.020003359764814377
Topic 6: 0.020003365352749825
Topic 7: 0.02000381238758564
Topic 8: 0.020003458485007286
Topic 9: 0.020003441721200943



## 3.2 prédiction de topics et les mots

In [37]:
def predit_word(num_topics, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords

### 3.2.1 Prediction avec modèle 1 

In [38]:
num_topics = 20

In [39]:
test_keywords_m1 = predit_word(num_topics, X_train_title, X_test_title)

In [40]:
print(test_keywords_m1[:5])

[['authentication', 'stored', 'procedure', 'fly', 'netbeans', 'run', 'tomcat', 'bar', 'color', 'gray'], ['authentication', 'navigation', 'tomcat', 'possible', 'procedure', 'stored', 'fly', 'netbeans', 'run', 'bar'], ['authentication', 'decryption', 'procedure', 'stored', 'fly', 'netbeans', 'run', 'tomcat', 'bar', 'color'], ['show', 'certain', 'uisplitviewcontroller', 'decryption', 'procedure', 'stored', 'fly', 'netbeans', 'run', 'tomcat'], ['authentication', 'procedure', 'navigation', 'yii', 'stored', 'fly', 'netbeans', 'run', 'tomcat', 'bar']]


### 3.2.2 Prediction avec modèle 2 

In [41]:
num_topics = 30

In [42]:
test_keywords_m2 = predit_word(num_topics, X_train_title + X_train_body,  X_test_title + X_test_body)

In [43]:
print(test_keywords_m2[:5])

[['net', 'project', 'parameter', 'goal', 'return', 'stored', 'sql'], ['stored', 'eclipse', 'correct', 'project', 'parameter', 'go', 'chose', 'passing'], ['eclipse', 'directly', 'project', 'although', 'passing', 'copy', 'fine'], ['sql', 'procedure', 'maven', 'although', 'project', 'parameter', 'correct', 'calling'], ['go', 'parameter', 'possibility']]


### 3.2.2 Prediction avec modèle 3

In [44]:
def predit_word_tag(num_topics, train_dictionary, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    # train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords

In [45]:
# Create a set of tokens from X_train_title and X_train_tag
title_tokens_set = set([token for tokens in X_train_title for token in tokens])
tag_tokens_set = set([token for tokens in X_train_tag for token in tokens])

# Get the common tokens that appear in both sets
X_train_union = title_tokens_set.intersection(tag_tokens_set)

train_dictionary_tag = Dictionary([list(X_train_union)])

In [46]:
num_topics = 10

In [47]:
test_keywords_m3 = predit_word_tag(num_topics, train_dictionary_tag, X_train_title, X_test_title)

In [48]:
print(test_keywords_m3[:5])

[['aar', 'acceleration', 'abstraction', 'aac', 'absolute', 'accelerometer', 'abstract', 'abi', 'abide', 'abort'], ['aac', 'acceleration', 'aar', 'accelerometer', 'abi', 'abide', 'abstraction', 'abort', 'absolute', 'abstract'], ['aac', 'abstraction', 'absolute', 'accelerometer', 'aar', 'abort', 'acceleration', 'abi', 'abide', 'abstract'], ['acceleration', 'abi', 'abstract', 'abort', 'aac', 'abide', 'absolute', 'abstraction', 'accelerometer', 'aar'], ['abstraction', 'abort', 'absolute', 'accelerometer', 'aac', 'acceleration', 'aar', 'abi', 'abstract', 'abide']]


## 3.3 Mesure de prediction

In [49]:
def prediction_couverture(predits, reels): 

    # Convertir les colonnes des dataframes en listes de tokens
    mots_cles_reels = reels.tolist()  # convert token to list de strings
    mots_cles_predits = predits

    # Compteur de mots clés réels correctement prédits
    mots_cles_corrects = 0

    # Parcourir les mots clés réels
    for mots_reels in mots_cles_reels:  # pour chaque document
        for mot_reel in mots_reels:     # chaque mot reel du document
        # si un mot réel est parmi les mots prédits
            if any(mot_reel in mots_predits for mots_predits in mots_cles_predits):
                mots_cles_corrects += 1
                break

    # Calculer le taux de couverture des tags réels
    taux_couverture = mots_cles_corrects / len(mots_cles_reels) * 100

    # Afficher le taux de couverture des tags réels
    print("Taux de couverture des tags réels :", taux_couverture, "%")
    return taux_couverture

In [496]:
couverture_m1 = prediction_couverture(test_keywords_m1,X_test['Tag_token'])

Taux de couverture des tags réels : 4.04 %


In [497]:
couverture_m2 = prediction_couverture(test_keywords_m2,X_test['Tag_token'])

Taux de couverture des tags réels : 18.240000000000002 %


In [526]:
couverture_m3 = prediction_couverture(test_keywords_m3,X_test['Tag_token'])

Taux de couverture des tags réels : 0.2 %


## 3.1 Détermination de nombre optimal de sujets

LDA est utilisée pour découvrir des sujets cachés dans les documents. Chaque document est une distribution de sujets et chaque sujet est une distribution de mots. Si le nombre de sujets est trop élevé, le modèle peut surajuster les données d'apprentissage, Overffiting et créer des sujets non informatiques.   Si le nombre de sujets est trop faible, il y a perte d'information,  des sujets trop généraux et manque de différenciation. D'où la nécessité de cherche un nombre optimal. 

In [143]:
X_train_title = X_train['Title_tokens']
X_test_title = X_test['Title_tokens']
X_train_body = X_train['Body_tokens']
X_test_body = X_test['Body_tokens']

In [144]:
def find_topics_number(X_train):
    # Préparer le dictionnaire
    dictionary = Dictionary(X_train)

    # Convertir le corpus en une représentation vectorielle (sac de mots)
    train_bow = [dictionary.doc2bow(doc) for doc in X_train]

    # Diviser le corpus en ensembles d'apprentissage et de validation
    
    train_corpus, val_corpus = train_test_split(train_bow, test_size=0.2, random_state=42)
    print(' train bow size =', len(train_corpus), 'val size =', len(val_corpus))
    
    # Liste pour stocker les perplexités
    perplexities = []

    # Nombre de sujets à évaluer
    num_topics_list = [5, 10, 20, 30, 40, 50, 100, 200]

    # Entraîner les modèles LDA et calculer les perplexités
    for num_topics in num_topics_list:
        lda_model = LdaModel(train_corpus, num_topics=num_topics, id2word=dictionary)
#        perplexity = lda_model.log_perplexity(val_corpus)
        perplexity = lda_model.log_perplexity(train_corpus)
        perplexities.append(perplexity)
        print(f"Nombre de sujets : {num_topics}, Perplexité : {perplexity}")

    # Trouver le nombre optimal de sujets avec la perplexité la plus faible
    optimal_num_topics = num_topics_list[perplexities.index(min(perplexities))]
    return optimal_num_topics


In [145]:
num_topics_title = find_topics_number(X_train_title)

 train bow size = 32000 val size = 8000
Nombre de sujets : 5, Perplexité : -8.160909831711681
Nombre de sujets : 10, Perplexité : -8.685802974307766
Nombre de sujets : 20, Perplexité : -11.356819447490578
Nombre de sujets : 30, Perplexité : -12.996426544618757
Nombre de sujets : 40, Perplexité : -14.840559456344964
Nombre de sujets : 50, Perplexité : -17.216560859801607
Nombre de sujets : 100, Perplexité : -60.47661051331654
Nombre de sujets : 200, Perplexité : -398.8067932128906


In [146]:
def find_optimal_num_topics(X_train):
    # Préparer le dictionnaire
    dictionary = Dictionary(X_train)

    # Convertir le corpus en une représentation vectorielle (sac de mots)
    train_bow = [dictionary.doc2bow(doc) for doc in X_train]    
    
    # Nombre de sujets à évaluer
    num_topics_list = [5, 10, 20, 30, 40, 50, 100, 200]
    
    perplexities = []
    coherence_scores = []

    for num_topics in num_topics_list:
        # Entraîner le modèle LDA
        lda_model = LdaModel(train_bow, num_topics=num_topics, id2word=dictionary)

        # Calculer la perplexité
        perplexity = lda_model.log_perplexity(train_bow)
        perplexities.append(perplexity)

        # Calculer la cohérence des topics
        coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=train_bow, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        coherence_scores.append(coherence_score)

        print(f"Nombre de sujets : {num_topics}, Perplexité : {perplexity}, Cohérence : {coherence_score}")

    # Trouver le nombre optimal de sujets en utilisant une approche combinée
    combined_scores = [p + c for p, c in zip(perplexities, coherence_scores)]
    optimal_num_topics = num_topics_list[combined_scores.index(min(combined_scores))]

    return optimal_num_topics


In [147]:
# Utilisation de la fonction find_optimal_num_topics pour trouver le nombre optimal de sujets
optimal_num_topics = find_optimal_num_topics(X_train_title)
optimal_num_topics


Nombre de sujets : 5, Perplexité : -8.147663929077003, Cohérence : 0.33433522067570903
Nombre de sujets : 10, Perplexité : -8.645684040670103, Cohérence : 0.3847911297048994
Nombre de sujets : 20, Perplexité : -11.377812485437072, Cohérence : 0.4580272427163691
Nombre de sujets : 30, Perplexité : -13.040845536825808, Cohérence : 0.527198120661385
Nombre de sujets : 40, Perplexité : -14.8449386442342, Cohérence : 0.575516915328435
Nombre de sujets : 50, Perplexité : -17.286233070830484, Cohérence : 0.5953587928565621
Nombre de sujets : 100, Perplexité : -56.353362749362816, Cohérence : 0.4734337966651315
Nombre de sujets : 200, Perplexité : -384.4603310254296, Cohérence : 0.7985812298104998


200

In [825]:
print(X_train_title[:100])

39087                                  [stored, procedure]
30893                              [fly, tomcat, netbeans]
45278                 [show, gray, color, navigation, bar]
16398       [authentication, ticket, decryption, possible]
13653                                    [validation, yii]
                               ...                        
49622                            [drawing, uiview, iphone]
41140    [program, mouseover, control, divs, larger, de...
11416                                           [backbone]
35988                               [implementation, ruby]
498                        [hide, taskbar, taking, screen]
Name: Title_tokens, Length: 100, dtype: object


In [842]:
from sklearn.model_selection import KFold

def find_optimal_num_topics_cross_validation(X_train, num_folds=5):
    
    # Nombre de sujets à évaluer
    num_topics_list = [5, 10, 20, 40, 100]
    
    # Diviser le corpus en K plis
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    perplexities = []
    coherence_scores = []
    
    for num_topics in num_topics_list:
        fold_perplexities = []
        fold_coherence_scores = []
        
        for train_index, val_index in kf.split(X_train):
        # Diviser le corpus en ensembles d'apprentissage et de validation
            X_train_fold = X_train.iloc[train_index].values.tolist()
            X_val_fold = X_train.iloc[val_index].values.tolist()
            
            # Préparer le dictionnaire
            dictionary = Dictionary(X_train_fold)
            
            # Convertir le corpus en une représentation vectorielle (sac de mots)
            train_bow = [dictionary.doc2bow(doc) for doc in X_train_fold]
            val_bow = [dictionary.doc2bow(doc) for doc in X_val_fold]
            
            # Entraîner le modèle LDA
            lda_model = LdaModel(train_bow, num_topics=num_topics, id2word=dictionary)
            
            # Calculer la perplexité
            perplexity = lda_model.log_perplexity(val_bow)
            fold_perplexities.append(perplexity)
            
            # Calculer la cohérence des topics
            coherence_model = CoherenceModel(model=lda_model, texts=X_train_fold, corpus=train_bow, coherence='c_v')
            coherence_score = coherence_model.get_coherence()
            fold_coherence_scores.append(coherence_score)
        
        # Calculer les moyennes des perplexités et des scores de cohérence sur les plis
        mean_perplexity = sum(fold_perplexities) / num_folds
        mean_coherence = sum(fold_coherence_scores) / num_folds
        
        perplexities.append(mean_perplexity)
        coherence_scores.append(mean_coherence)
        
        print(f"Nombre de sujets : {num_topics}, Perplexité moyenne : {mean_perplexity}, Cohérence moyenne : {mean_coherence}")
    
    # Trouver le nombre optimal de sujets en utilisant une approche combinée
    combined_scores = [p + c for p, c in zip(perplexities, coherence_scores)]
    optimal_num_topics = num_topics_list[combined_scores.index(min(combined_scores))]
    
    return optimal_num_topics


In [843]:
optimal_num_topics = find_optimal_num_topics_cross_validation(X_train_title, 5)
print(f"Nombre optimal de sujets : {optimal_num_topics}")

Nombre de sujets : 5, Perplexité moyenne : -8.826589198606827, Cohérence moyenne : 0.5111803945771971
Nombre de sujets : 10, Perplexité moyenne : -9.402572921184712, Cohérence moyenne : 0.5300611538266922
Nombre de sujets : 20, Perplexité moyenne : -12.275141462642813, Cohérence moyenne : 0.5639125076950797
Nombre de sujets : 40, Perplexité moyenne : -16.36354027403153, Cohérence moyenne : 0.6273990263591562
Nombre de sujets : 100, Perplexité moyenne : -79.27656749624217, Cohérence moyenne : 0.5741283692792261
Nombre optimal de sujets : 100


In [None]:
# Step 5: Evaluation

# Initialize variables to keep track of evaluation metrics
true_positives = 0
false_positives = 0
false_negatives = 0

# Iterate through each document in the test set
for i in range(len(test_documents)):
    # Get the extracted keywords for the current document
    extracted_keywords = top_keywords_per_document[i]

    # Get the ground truth keywords for the current document
    ground_truth_keywords = ground_truth_keywords_per_document[i]

    # Calculate the number of true positives, false positives, and false negatives
    for keyword in extracted_keywords:
        if keyword in ground_truth_keywords:
            true_positives += 1
        else:
            false_positives += 1

    for keyword in ground_truth_keywords:
        if keyword not in extracted_keywords:
            false_negatives += 1

# Calculate evaluation metrics
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
accuracy = (true_positives + true_negatives) / total_documents

# Print the evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)
print("Accuracy:", accuracy)


## 3.2 NMF + TF-IDF