In [1]:
import pandas as pd
import numpy as np
import warnings 
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
from gensim.corpora import Dictionary
from datetime import datetime

In [2]:
import pyLDAvis
import pyLDAvis.gensim_models

In [3]:
warnings.filterwarnings('ignore')

In [4]:
#data =  pd.read_csv('data_processed.csv',  engine = 'python',  on_bad_lines = 'skip', index_col=[0])

In [5]:
data =  pd.read_csv('data_processed.csv',  engine = 'python',  on_bad_lines = 'skip')

In [6]:
data.columns

Index(['Id', 'Title', 'Title_tokens', 'Body', 'Body_tokens', 'Tags',
       'Tag_token', 'Score', 'AnswerCount'],
      dtype='object')

In [7]:
data.shape

(50000, 9)

In [8]:
X = data[['Title_tokens', 'Body_tokens','Tag_token','Score', 'AnswerCount']]
y = data[['Tag_token']]

# re tokenisations des variables tockens suite à la lecture csv qui lit champe en string,  mise à jour aussi dans dataframe data. 
X['Title_tokens'] = X['Title_tokens'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Title_tokens'] = X['Title_tokens'].apply(nltk.word_tokenize)
X['Body_tokens'] = X['Body_tokens'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Body_tokens'] = X['Body_tokens'].apply(nltk.word_tokenize)
X['Tag_token'] = X['Tag_token'].apply(lambda x: re.sub('[^a-zA-Z_]', ' ', str(x)))
X['Tag_token'] = X['Tag_token'].apply(nltk.word_tokenize)

# Split the Title_tokens into train and test sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

X_train_title = X_train['Title_tokens']
X_test_title = X_test['Title_tokens']
X_train_body = X_train['Body_tokens']
X_test_body = X_test['Body_tokens']
X_train_tag = X_train['Tag_token']
X_test_tag = X_test['Tag_token']

In [9]:
X_train_title.head()

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653                                    [validation, yii]
Name: Title_tokens, dtype: object

In [14]:
X_test_tag.head()

33553         [javascript, jquery, html, css, tablesorter]
9427               [php, mysql, file, codeigniter, upload]
199      [python, python, x, csv, dictionary, multidime...
12447               [php, net, frameworks, ldap, openldap]
39489    [vue, js, vuejs, datepicker, vue, component, v...
Name: Tag_token, dtype: object

# 3. Modèles non supervisés

Nous utilisons LDA comme le modèle non supervisé.  LDA entrainé est utilisée pour découvrir des sujets cachés dans les nouveaux documents. 
Les mots les plus probables dans ces sujets découverts sont généralement considérés comme les mots-clés les plus pertinents pour le document.

In [15]:
performance_list = []
pyLDAvis.enable_notebook()

# 3.1 Entrainement et nombre de topics

In [16]:
def lda_train(num_topics, X_train):
    # Create a dictionary and bag-of-words representation of the training data
    dictionary = Dictionary(X_train)
    bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary)

    topics = lda_model.get_document_topics(bow_corpus)
    
    #  Visulisation des topics 
#    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='mmds', R=30)
    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='pcoa', sort_topics=True)
    
    # Calculer la perplexité
    perplexity = lda_model.log_perplexity(bow_corpus)

    # Calculer la cohérence des topics
    coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=bow_corpus, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    return perplexity, coherence_score, lda_model, vis

### 3.1.1 Modèle 1: LDA entrainement avec titre

In [17]:
num_topics = 10
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [18]:
num_topics = 20
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [19]:
num_topics = 30
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

In [20]:
num_topics = 50
perplexity_titre, coherence_score_titre, lda_titre, vis_titre = lda_train(num_topics, X_train_title)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre', num_topics, perplexity_titre, coherence_score_titre]
performance_list.append(y_resultat)
vis_titre

### 3.1.2 Modèle 2: LDA entrainement avec titre + body

In [21]:
X_train_merged = X_train_title + X_train_body

In [22]:
X_train_merged.head()

39087    [stored, procedure, function, calling, stored,...
30893    [run, fly, tomcat, netbeans, maven, web, proje...
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653    [validation, yii, validation, yii, advanced, p...
dtype: object

In [23]:
X_train_title.head()

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653                                    [validation, yii]
Name: Title_tokens, dtype: object

In [24]:
X_train_body.head()

39087    [function, calling, stored, procedure, execute...
30893    [maven, web, project, netbeans, eclipse, refer...
45278    [written, test, ipad, app, contains, split, vi...
16398    [php, developer, almost, nothing, net, asked, ...
13653    [validation, yii, advanced, parent_id, creatin...
Name: Body_tokens, dtype: object

In [25]:
num_topics = 10
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
vis_merged

In [26]:
num_topics = 20
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

In [27]:
num_topics = 30
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

In [25]:
num_topics = 50
perplexity_merged, coherence_score_merged, lda_merged, vis_merged = lda_train(num_topics, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre et body', num_topics, perplexity_merged, coherence_score_merged]
performance_list.append(y_resultat)
pyLDAvis.display(vis_merged)

Nous sélectionnons le model avec 20 topics avec titre

### 3.1.3  Modèle 3: LDA entrainement avec titre & tag et body

In [26]:
def lda_train_tag(num_topics, dictionary, X_train):
    # Create a dictionary and bag-of-words representation of the training data
    # dictionary = Dictionary(X_train)
    bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary)

    topics = lda_model.get_document_topics(bow_corpus)
    
    #  Visulisation des topics 
#    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='mmds', R=30)
    vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary, mds='pcoa', sort_topics=True)
    
    # Calculer la perplexité
    perplexity = lda_model.log_perplexity(bow_corpus)

    # Calculer la cohérence des topics
    coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=bow_corpus, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    return perplexity, coherence_score, lda_model, vis

In [27]:
# Create a set of tokens from X_train_title and X_train_tag
title_tokens_set = set([token for tokens in X_train_title for token in tokens])
tag_tokens_set = set([token for tokens in X_train_tag for token in tokens])

# Get the common tokens that appear in both sets
X_train_union = title_tokens_set.intersection(tag_tokens_set)

train_dictionary_tag = Dictionary([list(X_train_union)])

In [28]:
num_topics = 10
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [29]:
num_topics = 20
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [30]:
num_topics = 30
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [31]:
num_topics = 50
perplexity_tag, coherence_score_tag, lda_tag, vis_tag = lda_train_tag(num_topics, train_dictionary_tag, X_train_merged)
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
y_resultat = [formatted_datetime, 'lda train avec titre&tag + body', num_topics, perplexity_tag, coherence_score_tag]
performance_list.append(y_resultat)
pyLDAvis.display(vis_tag)

In [32]:
train_result = pd.DataFrame(performance_list, columns=['Date', 'Modele', 'nb topics', 'perplexity', 'coherence_score'])
train_result.to_csv('lda_train_result.csv')

### 3.1.4 LDA entrainement avec titre pour fit et titre+body pour transform

In Gensim, the topic modeling models such as LdaModel do not have separate fit and transform methods like in some other machine learning libraries. Instead, the training and transformation steps are combined into a single process.

### 3.1.5 Example des topics découverts 

In [33]:
num_topics = 10

In [34]:
# Create a dictionary and bag-of-words representation of the training data, modèle 1: 
train_dictionary = Dictionary(X_train_title)
train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train_title]

# Train LDA model with 20 topics
lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)

# topics distribution pour chaque docuement dans X_train_title
train_topics_distribution = lda_model.get_document_topics(train_bow_corpus)

In [35]:
# les 10 topics découverts avec les 8 top words
topics = lda_model.print_topics(num_words=8)
for topic in topics:
    words = topic[1].split('+')
    words = [word.split('*')[1].replace('"', '').strip() for word in words]
    print(words)

['list', 'button', 'sql', 'way', 'studio', 'column', 'best', 'visual']
['asp', 'web', 'service', 'code', 'server', 'mvc', 'call', 'python']
['request', 'post', 'form', 'javascript', 'php', 'http', 'image', 'html']
['google', 'application', 'map', 'system', 'java', 'api', 'instance', 'window']
['text', 'view', 'return', 'input', 'field', 'spring', 'item', 'change']
['issue', 'property', 'java', 'xml', 'work', 'module', 'key', 'node']
['multiple', 'table', 'mysql', 'query', 'problem', 'database', 'one', 'sql']
['android', 'app', 'io', 'window', 'command', 'azure', 'line', 'exception']
['array', 'rail', 'jquery', 'time', 'date', 'string', 'model', 'python']
['object', 'function', 'class', 'json', 'array', 'parameter', 'custom', 'template']


In [36]:
# afficher les probabilité des topics des documents
for i in range(2):
    print(f"Document {i+1} topic distribution:")
    for topic, prob in train_topics_distribution[i]:
        print(f"Topic {topic}: {prob}")
    print()

Document 1 topic distribution:
Topic 0: 0.6999945640563965
Topic 1: 0.03333393484354019
Topic 2: 0.03333393484354019
Topic 3: 0.03333393484354019
Topic 4: 0.03333393484354019
Topic 5: 0.03333393484354019
Topic 6: 0.03333393484354019
Topic 7: 0.03333393484354019
Topic 8: 0.03333393484354019
Topic 9: 0.03333393856883049

Document 2 topic distribution:
Topic 0: 0.02000456117093563
Topic 1: 0.020007535815238953
Topic 2: 0.21997153759002686
Topic 3: 0.020004501566290855
Topic 4: 0.020006481558084488
Topic 5: 0.020004497841000557
Topic 6: 0.21975894272327423
Topic 7: 0.4202328324317932
Topic 8: 0.02000465989112854
Topic 9: 0.020004497841000557



## 3.2 prédiction de topics et les mots

In [29]:
def predit_word(num_topics, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords

In [39]:
def predit_word_full(num_topics, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 

    print("test_bow_corpus = ", test_bow_corpus)
    print("test_topics_distributions = ", test_topics_distributions)
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords, train_dictionary, lda_model

### 3.2.1 Prediction avec modèle 1 

In [30]:
num_topics = 20

In [31]:
test_keywords_m1 = predit_word(num_topics, X_train_title, X_test_title)

In [32]:
print(test_keywords_m1[:5])

[['tomcat', 'decryption', 'fly', 'procedure', 'stored', 'netbeans', 'run', 'bar', 'color', 'gray'], ['bar', 'gray', 'fly', 'show', 'possible', 'tomcat'], ['authentication', 'decryption', 'show', 'netbeans'], ['authentication', 'possible', 'navigation', 'decryption', 'procedure', 'stored', 'fly', 'netbeans', 'run', 'tomcat'], ['run', 'uisplitviewcontroller', 'form', 'ticket']]


In [40]:
test_keywords_m1, train_dictionary, lda_model = predit_word_full(num_topics, X_train_title, X_test_title)

test_bow_corpus =  [[(33, 1), (195, 1), (797, 1), (3332, 1), (6625, 1)], [(16, 1), (117, 2), (328, 1), (561, 1), (1694, 1), (1851, 1), (2216, 1)], [(27, 1), (38, 1), (90, 1), (481, 1), (794, 1), (1363, 1)], [(181, 1), (470, 1), (807, 1), (839, 1), (3020, 1)], [(110, 1), (282, 1), (283, 1), (337, 1), (349, 1), (3196, 1)], [(1098, 1), (3857, 1), (4449, 1)], [(43, 1), (434, 1), (1037, 1), (1993, 1), (2268, 1), (2998, 1)], [(30, 1), (201, 2), (203, 2), (360, 1), (482, 1)], [(9, 1), (10, 1), (92, 1), (624, 1), (2635, 1)], [(217, 1), (1117, 1), (1337, 1), (2795, 1), (2913, 1), (2914, 2), (4294, 1)], [(117, 1), (119, 1), (503, 1), (1076, 1), (2535, 1)], [(5, 1), (130, 1), (379, 1), (729, 1)], [(234, 1), (741, 1), (2165, 1), (6406, 1)], [(78, 1), (223, 1), (811, 1), (998, 1), (3645, 1), (4019, 1), (4532, 1)], [(92, 1), (222, 1), (5909, 1)], [(212, 2), (515, 1), (1086, 1)], [(68, 2), (79, 1), (168, 1), (256, 1), (2312, 1), (3698, 1), (5297, 1)], [(38, 1), (217, 1), (948, 1), (1079, 1), (1168, 1

In [34]:
print(test_keywords_m1[:5])

[['uisplitviewcontroller', 'form', 'io', 'procedure', 'stored', 'fly', 'netbeans', 'run', 'tomcat', 'bar'], ['netbeans', 'ticket', 'form', 'stored', 'color'], ['form', 'procedure', 'possible'], ['procedure', 'navigation', 'view', 'form', 'stored', 'fly', 'netbeans', 'run', 'tomcat', 'bar'], ['netbeans', 'navigation', 'possible', 'show']]


In [35]:
import pickle

with open('lda_model.pkl', 'wb') as model_file:
    pickle.dump(lda_model, model_file)
with open('train_dictionary.pkl', 'wb') as dictionary_file:
    pickle.dump(train_dictionary, dictionary_file)    

### 3.2.2 Prediction avec modèle 2 

In [41]:
num_topics = 30

In [42]:
test_keywords_m2 = predit_word(num_topics, X_train_title + X_train_body,  X_test_title + X_test_body)

In [43]:
print(test_keywords_m2[:5])

[['eclipse', 'procedure', 'bam', 'maven', 'correct'], ['execute', 'copy', 'chose', 'call'], ['eclipse', 'fine', 'bam', 'correct', 'clean'], ['netbeans', 'fly', 'package', 'goal', 'chose', 'calling', 'function'], ['parameter', 'eclipse', 'return', 'maven', 'goal']]


### 3.2.2 Prediction avec modèle 3

In [44]:
def predit_word_tag(num_topics, train_dictionary, X_train, X_test):
    # Prédict la topics distribution for each document in the test set X_test_title
    # train_dictionary = Dictionary(X_train)
    train_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_train]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=train_bow_corpus, num_topics=num_topics, id2word=train_dictionary)
    
    # création bow pour X_test_title avec les dictionary train  (avec test_body et option)    
    test_bow_corpus = [train_dictionary.doc2bow(doc) for doc in X_test]
    test_topics_distributions = lda_model[test_bow_corpus] 
    
    # Extract keywords from the inferred topic distributions
    test_keywords = []
    for doc_topics in test_topics_distributions:       
    # trier les topics par la probailité en order décroissant (x[1] de doc_topics)
        sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # extraire le premier mot clé des top topics 
        top_keywords = [train_dictionary[word_id] for word_id, _ in sorted_topics[:10]]  # Adjust the number of keywords as needed
        test_keywords.append(top_keywords)
    return test_keywords

In [45]:
# Create a set of tokens from X_train_title and X_train_tag
title_tokens_set = set([token for tokens in X_train_title for token in tokens])
tag_tokens_set = set([token for tokens in X_train_tag for token in tokens])

# Get the common tokens that appear in both sets
X_train_union = title_tokens_set.intersection(tag_tokens_set)

train_dictionary_tag = Dictionary([list(X_train_union)])

In [46]:
num_topics = 10

In [47]:
test_keywords_m3 = predit_word_tag(num_topics, train_dictionary_tag, X_train_title, X_test_title)

In [48]:
print(test_keywords_m3[:5])

[['acceptance', 'abort', 'accelerometer', 'abstract', 'acceleration', 'abstraction', 'absolute', 'aar', 'abi', 'abide'], ['abstraction', 'abide', 'aar', 'absolute', 'acceleration', 'abi', 'acceptance', 'abort', 'accelerometer', 'abstract'], ['abstraction', 'abi', 'abort', 'acceleration', 'accelerometer', 'absolute', 'aar', 'abide', 'acceptance', 'abstract'], ['aar', 'accelerometer', 'absolute', 'abstract', 'abort', 'abide', 'abi', 'acceleration', 'abstraction', 'acceptance'], ['abort', 'absolute', 'accelerometer', 'acceleration', 'abstraction', 'aar', 'abi', 'acceptance', 'abide', 'abstract']]


## 3.3 Mesure de prediction

In [49]:
def prediction_couverture(predits, reels): 

    # Convertir les colonnes des dataframes en listes de tokens
    mots_cles_reels = reels.tolist()  # convert token to list de strings
    mots_cles_predits = predits

    # Compteur de mots clés réels correctement prédits
    mots_cles_corrects = 0

    # Parcourir les mots clés réels
    for mots_reels in mots_cles_reels:  # pour chaque document
        for mot_reel in mots_reels:     # chaque mot reel du document
        # si un mot réel est parmi les mots prédits
            if any(mot_reel in mots_predits for mots_predits in mots_cles_predits):
                mots_cles_corrects += 1
                break

    # Calculer le taux de couverture des tags réels
    taux_couverture = mots_cles_corrects / len(mots_cles_reels) * 100

    # Afficher le taux de couverture des tags réels
    print("Taux de couverture des tags réels :", taux_couverture, "%")
    return taux_couverture

In [50]:
couverture_m1 = prediction_couverture(test_keywords_m1,X_test['Tag_token'])

Taux de couverture des tags réels : 10.93 %


In [51]:
couverture_m2 = prediction_couverture(test_keywords_m2,X_test['Tag_token'])

Taux de couverture des tags réels : 18.89 %


In [52]:
couverture_m3 = prediction_couverture(test_keywords_m3,X_test['Tag_token'])

Taux de couverture des tags réels : 0.2 %


## 3.1 Détermination de nombre optimal de sujets

LDA est utilisée pour découvrir des sujets cachés dans les documents. Chaque document est une distribution de sujets et chaque sujet est une distribution de mots. Si le nombre de sujets est trop élevé, le modèle peut surajuster les données d'apprentissage, Overffiting et créer des sujets non informatiques.   Si le nombre de sujets est trop faible, il y a perte d'information,  des sujets trop généraux et manque de différenciation. D'où la nécessité de cherche un nombre optimal. 

In [53]:
X_train_title = X_train['Title_tokens']
X_test_title = X_test['Title_tokens']
X_train_body = X_train['Body_tokens']
X_test_body = X_test['Body_tokens']

In [54]:
def find_topics_number(X_train):
    # Préparer le dictionnaire
    dictionary = Dictionary(X_train)

    # Convertir le corpus en une représentation vectorielle (sac de mots)
    train_bow = [dictionary.doc2bow(doc) for doc in X_train]

    # Diviser le corpus en ensembles d'apprentissage et de validation
    
    train_corpus, val_corpus = train_test_split(train_bow, test_size=0.2, random_state=42)
    print(' train bow size =', len(train_corpus), 'val size =', len(val_corpus))
    
    # Liste pour stocker les perplexités
    perplexities = []

    # Nombre de sujets à évaluer
    num_topics_list = [5, 10, 20, 30, 40, 50, 100, 200]

    # Entraîner les modèles LDA et calculer les perplexités
    for num_topics in num_topics_list:
        lda_model = LdaModel(train_corpus, num_topics=num_topics, id2word=dictionary)
#        perplexity = lda_model.log_perplexity(val_corpus)
        perplexity = lda_model.log_perplexity(train_corpus)
        perplexities.append(perplexity)
        print(f"Nombre de sujets : {num_topics}, Perplexité : {perplexity}")

    # Trouver le nombre optimal de sujets avec la perplexité la plus faible
    optimal_num_topics = num_topics_list[perplexities.index(min(perplexities))]
    return optimal_num_topics


In [55]:
num_topics_title = find_topics_number(X_train_title)

 train bow size = 32000 val size = 8000
Nombre de sujets : 5, Perplexité : -7.927430745197398
Nombre de sujets : 10, Perplexité : -8.376244339481712
Nombre de sujets : 20, Perplexité : -10.573125196522561
Nombre de sujets : 30, Perplexité : -11.981862701781738
Nombre de sujets : 40, Perplexité : -13.43488126545079
Nombre de sujets : 50, Perplexité : -15.09810811794373
Nombre de sujets : 100, Perplexité : -38.76952866231251
Nombre de sujets : 200, Perplexité : -372.3030344876525


In [56]:
def find_optimal_num_topics(X_train):
    # Préparer le dictionnaire
    dictionary = Dictionary(X_train)

    # Convertir le corpus en une représentation vectorielle (sac de mots)
    train_bow = [dictionary.doc2bow(doc) for doc in X_train]    
    
    # Nombre de sujets à évaluer
    num_topics_list = [5, 10, 20, 30, 40, 50, 100, 200]
    
    perplexities = []
    coherence_scores = []

    for num_topics in num_topics_list:
        # Entraîner le modèle LDA
        lda_model = LdaModel(train_bow, num_topics=num_topics, id2word=dictionary)

        # Calculer la perplexité
        perplexity = lda_model.log_perplexity(train_bow)
        perplexities.append(perplexity)

        # Calculer la cohérence des topics
        coherence_model = CoherenceModel(model=lda_model, texts=X_train, corpus=train_bow, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        coherence_scores.append(coherence_score)

        print(f"Nombre de sujets : {num_topics}, Perplexité : {perplexity}, Cohérence : {coherence_score}")

    # Trouver le nombre optimal de sujets en utilisant une approche combinée
    combined_scores = [p + c for p, c in zip(perplexities, coherence_scores)]
    optimal_num_topics = num_topics_list[combined_scores.index(min(combined_scores))]

    return optimal_num_topics


In [57]:
# Utilisation de la fonction find_optimal_num_topics pour trouver le nombre optimal de sujets
optimal_num_topics = find_optimal_num_topics(X_train_title)
optimal_num_topics


Nombre de sujets : 5, Perplexité : -7.887647310672316, Cohérence : 0.28513667662846603
Nombre de sujets : 10, Perplexité : -8.361153760059407, Cohérence : 0.2781299626421439
Nombre de sujets : 20, Perplexité : -10.616548925891566, Cohérence : 0.3706817533711183
Nombre de sujets : 30, Perplexité : -12.004418712755832, Cohérence : 0.4456469195320914
Nombre de sujets : 40, Perplexité : -13.422113144835173, Cohérence : 0.5078082937315591
Nombre de sujets : 50, Perplexité : -15.188640443257672, Cohérence : 0.5495342152091908
Nombre de sujets : 100, Perplexité : -38.36035710862713, Cohérence : 0.41249471044890185
Nombre de sujets : 200, Perplexité : -321.82890096985983, Cohérence : 0.6932315930344803


200

In [58]:
print(X_train_title[:100])

39087                                  [stored, procedure]
30893                         [run, fly, tomcat, netbeans]
45278    [show, gray, color, view, io, navigation, bar,...
16398    [form, authentication, ticket, decryption, pos...
13653                                    [validation, yii]
                               ...                        
49622             [drawing, incrementally, uiview, iphone]
41140    [way, program, mouseover, image, control, divs...
11416                      [java, getting, post, backbone]
35988                               [implementation, ruby]
498      [hide, window, taskbar, taking, screen, shot, ...
Name: Title_tokens, Length: 100, dtype: object


In [59]:
from sklearn.model_selection import KFold

def find_optimal_num_topics_cross_validation(X_train, num_folds=5):
    
    # Nombre de sujets à évaluer
    num_topics_list = [5, 10, 20, 40, 100]
    
    # Diviser le corpus en K plis
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    perplexities = []
    coherence_scores = []
    
    for num_topics in num_topics_list:
        fold_perplexities = []
        fold_coherence_scores = []
        
        for train_index, val_index in kf.split(X_train):
        # Diviser le corpus en ensembles d'apprentissage et de validation
            X_train_fold = X_train.iloc[train_index].values.tolist()
            X_val_fold = X_train.iloc[val_index].values.tolist()
            
            # Préparer le dictionnaire
            dictionary = Dictionary(X_train_fold)
            
            # Convertir le corpus en une représentation vectorielle (sac de mots)
            train_bow = [dictionary.doc2bow(doc) for doc in X_train_fold]
            val_bow = [dictionary.doc2bow(doc) for doc in X_val_fold]
            
            # Entraîner le modèle LDA
            lda_model = LdaModel(train_bow, num_topics=num_topics, id2word=dictionary)
            
            # Calculer la perplexité
            perplexity = lda_model.log_perplexity(val_bow)
            fold_perplexities.append(perplexity)
            
            # Calculer la cohérence des topics
            coherence_model = CoherenceModel(model=lda_model, texts=X_train_fold, corpus=train_bow, coherence='c_v')
            coherence_score = coherence_model.get_coherence()
            fold_coherence_scores.append(coherence_score)
        
        # Calculer les moyennes des perplexités et des scores de cohérence sur les plis
        mean_perplexity = sum(fold_perplexities) / num_folds
        mean_coherence = sum(fold_coherence_scores) / num_folds
        
        perplexities.append(mean_perplexity)
        coherence_scores.append(mean_coherence)
        
        print(f"Nombre de sujets : {num_topics}, Perplexité moyenne : {mean_perplexity}, Cohérence moyenne : {mean_coherence}")
    
    # Trouver le nombre optimal de sujets en utilisant une approche combinée
    combined_scores = [p + c for p, c in zip(perplexities, coherence_scores)]
    optimal_num_topics = num_topics_list[combined_scores.index(min(combined_scores))]
    
    return optimal_num_topics


In [60]:
optimal_num_topics = find_optimal_num_topics_cross_validation(X_train_title, 5)
print(f"Nombre optimal de sujets : {optimal_num_topics}")

Nombre de sujets : 5, Perplexité moyenne : -8.570237431842514, Cohérence moyenne : 0.24976374712519184
Nombre de sujets : 10, Perplexité moyenne : -9.183067306290457, Cohérence moyenne : 0.28131082861552675
Nombre de sujets : 20, Perplexité moyenne : -11.430595810729134, Cohérence moyenne : 0.3914530349685923
Nombre de sujets : 40, Perplexité moyenne : -14.44329729961251, Cohérence moyenne : 0.5247640666948264
Nombre de sujets : 100, Perplexité moyenne : -38.72994099597918, Cohérence moyenne : 0.41143931082603113
Nombre optimal de sujets : 100


In [61]:
# Step 5: Evaluation

# Initialize variables to keep track of evaluation metrics
true_positives = 0
false_positives = 0
false_negatives = 0

# Iterate through each document in the test set
for i in range(len(test_documents)):
    # Get the extracted keywords for the current document
    extracted_keywords = top_keywords_per_document[i]

    # Get the ground truth keywords for the current document
    ground_truth_keywords = ground_truth_keywords_per_document[i]

    # Calculate the number of true positives, false positives, and false negatives
    for keyword in extracted_keywords:
        if keyword in ground_truth_keywords:
            true_positives += 1
        else:
            false_positives += 1

    for keyword in ground_truth_keywords:
        if keyword not in extracted_keywords:
            false_negatives += 1

# Calculate evaluation metrics
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
accuracy = (true_positives + true_negatives) / total_documents

# Print the evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)
print("Accuracy:", accuracy)


NameError: name 'test_documents' is not defined

## 3.2 NMF + TF-IDF