# Classification non-supervisée de questions

## Import des librairies et des données

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
import nltk
# NLTK English stop words, extended with common interrogative words.
stop_words = nltk.corpus.stopwords.words("english")
stop_words.extend(['what', 'how', 'where', 'who', 'which'])
from string import punctuation

In [4]:
from bs4 import BeautifulSoup

In [5]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
import spacy

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the list of top tags, one tag per line.
# The context manager closes the file even if reading fails, and empty lines
# are skipped so the result does not depend on whether the file ends with a
# trailing newline.
with open("top_10_tags.txt", "r") as file:
    top_10_tags = [line for line in file.read().split('\n') if line]

In [None]:
data = pd.read_csv("data.csv")

## Échantillonnage et nettoyage des données

In [None]:
# Work on a 25% random sample of the question titles.
# The seed is fixed (same value as the random_state of the LDA models in this
# notebook) so that the sample and everything derived from it are reproducible.
text = data['Title']
text_spl = text.sample(frac=0.25, random_state=100).reset_index(drop=True)
text_spl.head()

In [None]:
# Preview: the first 11 raw titles, then the same titles after gensim's
# simple_preprocess.
print("Textes bruts :", "", sep="\n")
print(text_spl[:11])
print("---------------------------------------")
print("Textes nettoyés par Gensim :", "", sep="\n")
print(text_spl[:11].apply(simple_preprocess))

In [None]:
def lemmatization(texts, allowed_postags=("NOUN", "VERB", "ADJ", "ADV")):
    """Lemmatize texts with spaCy, keeping only selected parts of speech.

    Tokens whose exact surface form appears in the module-level
    ``top_10_tags`` are kept verbatim; any other token is kept, in lemma
    form, only if its coarse part-of-speech tag is in ``allowed_postags``.

    Args:
        texts: Iterable of strings to process.
        allowed_postags: Coarse POS tags to keep. The default is an immutable
            tuple so it cannot be mutated across calls; any container
            supporting ``in`` is accepted.

    Returns:
        A list with one space-joined string of kept tokens per input text.
    """
    # The parser and NER components are disabled: only POS tags and lemmas
    # are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    texts_out = []
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # making one nlp(text) call per document.
    for doc in nlp.pipe(texts):
        kept = []
        for token in doc:
            if token.orth_ in top_10_tags:
                kept.append(token.orth_)
            elif token.pos_ in allowed_postags:
                kept.append(token.lemma_)
        texts_out.append(" ".join(kept))
    return texts_out

In [None]:
# Preview: the first 11 raw titles, then the same titles after the spaCy-based
# lemmatization function.
print("Textes bruts :", "", sep="\n")
print(text_spl[:11])
print("---------------------------------------")
print("Textes nettoyés par spaCy :", "", sep="\n")
print(pd.Series(lemmatization(text_spl[:11])))

In [None]:
def preprocess(text):
    """Clean a raw question text.

    Steps, in order:
      1. lowercase the text;
      2. parse it as HTML, empty every ``<code>`` element and drop all tags;
      3. rewrite ``c#`` as ``csharp`` before tokenization;
      4. tokenize with NLTK;
      5. keep tokens listed in the module-level ``top_10_tags`` verbatim;
         for the others, drop stop words (module-level ``stop_words``) and
         strip punctuation characters and digits;
      6. replace tokens tagged as verbs by their WordNet lemma.

    Args:
        text: Raw text, possibly containing HTML markup.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(text, "html.parser")

    # Empty every <code> element, not only the first one, so that no code
    # snippet ends up in the extracted text.
    for code_tag in soup.find_all("code"):
        code_tag.clear()
    text_wo_tags = soup.get_text()

    # Rewrite only the "c#" sequence; any other '#' is left untouched.
    text_wo_tags = text_wo_tags.replace("c#", "csharp")

    token_list = nltk.word_tokenize(text_wo_tags)

    new_text = []
    for token in token_list:
        if token in top_10_tags:
            new_text.append(token)
        elif token not in stop_words:
            stripped = "".join(
                char for char in token
                if char not in punctuation and not char.isdigit()
            )
            # A token made only of punctuation/digits becomes empty: skip it
            # instead of emitting stray spaces in the output.
            if stripped:
                new_text.append(stripped)

    lem = nltk.stem.WordNetLemmatizer()

    # Each token is tagged on its own (no sentence context); tokens tagged
    # as verbs are replaced in place by their lemma.
    for index, token in enumerate(new_text):
        if nltk.pos_tag([token])[0][1].startswith('V'):
            new_text[index] = lem.lemmatize(token, pos='v')

    return ' '.join(new_text)

In [None]:
# Preview: the first 11 raw titles, then the same titles after the custom
# preprocess function.
print("Textes bruts :", "", sep="\n")
print(text_spl[:11])
print("---------------------------------------")
print("Textes nettoyés par la fonction créée :", "", sep="\n")
print(text_spl[:11].apply(preprocess))

In [None]:
%%time
text_clean = text_spl.parallel_apply(preprocess)

In [None]:
# Convert the sampled titles and their cleaned versions to DataFrames.
text_spl = pd.DataFrame(text_spl)
text_clean = pd.DataFrame(text_clean)

In [None]:
# Restrict `data` to the sampled titles (right join on 'Title'), then drop
# the 'Body' column.
# NOTE(review): if 'Title' values are not unique in `data`, this merge returns
# more rows than text_spl — confirm titles are unique.
data = pd.merge(data, text_spl, on="Title", how="right").drop(columns={'Body'})

In [None]:
# Append the cleaned titles as an extra column (index-aligned concat), then
# rename the columns. Assumes `data` has exactly two columns left at this
# point ('Title' and 'Tags') — TODO confirm.
data = pd.concat([data, text_clean], axis = 1)
data.columns = ['Title', 'Tags', 'Title_clean'] 

## Classification non-supervisée

### Feature extraction par Bag-of-Words (gensim)

In [None]:
# Tokenize every cleaned title.
# text_clean is a DataFrame at this point, and iterating a DataFrame yields
# its column labels rather than its rows, so the values of its single column
# are read explicitly.
words = [nltk.word_tokenize(doc) for doc in text_clean.iloc[:, 0]]

In [None]:
# Build the gensim dictionary from the tokenized documents, then encode each
# document as a bag-of-words.
id2word = corpora.Dictionary(words)
corpus = [id2word.doc2bow(tokens) for tokens in words]

### Optimisation du nombre de topics (score de cohérence)

In [None]:
def coherence_table(corpus, dictionary, list_n, texts=None):
    """Compute the c_v coherence of an LDA model for several topic counts.

    Args:
        corpus: Bag-of-words corpus used to train each model.
        dictionary: gensim dictionary; used both to train the models and to
            compute the coherence (the argument is honoured instead of
            silently falling back on the module-level ``id2word``).
        list_n: Iterable of candidate numbers of topics. Values are cast to
            ``int`` so float ranges (e.g. from ``np.linspace``) are accepted.
        texts: Tokenized documents for the coherence model. Defaults to the
            module-level ``words`` when omitted.

    Returns:
        A list of coherence scores, in the same order as ``list_n``.
    """
    if texts is None:
        texts = words

    scores = []

    for n_topics in list_n:
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                               id2word=dictionary,
                                               num_topics=int(n_topics),
                                               random_state=100,
                                               chunksize=100,
                                               passes=10)

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=texts,
                                             dictionary=dictionary,
                                             coherence='c_v')

        scores.append(coherence_model_lda.get_coherence())

    return scores

In [None]:
%%time
# Candidate topic counts 3, 6, ..., 30. An integer range is used because a
# number of topics is a count; np.linspace would produce floats.
n_topics_range = np.arange(3, 31, 3)
table = coherence_table(corpus, id2word, n_topics_range)

In [None]:
# Plot the coherence score against the number of topics.
ax = sns.lineplot(x=n_topics_range, y=table)
ax.set(xlabel="n_topics", ylabel="Cohérence")
plt.title("Score de cohérence du modèle de LDA en fonction du nombre de topics")
plt.show()

### Clustering par Latent Dirichlet Allocation

In [None]:
# Train the LDA model with the chosen number of topics.
num_topics = 15

lda = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
)

In [None]:
# Enable notebook rendering, then build the pyLDAvis view of the model
# (displayed as the cell's output).
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus, id2word, mds = 'mmds', R=30)

### Extraction des tags trouvés par LDA

In [None]:
# For each topic, list the top tags that appear among its top words.
# formatted=False returns (topic_id, [(word, weight), ...]) tuples, which
# allows an exact word comparison; testing `tag in <formatted string>` would
# also accept substrings (e.g. a tag 'java' inside the word 'javascript').
# lda.num_topics replaces a hard-coded 15 so every topic of the model is
# inspected whatever its size.
topic_tag_pairs = []

for topic_id, word_weights in lda.show_topics(num_topics=lda.num_topics, formatted=False):
    topic_words = {word for word, _ in word_weights}
    for tag in top_10_tags:
        if tag in topic_words:
            topic_tag_pairs.append([topic_id, tag])

lda_df = pd.DataFrame(topic_tag_pairs, columns=['num_cluster', 'tag_lda'])

In [None]:
# Collapse to one row per cluster, with the list of its tags.
tags_per_cluster = lda_df.groupby('num_cluster')['tag_lda'].apply(list)
lda_df = pd.DataFrame(tags_per_cluster).reset_index()

In [None]:
lda_df

In [None]:
# Keep, together with its position in the corpus, every document whose
# inferred topic list is shorter than the number of topics of the model
# (lda.num_topics replaces the hard-coded 15).
lda_cluster = [
    [index, row]
    for index, row in enumerate(lda[corpus])
    if len(row) < lda.num_topics
]

In [None]:
# Tabulate the (corpus position, topic list) pairs.
lda_cluster = pd.DataFrame(lda_cluster, columns = ['data_index', 'lda_cluster'])

In [None]:
# Reduce each document's topic list to a list of topic ids.
# The first explode yields one row per element of the list; the second splits
# each element into its components on consecutive rows; [::2] then keeps every
# other row before regrouping by document.
# Assumes each element is a (topic_id, probability) pair, so that the
# even-positioned rows are the topic ids — TODO confirm.
lda_cluster = pd.DataFrame(lda_cluster.explode('lda_cluster').explode('lda_cluster')[::2].groupby('data_index')['lda_cluster'].apply(
    list)).reset_index()

In [None]:
# Keep only the rows of `data` whose index label is one of the retained
# 'data_index' values.
data = data.loc[lda_cluster['data_index'].values.tolist()]

In [None]:
# Attach each document's topic-id list by joining data's index with
# 'data_index', then drop the join key.
data = pd.merge(data, lda_cluster, left_index = True, right_on = "data_index").drop(columns = 'data_index')

In [None]:
def find_topics(data):
    """Return the tags associated with a document's LDA topics.

    Args:
        data: Collection of topic ids assigned to one document.

    Returns:
        A flat list with the tags of every topic in ``data`` that has a row
        in the module-level ``lda_df`` (columns ``num_cluster`` and
        ``tag_lda``), in ``lda_df`` row order.
    """
    lda_tags = []

    # Match on the 'num_cluster' value itself rather than on the row
    # position: lda_df may not have a row for every topic, so row labels and
    # topic ids do not coincide in general.
    for cluster, tags in zip(lda_df['num_cluster'], lda_df['tag_lda']):
        if cluster in data:
            lda_tags.extend(tags)

    return lda_tags

In [None]:
# Map each document's topic ids to tags with find_topics, then drop the raw
# topic-id column.
data['lda_tags'] = data['lda_cluster'].apply(find_topics)
data = data.drop(columns = {'lda_cluster'})

In [None]:
data