# Classification supervisée des questions (approche doc2vec)

## Import des librairies et des données

In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
import nltk
stop_words = nltk.corpus.stopwords.words("english")
for word in ['what', 'how', 'where', 'who', 'which'] :
    stop_words.append(word)
from string import punctuation

In [5]:
from bs4 import BeautifulSoup

In [6]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [7]:
import spacy

In [8]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

In [10]:
from sklearn.multioutput import MultiOutputClassifier

In [11]:
file = open("top_10_tags.txt", "r")
top_10_tags = file.read()
top_10_tags = list(top_10_tags.split('\n')[:-1])
file.close()

In [12]:
data = pd.read_csv("data.csv")

In [13]:
text = data['Title']
text_spl = text.sample(frac = 0.25).reset_index()
text_spl.head()

Unnamed: 0,index,Title
0,10343,Understanding the use of @ModelAttribute and @...
1,20428,How do streaming videos work?
2,33628,Converting procedural PHP into object-oriented...
3,24615,gcc - how to find path of header include file
4,41137,Every time I try to deploy I get - (gcloud.pre...


## Nettoyage des données

In [14]:
def preprocess(text) :

    """" Nettoyage du texte :
    passage au minuscule
    suppression du code éventuel du texte que l'on stocke dans une variable 'code'
    suppression et du contenu des balises autres que p (script, alt, ...)
    suppression des balises html
    conservation des textes labellisés par les top 10 tags uniquement
    suppression de la ponctuation, des chiffres,
    et des stopwords
    lemmatisation par spaCy """
    
    text = text.lower()
    
    soup = BeautifulSoup(text)
    
    if soup.find("code") :        
        code = soup.find("code").get_text()
        soup.find('code').clear()
    text_wo_tags = soup.get_text()
    
    for i in range(1, len(text_wo_tags)) :
        if text_wo_tags[i-1] == 'c' and text_wo_tags[i] == '#' :
            text_wo_tags = text_wo_tags.replace(text_wo_tags[i], 'sharp')
    
    token_list = nltk.word_tokenize(text_wo_tags)
    
    new_text = []
    
    for token in token_list :
        if token in top_10_tags :
            new_text.append(token)
        elif token not in stop_words :
            for char in token :
                if char in punctuation or char.isdigit() :
                    token = token.replace(char, '')
            new_text.append(token)
    
    lem = nltk.stem.WordNetLemmatizer()
    
    for token in new_text :
        if nltk.pos_tag([token])[0][1].startswith('V') :
            index = new_text.index(token)
            token_lem = lem.lemmatize(token, pos = 'v')
            new_text[index] = new_text[index].replace(token, token_lem)
            
    new_text = ' '.join(new_text)

    return new_text

In [15]:
print("Textes bruts :")
print("")
print(text_spl.loc[:11, 'Title'])
print("---------------------------------------")
print("Textes nettoyés :")
print("")
print(text_spl.loc[:11, 'Title'].apply(preprocess))

Textes bruts :

0     Understanding the use of @ModelAttribute and @...
1                         How do streaming videos work?
2     Converting procedural PHP into object-oriented...
3         gcc - how to find path of header include file
4     Every time I try to deploy I get - (gcloud.pre...
5     Starting and stopping IIS Express programmatic...
6            Writing a very basic search form in Django
7             Moving from ints to GUIDs as primary keys
8     How to convert the file/s upload request made ...
9               How to clean up after subprocess.Popen?
10                              Egyptian Fractions in C
11    How to install the Raspberry Pi cross compiler...
Name: Title, dtype: object
---------------------------------------
Textes nettoyés :

0     understand use  modelattribute  requestattribu...
1                                   stream videos work 
2             convert procedural php objectoriented php
3                    gcc  find path header include file
4 



In [16]:
%%time
text_clean = text_spl['Title'].parallel_apply(preprocess)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1323), Label(value='0 / 1323'))), …

CPU times: user 223 ms, sys: 115 ms, total: 338 ms
Wall time: 2.16 s


In [17]:
text_spl['Title_clean'] = text_clean

In [18]:
data = pd.merge(data.iloc[text_spl['index']], text_spl)[['Title', 'Title_clean', 'Tags']]
data.head(3)

Unnamed: 0,Title,Title_clean,Tags
0,Understanding the use of @ModelAttribute and @...,understand use modelattribute requestattribu...,"['java', 'spring']"
1,How do streaming videos work?,stream videos work,"['java', 'javascript']"
2,Converting procedural PHP into object-oriented...,convert procedural php objectoriented php,['php']


## Feature extraction : Doc2Vec embedding

In [19]:
tokenized_docs = text_clean.apply(nltk.word_tokenize).tolist()

In [20]:
tagged_docs = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(tokenized_docs)]

In [21]:
%%time
model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4, epochs=100)

CPU times: user 37.3 s, sys: 14.9 s, total: 52.2 s
Wall time: 39 s


In [22]:
%%time
doc_vectors = [model.infer_vector(doc) for doc in tokenized_docs]

CPU times: user 18.9 s, sys: 51.9 ms, total: 19 s
Wall time: 19 s


In [23]:
doc_vectors = pd.DataFrame(doc_vectors)
doc_vectors.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.815969,0.568768,0.028431,0.272409,0.476963,-0.102974,0.827024,0.409035,-0.24358,-0.50276,...,0.101257,-0.371736,0.095607,-0.227959,1.288437,0.168936,0.188166,-0.030234,0.326549,0.510829
1,0.380908,-0.332142,-0.018626,0.151518,0.237649,0.001713,-0.108884,0.408431,-0.304398,0.079491,...,0.260453,0.17252,-0.077629,0.027335,0.398056,0.179121,0.149117,0.031777,0.216352,0.549914
2,0.297508,-0.073496,0.547482,-0.141867,-0.919239,-0.453138,0.393963,0.591184,-0.3164,0.327702,...,0.308404,-0.208561,-0.020768,0.577534,0.391925,-0.765707,0.326047,-0.414904,0.115831,0.777314


In [24]:
data = doc_vectors.join(data)

## Encoding des tags

In [25]:
for tag in top_10_tags :
    data['is' + tag] = 0
    index = 0
    for doc_tag in data['Tags'] :
        if tag in doc_tag :
            data.loc[index, 'is' + tag] = 1
        index += 1

## Classification supervisée : MultiOutput Logistic Regression

In [26]:
X = data.iloc[:,:-13].values
y = data.iloc[:,-10:].values

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [28]:
%%time
mclr = MultiOutputClassifier(LogisticRegression(max_iter = 1000)).fit(X_train, y_train)

CPU times: user 591 ms, sys: 243 ms, total: 834 ms
Wall time: 180 ms


## Scores

In [29]:
mclr.score(X_train, y_train)

0.2819246507690137

In [30]:
mclr.score(X_test, y_test)

0.26955027212833

In [31]:
jaccard_score(y_test, mclr.predict(X_test), average = 'micro')

0.1760341726618705