# Classification supervisée des questions (approche bow)

## Import des librairies et des données

In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
import nltk
stop_words = nltk.corpus.stopwords.words("english")
for word in ['what', 'how', 'where', 'who', 'which', 'use', 'using'] :
    stop_words.append(word)
from string import punctuation

In [5]:
from bs4 import BeautifulSoup

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [8]:
import spacy

In [9]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [10]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [11]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, jaccard_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

In [12]:
from sklearn.multioutput import MultiOutputClassifier

In [13]:
file = open("top_10_tags.txt", "r")
top_10_tags = file.read()
top_10_tags = list(top_10_tags.split('\n')[:-1])
file.close()

In [14]:
data = pd.read_csv("data.csv")

## Échantillonnage et nettoyage des données

In [15]:
text = data['Title']
text_spl = text.sample(frac = 0.05).reset_index()
text_spl.head()

Unnamed: 0,index,Title
0,38867,Travel directory tree with limited recursion d...
1,17827,dictionaryWithContentsOfFile returning nil fro...
2,40362,"Logcat is being ""spammed"", resulting in ""Too m..."
3,7560,GCDAsynSocketDelegate didReadData method is no...
4,11097,Bootstrap Mobile Not Working


In [16]:
def preprocess(text) :

    """" Nettoyage du texte :
    passage au minuscule
    suppression du code éventuel du texte que l'on stocke dans une variable 'code'
    suppression et du contenu des balises autres que p (script, alt, ...)
    suppression des balises html
    conservation des textes labellisés par les top 30 tags uniquement
    suppression de la ponctuation, des chiffres,
    et des stopwords
    lemmatisation par spaCy """
    
    text = text.lower()
    
    soup = BeautifulSoup(text)
    
    if soup.find("code") :        
        code = soup.find("code").get_text()
        soup.find('code').clear()
    text_wo_tags = soup.get_text()
    
    for i in range(1, len(text_wo_tags)) :
        if text_wo_tags[i-1] == 'c' and text_wo_tags[i] == '#' :
            text_wo_tags = text_wo_tags.replace(text_wo_tags[i], 'sharp')
    
    token_list = nltk.word_tokenize(text_wo_tags)
    
    new_text = []
    
    for token in token_list :
        if token in top_10_tags :
            new_text.append(token)
        elif token not in stop_words :
            for char in token :
                if char in punctuation or char.isdigit() :
                    token = token.replace(char, '')
            new_text.append(token)
    
    lem = nltk.stem.WordNetLemmatizer()
    
    for token in new_text :
        if nltk.pos_tag([token])[0][1].startswith('V') :
            index = new_text.index(token)
            token_lem = lem.lemmatize(token, pos = 'v')
            new_text[index] = new_text[index].replace(token, token_lem)
            
    new_text = ' '.join(new_text)

    return new_text

In [17]:
print("Textes bruts :")
print("")
print(text_spl.loc[:11, 'Title'])
print("---------------------------------------")
print("Textes nettoyés :")
print("")
print(text_spl.loc[:11, 'Title'].apply(preprocess))

Textes bruts :

0     Travel directory tree with limited recursion d...
1     dictionaryWithContentsOfFile returning nil fro...
2     Logcat is being "spammed", resulting in "Too m...
3     GCDAsynSocketDelegate didReadData method is no...
4                          Bootstrap Mobile Not Working
5     Any way to make html5 audio responsive in Boot...
6                   How do compilers optimize our code?
7      20 Receives per second with SocketAsyncEventArgs
8     "sh: ./<file> not found" error when trying to ...
9     How to determine if Native JavaScript Object h...
10    Why is it slower to iterate over a small strin...
11    Is there a <meta> tag to turn off caching in a...
Name: Title, dtype: object
---------------------------------------
Textes nettoyés :

0         travel directory tree limited recursion depth
1     dictionarywithcontentsoffile return nil proper...
2        logcat  spammed   result  much output process 
3     gcdasynsocketdelegate didreaddata method call ...
4 



In [18]:
%%time
text_clean = text_spl['Title'].parallel_apply(preprocess)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=265), Label(value='0 / 265'))), HB…

CPU times: user 128 ms, sys: 81.4 ms, total: 210 ms
Wall time: 622 ms


In [19]:
text_spl['Title_clean'] = text_clean

In [20]:
data = pd.merge(data.iloc[text_spl['index']], text_spl)[['Title', 'Title_clean', 'Tags']]
data.head(3)

Unnamed: 0,Title,Title_clean,Tags
0,Travel directory tree with limited recursion d...,travel directory tree limited recursion depth,['python']
1,dictionaryWithContentsOfFile returning nil fro...,dictionarywithcontentsoffile return nil proper...,"['iphone', 'objective-c']"
2,"Logcat is being ""spammed"", resulting in ""Too m...",logcat spammed result much output process,['android']


## Classification supervisée

### Extraction de features par Bag-of-Words

In [21]:
vec = CountVectorizer()

In [22]:
X = vec.fit_transform(text_spl['Title'])

In [23]:
vecdf = pd.DataFrame(X.todense().tolist(), columns = vec.get_feature_names_out())
vecdf

Unnamed: 0,00,000,0000,01,04,0beta,0x,0x2,0x2efd,0x7f090047,...,yyyy,zend,zero,zone,zones,zoom,zoomed,zsh,ɵɵdefineinjectable,ɵɵinject
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2111,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2114,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df = data
data = vecdf.join(data)
#data['ispython'] = False
data = data[~data['Tags'].isna()]

### Encoding des tags

In [25]:
for tag in top_10_tags :
    data['is' + tag] = 0
    index = 0
    for doc_tag in data['Tags'] :
        if tag in doc_tag :
            data.loc[index, 'is' + tag] = 1
        index += 1

In [26]:
data.iloc[:5, -11:]

Unnamed: 0,Tags,isc#,isjava,isjavascript,ispython,isc++,isios,isandroid,is.net,ishtml,isphp
0,['python'],0,0,0,1,0,0,0,0,0,0
1,"['iphone', 'objective-c']",0,0,0,0,0,0,0,0,0,0
2,['android'],0,0,0,0,0,0,1,0,0,0
3,"['ios', 'iphone', 'objective-c']",0,0,0,0,0,1,0,0,0,0
4,['html'],0,0,0,0,0,0,0,0,1,0


### Essai de différents modèles de classification

In [27]:
X = data.iloc[:,:-13].values
y = data.iloc[:,-10:].values

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [29]:
dummy = DummyClassifier()
dummy.fit(X_train, y_train)

DummyClassifier()

In [30]:
dummy.score(X_train,y_train)

0.15808045165843332

In [31]:
dummy.score(X_test,y_test)

0.13304721030042918

In [32]:
def model_testing(X_train, X_test, y_train, y_test, model_list) :
    train_score = []
    test_score = []
    fit_time = []
    jaccard = []
    
    for model in model_list :
        start = time.time()
        mcmodel = MultiOutputClassifier(model)
        mcmodel.fit(X_train, y_train)
        stop = time.time()
        train_score.append(mcmodel.score(X_train, y_train))
        test_score.append(mcmodel.score(X_test, y_test))
        fit_time.append(stop - start)
        jaccard.append(jaccard_score(y_test, mcmodel.predict(X_test), average = 'micro'))
        
    train_score = pd.DataFrame(train_score, model_list, columns = ['train_score']).reset_index()
    test_score = pd.DataFrame(test_score, model_list, columns = ['test_score']).reset_index()
    fit_time = pd.DataFrame(fit_time, model_list, columns = ['fit_time']).reset_index()
    jaccard = pd.DataFrame(jaccard, model_list, columns = ['jaccard_score']).reset_index()
    result = pd.merge(train_score, test_score)
    result = pd.merge(result, fit_time)
    result = pd.merge(result, jaccard)
    
    return result

In [33]:
model_list = [LogisticRegression(), KNeighborsClassifier(), SVC(), RandomForestClassifier()]

In [34]:
%%time
model_testing(X_train, X_test, y_train, y_test, model_list)

CPU times: user 5min 41s, sys: 8.59 s, total: 5min 50s
Wall time: 2min 2s


Unnamed: 0,index,train_score,test_score,fit_time,jaccard_score
0,LogisticRegression(),0.757234,0.329041,1.138505,0.272216
1,KNeighborsClassifier(),0.318278,0.164521,0.002466,0.106842
2,SVC(),0.578687,0.251788,30.240991,0.156103
3,RandomForestClassifier(),0.997883,0.32618,8.263376,0.268124


### Vérification du modèle retenu

In [35]:
mclr = MultiOutputClassifier(LogisticRegression())

In [36]:
mclr.fit(X_train, y_train)

MultiOutputClassifier(estimator=LogisticRegression())

In [37]:
mclr.score(X_train, y_train)

0.7572335920959774

In [38]:
mclr.score(X_test, y_test)

0.3290414878397711

In [39]:
jaccard_score(y_test, mclr.predict(X_test), average = 'micro')

0.2722159730033746

In [40]:
data = data.iloc[:,-13:]

In [41]:
data

Unnamed: 0,Title,Title_clean,Tags,isc#,isjava,isjavascript,ispython,isc++,isios,isandroid,is.net,ishtml,isphp
0,Travel directory tree with limited recursion d...,travel directory tree limited recursion depth,['python'],0,0,0,1,0,0,0,0,0,0
1,dictionaryWithContentsOfFile returning nil fro...,dictionarywithcontentsoffile return nil proper...,"['iphone', 'objective-c']",0,0,0,0,0,0,0,0,0,0
2,"Logcat is being ""spammed"", resulting in ""Too m...",logcat spammed result much output process,['android'],0,0,0,0,0,0,1,0,0,0
3,GCDAsynSocketDelegate didReadData method is no...,gcdasynsocketdelegate didreaddata method call ...,"['ios', 'iphone', 'objective-c']",0,0,0,0,0,1,0,0,0,0
4,Bootstrap Mobile Not Working,bootstrap mobile work,['html'],0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2111,WPF - Compilation error: Tags of type 'Propert...,wpf compilation error tags type propertyarra...,['.net'],0,0,0,0,0,0,0,1,0,0
2112,How to create a dynamic LINQ join extension me...,create dynamic linq join extension method,"['c#', 'sql']",1,0,0,0,0,0,0,0,0,0
2113,Can't click Allow button in permission dialog ...,ca nt click allow button permission dialog and...,"['java', 'android']",0,1,0,0,0,0,1,0,0,0
2114,How to find sqlalchemy remote side object's cl...,find sqlalchemy remote side object s class cla...,"['python', 'sql']",0,0,0,1,0,0,0,0,0,0


In [42]:
def sup_predict(text, true_tags, vec = vec, tags_list = top_10_tags, model = mclr) :
    text_clean = preprocess(text)
    bow = vec.transform([text_clean]).toarray()
    pred = model.predict(bow)
    pred_tags = []
    for i in range(len(pred[0])) :
        if pred[0][i] == 1 :
            pred_tags.append(tags_list[i])
    return true_tags, pred_tags

In [43]:
result = data.apply(lambda x: sup_predict(x.Title_clean, x.Tags), axis=1)
result = pd.concat([pd.DataFrame(result.explode()[::2]), pd.DataFrame(result.explode()[1::2])], axis = 1)
result

Unnamed: 0,0,0.1
0,['python'],[python]
1,"['iphone', 'objective-c']",[]
2,['android'],[]
3,"['ios', 'iphone', 'objective-c']",[]
4,['html'],[]
...,...,...
2111,['.net'],[.net]
2112,"['c#', 'sql']",[c#]
2113,"['java', 'android']","[java, android]"
2114,"['python', 'sql']",[]
