In [7]:
import pandas as pd
import numpy as np
import time

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
stop_words = nltk.corpus.stopwords.words("english")
for word in ['what', 'how', 'where', 'who', 'which'] :
    stop_words.append(word)
from string import punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
import spacy

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

In [11]:
from sklearn.multioutput import MultiOutputClassifier

In [12]:
file = open("/content/top_10_tags.txt", "r")
top_10_tags = file.read()
top_10_tags = list(top_10_tags.split('\n')[:-1])
file.close()

In [13]:
data = pd.read_csv("/content/data.csv")

In [14]:
text = data['Title']
text_spl = text.sample(frac = 0.25).reset_index()
text_spl.head()

Unnamed: 0,index,Title
0,12112,Contradictory results between GCC and clang re...
1,33062,Storing JSON in database vs. having a new colu...
2,5625,How to allocate and deallocate heap memory for...
3,25472,Parsing C# code (as string) and inserting addi...
4,26848,Read-Only Field in Django Form


In [15]:
def preprocess(text) :

    """" Nettoyage du texte :
    passage au minuscule
    suppression du code éventuel du texte que l'on stocke dans une variable 'code'
    suppression et du contenu des balises autres que p (script, alt, ...)
    suppression des balises html
    conservation des textes labellisés par les top 10 tags uniquement
    suppression de la ponctuation, des chiffres,
    et des stopwords
    lemmatisation par spaCy """
    
    text = text.lower()
    
    for i in range(1, len(text)) :
        if text[i-1] == 'c' and text[i] == '#' :
            text = text.replace(text[i], 'sharp')
    
    token_list = nltk.word_tokenize(text)
    
    new_text = []
    
    for token in token_list :
        if token in top_10_tags :
            new_text.append(token)
        elif token not in stop_words :
            for char in token :
                if char in punctuation or char.isdigit() :
                    token = token.replace(char, '')
            new_text.append(token)
    
    lem = nltk.stem.WordNetLemmatizer()
    
    for token in new_text :
        if nltk.pos_tag([token])[0][1].startswith('V') :
            index = new_text.index(token)
            token_lem = lem.lemmatize(token, pos = 'v')
            new_text[index] = new_text[index].replace(token, token_lem)
            
    new_text = ' '.join(new_text)

    return new_text

In [16]:
print("Textes bruts :")
print("")
print(text_spl.loc[:11, 'Title'])
print("---------------------------------------")
print("Textes nettoyés :")
print("")
print(text_spl.loc[:11, 'Title'].apply(preprocess))

Textes bruts :

0     Contradictory results between GCC and clang re...
1     Storing JSON in database vs. having a new colu...
2     How to allocate and deallocate heap memory for...
3     Parsing C# code (as string) and inserting addi...
4                        Read-Only Field in Django Form
5     There is in Windows file systems a pre compute...
6                                  Asynchronous Logging
7     Rails: Your user account isn't allowed to inst...
8              CSS:after encoding characters in content
9         Vuejs binding not working if update by jquery
10    EXTENDS challenge: preprocessor function macro...
11    Matlab: Converting a double vector array to st...
Name: Title, dtype: object
---------------------------------------
Textes nettoyés :

0     contradictory results gcc clang related  basic...
1                 store json database vs new column key
2              allocate deallocate heap memory d array 
3     parse csharp code  string  insert additional m...
4 

In [17]:
%%time
text_clean = text_spl['Title'].apply(preprocess)

CPU times: user 13.1 s, sys: 649 ms, total: 13.7 s
Wall time: 13.8 s


In [18]:
text_spl['Title_clean'] = text_clean

In [19]:
data = pd.merge(data.iloc[text_spl['index']], text_spl)[['Title', 'Title_clean', 'Tags']]
data.head(3)

Unnamed: 0,Title,Title_clean,Tags
0,Contradictory results between GCC and clang re...,contradictory results gcc clang related basic...,['c++']
1,Storing JSON in database vs. having a new colu...,store json database vs new column key,"['sql', 'mysql', 'sql-server']"
2,How to allocate and deallocate heap memory for...,allocate deallocate heap memory d array,['arrays']


In [20]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [21]:
def feature_USE_fct(sentences, b_size) :
    batch_size = b_size
    time1 = time.time()

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        feat = embed(sentences[idx:idx+batch_size])

        if step ==0 :
            features = feat
        else :
            features = np.concatenate((features,feat))

    time2 = np.round(time.time() - time1,0)
    return features

In [22]:
batch_size = 10
sentences = data['Title_clean'].to_list()

In [23]:
features_USE = feature_USE_fct(sentences, batch_size)

In [24]:
data = data.join(pd.DataFrame(features_USE))

In [25]:
for tag in top_10_tags :
    data['is' + tag] = 0
    index = 0
    for doc_tag in data['Tags'] :
        if not pd.isnull(doc_tag) :
          if tag in doc_tag :
            data.loc[index, 'is' + tag] = 1
        index += 1

In [26]:
data

Unnamed: 0,Title,Title_clean,Tags,0,1,2,3,4,5,6,...,isc#,isjava,isjavascript,ispython,isc++,isios,isandroid,is.net,ishtml,isphp
0,Contradictory results between GCC and clang re...,contradictory results gcc clang related basic...,['c++'],0.027464,-0.026904,-0.020193,0.023409,0.066413,-0.073831,0.030340,...,0,0,0,0,1,0,0,0,0,0
1,Storing JSON in database vs. having a new colu...,store json database vs new column key,"['sql', 'mysql', 'sql-server']",-0.005332,-0.076616,0.056832,0.046624,0.030570,0.060006,-0.025749,...,0,0,0,0,0,0,0,0,0,0
2,How to allocate and deallocate heap memory for...,allocate deallocate heap memory d array,['arrays'],0.071566,-0.053883,0.068859,-0.047288,-0.055014,-0.013951,0.040101,...,0,0,0,0,0,0,0,0,0,0
3,Parsing C# code (as string) and inserting addi...,parse csharp code string insert additional m...,['c#'],0.003378,-0.032887,0.050277,0.004372,-0.050552,0.078786,-0.052920,...,1,0,0,0,0,0,0,0,0,0
4,Read-Only Field in Django Form,readonly field django form,['python'],-0.009102,0.011841,0.035331,-0.039588,-0.006356,0.070024,0.058150,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10573,SqlException (0x80131904): Invalid object name...,sqlexception x invalid object name dbocateg...,"['c#', 'sql', '.net', 'asp.net-mvc', 'sql-serv...",,,,,,,,...,1,0,0,0,0,0,0,1,0,0
10574,How to throttle login attempts - PHP & MySQL &...,throttle login attempts php mysql codeigniter,"['php', 'sql', 'mysql']",,,,,,,,...,0,0,0,0,0,0,0,0,0,1
10575,Java REST service using authentication token,java rest service use authentication token,['java'],,,,,,,,...,0,1,0,0,0,0,0,0,0,0
10576,CKEditor 5 insert image by external url,ckeditor insert image external url,"['java', 'javascript', '.net', 'asp.net']",,,,,,,,...,0,1,1,0,0,0,0,1,0,0


In [27]:
X = data.iloc[:,3:-10]
y = data.iloc[:,-10:]

In [28]:
X = X[~X[0].isna()]
y = y.iloc[X.index]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [30]:
%%time
mclr = MultiOutputClassifier(LogisticRegression(max_iter = 1000)).fit(X_train, y_train)

CPU times: user 5.88 s, sys: 1.63 s, total: 7.51 s
Wall time: 5.13 s


In [31]:
mclr.score(X_train, y_train)

0.5425787318175399

In [32]:
mclr.score(X_test, y_test)

0.5224992834623101

In [33]:
jaccard_score(y_test, mclr.predict(X_test), average = 'micro')

0.4867178924259056

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [64]:
transformer = FunctionTransformer(feature_USE_fct, kw_args={'b_size':10}, validate = True)

In [71]:
transformer.fit(X_train, y_train)
mclr.fit(X_train, y_train)

In [74]:
pipe = Pipeline([('USE', transformer), ('MultiLogReg', mclr)], verbose = True)

In [34]:
from joblib import dump

In [110]:
dump(pipe, 'trained_use_logreg.joblib')

['trained_use_logreg.joblib']