In [26]:
import pandas as pd
import numpy as np
import time

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
stop_words = nltk.corpus.stopwords.words("english")
for word in ['what', 'how', 'where', 'who', 'which'] :
    stop_words.append(word)
from string import punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
import spacy

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.multioutput import MultiOutputClassifier

In [13]:
file = open("/content/top_10_tags.txt", "r")
top_10_tags = file.read()
top_10_tags = list(top_10_tags.split('\n')[:-1])
file.close()

In [16]:
data = pd.read_csv("/content/data.csv")

In [17]:
text = data['Title']
text_spl = text.sample(frac = 0.25).reset_index()
text_spl.head()

Unnamed: 0,index,Title
0,7188,how to access downloads folder in android?
1,20129,What's the equivalent of Windows' QueryPerform...
2,20625,"Using hibernate criteria, is there a way to es..."
3,28125,Possible to create a single multi-type collect...
4,32800,How to see output of print statements when scr...


In [18]:
def preprocess(text) :

    """" Nettoyage du texte :
    passage au minuscule
    suppression du code éventuel du texte que l'on stocke dans une variable 'code'
    suppression et du contenu des balises autres que p (script, alt, ...)
    suppression des balises html
    conservation des textes labellisés par les top 10 tags uniquement
    suppression de la ponctuation, des chiffres,
    et des stopwords
    lemmatisation par spaCy """
    
    text = text.lower()
    
    for i in range(1, len(text)) :
        if text[i-1] == 'c' and text[i] == '#' :
            text = text.replace(text[i], 'sharp')
    
    token_list = nltk.word_tokenize(text)
    
    new_text = []
    
    for token in token_list :
        if token in top_10_tags :
            new_text.append(token)
        elif token not in stop_words :
            for char in token :
                if char in punctuation or char.isdigit() :
                    token = token.replace(char, '')
            new_text.append(token)
    
    lem = nltk.stem.WordNetLemmatizer()
    
    for token in new_text :
        if nltk.pos_tag([token])[0][1].startswith('V') :
            index = new_text.index(token)
            token_lem = lem.lemmatize(token, pos = 'v')
            new_text[index] = new_text[index].replace(token, token_lem)
            
    new_text = ' '.join(new_text)

    return new_text

In [19]:
print("Textes bruts :")
print("")
print(text_spl.loc[:11, 'Title'])
print("---------------------------------------")
print("Textes nettoyés :")
print("")
print(text_spl.loc[:11, 'Title'].apply(preprocess))

Textes bruts :

0            how to access downloads folder in android?
1     What's the equivalent of Windows' QueryPerform...
2     Using hibernate criteria, is there a way to es...
3     Possible to create a single multi-type collect...
4     How to see output of print statements when scr...
5     How to horizontally center a floating element ...
6     Django models - how to filter out duplicate va...
7                          An analog to rnorm in python
8     How does "get_user_pages" work (For linux driver)
9     Django: How to access URL regex parameters ins...
10    How to achieve desired results when using the ...
11    Generating a 'Hello, World!' class with the Ja...
Name: Title, dtype: object
---------------------------------------
Textes nettoyés :

0                      access downloads folder android 
1     s equivalent windows  queryperformancecounter ...
2     use hibernate criteria  way escape special cha...
3     possible create single multitype collection mu...
4 

In [20]:
%%time
text_clean = text_spl['Title'].apply(preprocess)

CPU times: user 15.6 s, sys: 744 ms, total: 16.3 s
Wall time: 17.4 s


In [21]:
text_spl['Title_clean'] = text_clean

In [22]:
data = pd.merge(data.iloc[text_spl['index']], text_spl)[['Title', 'Title_clean', 'Tags']]
data.head(3)

Unnamed: 0,Title,Title_clean,Tags
0,how to access downloads folder in android?,access downloads folder android,"['java', 'android']"
1,What's the equivalent of Windows' QueryPerform...,s equivalent windows queryperformancecounter ...,"['c++', 'windows']"
2,"Using hibernate criteria, is there a way to es...",use hibernate criteria way escape special cha...,['java']


In [28]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [23]:
def feature_USE_fct(sentences, b_size) :
    batch_size = b_size
    time1 = time.time()

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        feat = embed(sentences[idx:idx+batch_size])

        if step ==0 :
            features = feat
        else :
            features = np.concatenate((features,feat))

    time2 = np.round(time.time() - time1,0)
    return features

In [24]:
batch_size = 10
sentences = data['Title_clean'].to_list()

In [29]:
features_USE = feature_USE_fct(sentences, batch_size)

In [32]:
data = data.join(pd.DataFrame(features_USE))

In [85]:
for tag in top_10_tags :
    data['is' + tag] = 0
    index = 0
    for doc_tag in data['Tags'] :
        if not pd.isnull(doc_tag) :
          if tag in doc_tag :
            data.loc[index, 'is' + tag] = 1
        index += 1

In [86]:
data

Unnamed: 0,Title,Title_clean,Tags,0,1,2,3,4,5,6,...,isc#,isjava,isjavascript,ispython,isc++,isios,isandroid,is.net,ishtml,isphp
0,how to access downloads folder in android?,access downloads folder android,"['java', 'android']",-0.008625,0.066267,0.059288,0.047522,0.012286,0.021743,0.037077,...,0,1,0,0,0,0,1,0,0,0
1,What's the equivalent of Windows' QueryPerform...,s equivalent windows queryperformancecounter ...,"['c++', 'windows']",0.040289,0.002106,-0.033103,0.044113,0.067805,-0.055370,-0.024531,...,0,0,0,0,1,0,0,0,0,0
2,"Using hibernate criteria, is there a way to es...",use hibernate criteria way escape special cha...,['java'],-0.053190,-0.028141,0.006160,-0.011546,0.055744,0.011479,-0.014696,...,0,1,0,0,0,0,0,0,0,0
3,Possible to create a single multi-type collect...,possible create single multitype collection mu...,"['c#', '.net']",0.024036,-0.051749,0.034761,-0.026690,0.031129,0.065448,-0.007786,...,1,0,0,0,0,0,0,1,0,0
4,How to see output of print statements when scr...,see output print statements scrapy logger enable,['python'],0.069326,0.052986,-0.055654,0.015319,-0.030124,0.054453,-0.025153,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10573,List throws ConcurrentModificationException bu...,list throws concurrentmodificationexception se...,['java'],,,,,,,,...,0,1,0,0,0,0,0,0,0,0
10574,How to call a local web service from an Androi...,call local web service android mobile application,"['.net', 'android']",,,,,,,,...,0,0,0,0,0,0,1,1,0,0
10575,ReSharper giving C# 3.0 Code Inspection Warnin...,resharper give csharp code inspection warning...,"['c#', '.net', 'asp.net']",,,,,,,,...,1,0,0,0,0,0,0,1,0,0
10576,OpenGL fbo blitting inconsistent between Intel...,opengl fbo blitting inconsistent intel nvidia,['java'],,,,,,,,...,0,1,0,0,0,0,0,0,0,0


In [87]:
X = data.iloc[:,3:-10]
y = data.iloc[:,-10:]

In [88]:
X = X[~X[0].isna()]
y = y.iloc[X.index]

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [90]:
%%time
mclr = MultiOutputClassifier(LogisticRegression(max_iter = 1000)).fit(X_train, y_train)

CPU times: user 4.13 s, sys: 1.26 s, total: 5.39 s
Wall time: 3.45 s


In [91]:
mclr.score(X_train, y_train)

0.5352351362801864

In [92]:
mclr.score(X_test, y_test)

0.5227858985382631

In [94]:
jaccard_score(y_test, mclr.predict(X_test), average = 'micro')

0.4939236111111111