In [5]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import string
import re
import spacy
from spacy.tokenizer import Tokenizer
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim

from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

from joblib import Parallel, delayed

In [None]:
# ! pip install --user pyLDAvis

In [2]:
data = pd.read_csv('corpus_and_tags.csv')
data.head()

Unnamed: 0,Question,Tags
0,How to convert Decimal to Double in C#? I want...,c# floating-point type-conversion double decimal
1,Calculate relative time in C# Given a specific...,c# datetime time datediff relative-time-span
2,Determine a user's timezone Is there a standar...,html browser timezone user-agent timezone-offset
3,What is the fastest way to get the value of π?...,performance algorithm language-agnostic unix pi
4,How to use the C socket API in C++ on z/OS I'm...,c++ c sockets mainframe zos


In [4]:
X = data['Question']
y = data['Tags']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
data.info()

In [None]:
# import nltk
# import string
# from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('words')
# nltk.download('wordnet')

In [None]:
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [None]:
# nltk.download('omw-1.4')


In [6]:
nlp = spacy.load('en_core_web_sm')
# nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[(\,\?\:\;\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffixes = list(nlp.Defaults.suffixes)
    suffixes.remove("#")
    suffix_re = compile_suffix_regex(suffixes)
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=None)

nlp.tokenizer = custom_tokenizer(nlp)
stopwords_list = stopwords.words('english')

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def preprocess_doc(doc):
#     preprocess_list = []
#     for doc in listofdocs :           
#             print( token.pos_)# if token.pos_ in ['VERB','NOUN','X','PROPN']
#         tokenize_doc = gensim.utils.simple_preprocess(doc ,min_len=1)
    tokenize_doc = [token.lemma_ for token in nlp(doc) if token.pos_ in ['VERB','NOUN','X','PROPN']]
    tokenize_doc_w_punct = [token.lower() for token in tokenize_doc
                                if token not in string.punctuation]
    tokenize_doc_w_num = [token for token in tokenize_doc_w_punct
                              if not is_number(token)]
    tokenize_doc_w_longw = [token for token in tokenize_doc_w_num
                               if len(token)<=40]
    tokenize_doc_w_stopw = [token for token in tokenize_doc_w_longw
                               if token not in stopwords_list]
    tokenize_doc_w_noprint = [token for token in tokenize_doc_w_stopw
                               if token.isprintable()]

#     preprocess_list.append(tokenize_doc_w_noprint)

    return tokenize_doc_w_noprint

In [None]:
preprocess_list_of_docs(['c++ la .la fra-nce.ko chien une \n'])

In [8]:
X_train=X_train.reset_index(drop=True)
preprocess_list = Parallel(n_jobs=22, prefer="threads")(delayed(preprocess_doc)(X_train[i]) for i in range(len(X_train)))
print('Phrase de base : '+X_train[1])
print(preprocess_list[1])

Phrase de base : How to expose std::pair to python using boost::python? How to expose std::pair to python using boost::python? When I expose for example vector<string> I simply write:
class_<std::vector<std::string> >("StringVec")
    .def(vector_indexing_suite<std::vector<std::string> >())
;

But I don't know how to deal with std::pair.

['expose', 'std', 'pair', 'python', 'use', 'boost', 'python', 'expose', 'std', 'pair', 'python', 'use', 'boost', 'python', 'expose', 'example', 'vector<stre', 'write', 'class_<std', 'vector<std', 'string', 'stringvec', 'vector_indexing_suite<std', 'vector<std', 'string', 'know', 'deal', 'std', 'pair']


In [32]:
id2word = corpora.Dictionary(preprocess_list)
print(len(id2word))
id2word.filter_extremes(no_below=10,no_above=0.2,keep_n=None)
print(len(id2word))


1190386
39701


In [33]:
# id2word = corpora.Dictionary(preprocess_list)

corpus=[]
for text in preprocess_list:
    new = id2word.doc2bow(text)
    corpus.append(new)
    
print(corpus[1])
[(id2word[id], count) for id, count in corpus[1]]

[(12, 1), (30, 2), (31, 1), (32, 1), (33, 3), (34, 3), (35, 4), (36, 3), (37, 2), (38, 2), (39, 1), (40, 1)]


[('know', 1),
 ('boost', 2),
 ('deal', 1),
 ('example', 1),
 ('expose', 3),
 ('pair', 3),
 ('python', 4),
 ('std', 3),
 ('string', 2),
 ('vector<std', 2),
 ('vector<stre', 1),
 ('write', 1)]

In [23]:
[(id2word[id], count) for id, count in corpus[1]]

[('boost', 2),
 ('class_<std', 1),
 ('expose', 3),
 ('pair', 3),
 ('stringvec', 1),
 ('vector<std', 2),
 ('vector<stre', 1),
 ('vector_indexing_suite<std', 1)]

In [None]:
#tf-idf removal
from gensim.models import TfidfModel

tfidf = TfidfModel(corpus, id2word=id2word)
low_value = 0.2
words=[]
words_missing_in_tfidf=[]
for i in range(0,len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id,value in bow]
    low_value_words = [id for id,value in tfidf[bow] if value < low_value]
#     drops = low_value_words+words_missing_in_tfidf
#     for items in drops:
#         words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

In [None]:
corpus[1001]

In [None]:
import tqdm


In [None]:
corpus

In [None]:
from collections.abc import Iterable
isinstance(corpus, Iterable)

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [34]:
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                            workers=11,
                                           num_topics=300,
                                           random_state=100,
                                           chunksize=100,
                                           passes=5,)

In [None]:
def extract_topics_keywords(ldamodel, corpus):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = " ".join([word for word, prop in wp[0:5]])
                results = {'Dominant_Topic':topic_num,
                           'Perc_Contribution':round(prop_topic,4),
                           'Topic_Keywords':topic_keywords
                          }
                sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([results])], ignore_index=True)
            else:
                break

    return(sent_topics_df)

topics_df = extract_topics_keywords(ldamodel=lda_model, corpus=corpus)
topics_df = pd.concat([topics_df, data['Tags'][0:10000]], axis=1)
topics_df

In [None]:
tt=sorted(lda_model.get_document_topics(corpus[0]), key=lambda x: (x[1]), reverse=True)
lda_model.show_topic(tt[0][0])

In [None]:
lda_model.get_document_topics(corpus[1])


In [None]:
corpus[1]

In [40]:
def extract_topics_keywords(ldamodel, corpus):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for bow in corpus:
        topics_sorted = sorted(ldamodel.get_document_topics(bow), key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        if topics_sorted:
            wp = ldamodel.show_topic(topics_sorted[0][0])
            topic_keywords = " ".join([word for word, prop in wp[0:5]])
            results = {'Dominant_Topic':topics_sorted[0][0],
                           'Perc_Contribution':round(topics_sorted[0][1],4),
                           'Topic_Keywords':topic_keywords
                          }
            sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([results])], ignore_index=True)
        else:
            results = {'Dominant_Topic':np.Nan,
                           'Perc_Contribution':np.Nan,
                           'Topic_Keywords':np.Nan
                          }
            sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([results])], ignore_index=True)
    return(sent_topics_df)

# topics_df = extract_topics_keywords(ldamodel=lda_model, corpus=corpus)
# topics_df = pd.concat([topics_df, data['Tags'][0:10000]], axis=1)
# topics_df

In [None]:
topics_df[7604:7750]

In [36]:
X_test=X_test.reset_index(drop=True)
X_test_preprocessed = Parallel(n_jobs=22, prefer="threads")(delayed(preprocess_doc)(X_test[i]) for i in range(len(X_test)))
print('Phrase de base : '+X_test[1])
print(X_test_preprocessed[1])

corpus_test=[]
for text in X_test_preprocessed:
    new = id2word.doc2bow(text)
    corpus_test.append(new)
    
print(corpus_test[1])
[(id2word[id], count) for id, count in corpus_test[1]]

Phrase de base : Conditional unique constraint with multiple fields in oracle db I have this table:
XPTO_TABLE (id, obj_x, date_x, type_x, status_x)

I wanna create a unique constraint that applies to the fields (obj_x, date_x, type_x) only when status_x <> 5.
I have tried to create this one but Oracle says:
line 1: ORA-00907: missing right parenthesis

CREATE UNIQUE INDEX UN_OBJ_DT_TYPE_STATUS
ON XPTO_TABLE(
    (CASE
         WHEN STATUS_X <> 5
         THEN
             (OBJ_X,
              TO_CHAR (DATE_X, 'dd/MM/yyyy'),
              TYPE_X)
         ELSE
             NULL
     END));

What's the correct syntax ?

['constraint', 'field', 'oracle', 'db', 'table', 'obj_x', 'date_x', 'wanna', 'create', 'constraint', 'apply', 'field', 'obj_x', 'date_x', 'try', 'create', 'oracle', 'say', 'line', 'ora-00907', 'miss', 'parenthesis', 'create', 'unique', 'index', 'xpto_table', 'case', 'obj_x', 'to_char', 'date_x', 'dd/mm/yyyy', 'type_x', 'else', 'null', 'end', 'syntax']
[(7, 2), (21, 1), 

[('constraint', 2),
 ('say', 1),
 ('end', 1),
 ('index', 1),
 ('case', 1),
 ('null', 1),
 ('table', 1),
 ('unique', 1),
 ('line', 1),
 ('db', 1),
 ('apply', 1),
 ('field', 2),
 ('miss', 1),
 ('syntax', 1),
 ('parenthesis', 1),
 ('oracle', 2),
 ('to_char', 1),
 ('else', 1),
 ('dd/mm/yyyy', 1),
 ('wanna', 1)]

In [42]:
topics_df = extract_topics_keywords(ldamodel=lda_model, corpus=corpus_test)
# topics_df = pd.concat([topics_df, y_test], axis=1,ignore_index = True)
topics_df

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,60,0.1965,integer bit n j max
1,189,0.1399,field column rate equivalent eg
2,218,0.1428,array search label icon escape
3,114,0.2017,page content counter helper user_id
4,277,0.2116,learn question know tutorial apache
...,...,...,...
44296,277,0.2157,learn question know tutorial apache
44297,110,0.2142,test unit expect fail write
44298,59,0.1317,import module package class processing
44299,274,0.3064,cell description tableview uitableview section


In [59]:
topics_df = pd.concat([topics_df, y_test.to_frame().reset_index(drop=True)],ignore_index=False,axis=1)
topics_df

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Tags
0,60,0.1965,integer bit n j max,java recursion lambda java-8 fibonacci
1,189,0.1399,field column rate equivalent eg,sql oracle conditional-statements unique uniqu...
2,218,0.1428,array search label icon escape,powershell escaping character unc select-string
3,114,0.2017,page content counter helper user_id,iphone ios image-processing uiimagepickercontr...
4,277,0.2116,learn question know tutorial apache,javascript model-view-controller web-applicati...
...,...,...,...,...
44296,277,0.2157,learn question know tutorial apache,javascript node.js proxy websocket push-notifi...
44297,110,0.2142,test unit expect fail write,c# unit-testing mocking tdd moq
44298,59,0.1317,import module package class processing,java spring spring-mvc spring-boot spring-data...
44299,274,0.3064,cell description tableview uitableview section,php wordpress woocommerce product hook-woocomm...


In [61]:
from simphile import jaccard_list_similarity
l=[]
for i in range(len(topics_df[0:1000])):
    l.append(jaccard_list_similarity(topics_df['Topic_Keywords'][i].split(), topics_df['Tags'][i].split()))
np.mean(l)

0.027777777777777776

In [35]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model,corpus, id2word, mds='mmds', R=30)
vis

  default_term_info = default_term_info.sort_values(
