In [1]:
import importlib
import os
from joblib import dump, load
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk import download as nltk_download
from nltk import tokenize, RegexpTokenizer
from nltk.corpus import stopwords
'''Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.'''
nltk_download('punkt')
nltk_download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

import ml.explo as mlexplo
import ml.prepare as mlprepare

from nltk.stem.snowball import SnowballStemmer

# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

data_path = "..\\data\\stackoverflow\\"
model_path = data_path + "models\\"

np.set_printoptions(precision=2, suppress=True)
pd.options.display.float_format = "{:,.2f}".format

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data_filename = "OC_DS_P6.csv"
tags_filename = "Stackoverflow_top_tags.csv"
init = False

In [3]:
def apply_bs4(raw_text):
#     return '. '.join(
#         [x.find(text=True)
#          for x in BeautifulSoup(raw_text, 'html.parser').find_all('p')
#          if x.find(text=True)]
    soup = BeautifulSoup(raw_text, 'html.parser')
    # getting rid of programming language identification for now,
    # could rely on guesslang package but conflict with tensorflow installed version and 
    # don't wanna downgrade, therefore code stored in dedicated variable for futur use.
    code = ""
    for s in soup.find_all('code'):
        code = code + s.get_text()
        s.extract()
    return re.sub(r'\n+', ' ', soup.get_text()).strip(), re.sub(r'\n+', ' ', code).strip()
        
def apply_tags(raw_text):
    return [ tag.name for tag in BeautifulSoup(raw_text, 'html.parser').find_all() if tag.name in valid_tags ]

In [4]:
df_tags = pd.read_csv(data_path + tags_filename)
valid_tags = df_tags['TagName'].tolist()
if init:
    df_base = pd.read_csv(data_path + data_filename)
    df_base['Bs4_Body'], df_base['Code'] = zip(*df_base['Body'].apply(apply_bs4))
    df_base['Bs4_Tags'] = df_base['Tags'].apply(apply_tags)

    df_base.to_pickle(data_path + 'bs4.pkl')
else:
    df_base = pd.read_pickle(data_path + 'bs4.pkl')

In [5]:
def nltk_post_tokens(row):
#     title_tokens = tokenizer.tokenize(row['Title'].lower())
    title_tokens = [ stemmer.stem(w) for w in tokenizer.tokenize(row['Title'].lower()) if w not in nltk_stop_words ] 
    body_tokens = [ stemmer.stem(w) for w in tokenizer.tokenize(row['Bs4_Body'].lower()) if w not in nltk_stop_words ]
    return title_tokens, body_tokens

In [6]:
init = False
if init:
    tokenizer = RegexpTokenizer(r'\w+')
    nltk_stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")

    df_nltk = pd.DataFrame()
    df_nltk = df_base.apply(nltk_post_tokens, axis=1, result_type='expand')
    df_nltk.columns=['Title', 'Body']
    df_nltk.to_pickle(data_path + 'nltk_token_stem.pkl')
else:
    df_nltk = pd.read_pickle(data_path + 'nltk_token_stem.pkl')

In [7]:
df_nltk

Unnamed: 0,Title,Body
0,"[determin, smallest, common, divisor, string]","[ask, follow, question, job, interview, stump,..."
1,"[use, trait, object, contain, method, return, ...","[correct, way, use, trait, object, contain, me..."
2,"[name, function, declar, hoist, put, insid, re...","[put, function, declar, besid, return, stateme..."
3,"[get, valu, input, caus, flicker, react]","[attempt, track, valu, two, login, input, fiel..."
4,"[assign, vector, atom, type]","[assign, member, vector, atom, type, https, wa..."
...,...,...
49995,"[nullreferenceexcept, simpleinjector, contain,...","[still, process, get, wpf, also, decid, give, ..."
49996,"[add, grid, line, catplot, seaborn]","[add, grid, line, vertic, horizont, catplot, f..."
49997,"[standard, deviat, data, meet, two, condit, ex...","[tri, calcul, standard, deviat, data, point, m..."
49998,"[proper, rais, except, tensorflow, graph]","[want, rais, except, depend, valu, input, tens..."


In [10]:
def filter_top_tags(df, column, count=10, default=None):
    top_tags = mlprepare.prepare_multi_label(df,'Bs4_Tags')[:count]
    ret = df[column].apply(lambda cell: [x for x in cell if x in top_tags])
    if default:
        ret = ret.apply(lambda cell: cell if len(cell)>0 else ['other'])
    return ret

In [12]:
create_baseline = True
df_base = pd.read_pickle(data_path + 'bs4.pkl')
if(create_baseline):
    df_top = pd.DataFrame()
    df_top['Tags_T100'] = filter_top_tags(df_base,'Bs4_Tags', count=100, default='other')
    df_top['Tags_T50'] = filter_top_tags(df_base,'Bs4_Tags', count=50, default='other')
    df_top['Tags_T10'] = filter_top_tags(df_base,'Bs4_Tags', count=10, default='other')
    df_top.to_pickle(data_path + 'Top_Tags.pkl')
else:
    df_top = pd.read_pickle(data_path + 'Top_Tags.pkl')

In [31]:
df_top[2500:2520]

Unnamed: 0,Tags_T100,Tags_T50,Tags_T10
2500,"[python, python-3.x]","[python, python-3.x]","[python, python-3.x]"
2501,[asp.net-core],[asp.net-core],[other]
2502,[c#],[c#],[c#]
2503,"[javascript, reactjs]","[javascript, reactjs]",[javascript]
2504,[other],[other],[other]
2505,"[python, python-3.x]","[python, python-3.x]","[python, python-3.x]"
2506,[python-3.x],[python-3.x],[python-3.x]
2507,[c++],[c++],[c++]
2508,[scala],[other],[other]
2509,"[sql, sql-server, tsql]","[sql, sql-server]",[sql]


In [29]:
df_base['Tags'][2500:2520]

2500                       <python><python-3.x><wxpython>
2501                  <asp.net-core><api-gateway><ocelot>
2502                             <c#><.net><asynchronous>
2503                                <javascript><reactjs>
2504    <eclipse><grails><groovy><eclipse-plugin><grai...
2505                           <python><python-3.x><kivy>
2506    <python-3.x><neural-network><pytorch><conv-neu...
2507                                  <c++><new-operator>
2508                         <scala><future><retry-logic>
2509                              <sql><sql-server><tsql>
2510                                                  <r>
2511            <c#><multithreading><console-application>
2512                                     <python><django>
2513              <.net><.net-core><dependency-injection>
2514      <javascript><google-apps-script><google-sheets>
2515    <amazon-web-services><amazon-ec2><aws-code-dep...
2516    <azure><azure-functions><azure-web-app-service...
2517          

In [15]:
def prepare_baseline_ys(df, tags, col):
    ret =  dict({})
    for tag in tags.index:
        sr_tag = df[col].copy(deep=True)
        sr_tag = sr_tag.apply(lambda x: 1 if tag in x else 0)
        ret[tag] = sr_tag
    return ret

tags_10 = mlprepare.prepare_multi_label(df_top,'Tags_T10')
y_dict_10 = prepare_baseline_ys(df_top, tags_10, 'Tags_T10')
dump(y_dict_10, data_path + 'y_dict_top10.joblib')

tags_50 = mlprepare.prepare_multi_label(df_top,'Tags_T50')
y_dict_50 = prepare_baseline_ys(df_top, tags_50, 'Tags_T50')
dump(y_dict_50, data_path + 'y_dict_top50.joblib')

tags_100 = mlprepare.prepare_multi_label(df_top,'Tags_T100')
y_dict_100 = prepare_baseline_ys(df_top, tags_100, 'Tags_T100')
dump(y_dict_100, data_path + 'y_dict_top100.joblib')

['..\\data\\stackoverflow\\y_dict_top100.joblib']

In [47]:
y_dict_10['azure'][2516]

KeyError: 'azure'

In [48]:
y_dict_50['azure'][2516]

1

In [49]:
y_dict_100['azure'][2516]

1

In [16]:
title_corpus = [' '.join(map(str, w)) for w in df_nltk['Title'].tolist()]
dump(title_corpus, data_path + 'title_corpus.joblib')

title_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=1000)
title_vectorizer.fit(title_corpus)
dump(title_vectorizer, model_path + 'title_vectorizer.joblib')

['..\\data\\stackoverflow\\models\\title_vectorizer.joblib']

In [17]:
body_corpus = [' '.join(map(str, w)) for w in df_nltk['Body'].tolist()]
dump(body_corpus, data_path + 'body_corpus.joblib')

body_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=1000)
body_vectorizer.fit(body_corpus)
dump(body_vectorizer, model_path + 'body_vectorizer_11.joblib')

['..\\data\\stackoverflow\\models\\body_vectorizer_11.joblib']

In [18]:
body_vectorizer2 = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
body_vectorizer2.fit(body_corpus)
dump(body_vectorizer2, model_path + 'body_vectorizer_12.joblib')

['..\\data\\stackoverflow\\models\\body_vectorizer_12.joblib']

In [19]:
body_vectorizer3 = TfidfVectorizer(ngram_range=(1,3), max_features=4000)
body_vectorizer3.fit(body_corpus)
dump(body_vectorizer3, model_path + 'body_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\body_vectorizer_13.joblib']

In [10]:
mlprepare.prepare_multi_label(df_nltk,'Body')[:30]

use         32185
tri         24186
code        22199
like        22120
want        17917
get         17646
work        15647
would       13386
valu        13320
follow      13187
data        12656
function    11427
error       11036
file        10899
way         10709
one         10314
need         9938
creat        9697
exampl       9566
column       9550
1            9338
2            7832
look         7775
output       7385
run          7346
list         7312
problem      7141
thank        7139
help         6846
result       6797
dtype: int64

In [12]:
mlprepare.prepare_multi_label(df_nltk,'Tags')[:50]

python           8882
javascript       5558
r                4045
pandas           3300
java             2917
c++              2688
c#               2632
python-3.x       2173
sql              2032
html             1704
dataframe        1571
reactjs          1550
arrays           1485
c                1362
css              1215
regex            1172
android          1119
php              1093
typescript       1059
swift            1023
node.js           982
mysql             890
angular           875
jquery            872
numpy             837
json              805
list              733
postgresql        687
ios               653
mongodb           651
dplyr             640
django            630
sql-server        621
swiftui           607
string            593
powershell        585
vue.js            557
azure             551
ggplot2           537
flutter           517
bash              499
git               496
excel             492
firebase          481
asp.net-core      477
google-she

In [16]:
mlprepare.prepare_multi_label(df_nltk,'Body').count()

4290

In [44]:
df_nltk.loc[51]

Title         [creating, symmetric, array, power, element]
Body     [trying, create, array, symmetric, elements, p...
Tags          [python, numpy, for-loop, matrix, symmetric]
Name: 51, dtype: object

In [7]:
df_base[df_base['Id']==59551201].index
df_base.loc[51,'Body']

'<p>I am trying to create an array which is symmetric with elements placed as below</p>\n\n<p><a href="https://i.stack.imgur.com/QzkLo.gif" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/QzkLo.gif" alt="correct_pic"></a></p>\n\n<p>I have written the following code to get this form with parameter being 0.5 and dimension being <code>4-by-4</code>.</p>\n\n<pre class="lang-py prettyprint-override"><code>import numpy as np\na = np.eye(4)\nfor i in range(4):\n    for j in range(4):\n        a[i, j] = (0.5) ** (np.abs(i-j))\n</code></pre>\n\n<p>This does what I need but for large dimension (1000s) this causes a lot of overhead. Is there any other low complexity method to get this matrix? Thanks.</p>\n'

In [24]:
df_base.loc[51]

Id                                                59551201
Title    Creating a symmetric array with power of an el...
Body     <p>I am trying to create an array which is sym...
Tags          <python><numpy><for-loop><matrix><symmetric>
bf4      I am trying to create an array which is symmet...
Name: 51, dtype: object

In [14]:
df_base

Unnamed: 0,Id,Title,Body,Tags,bf4
0,59549234,How to determine the smallest common divisor o...,<p>I was asked the following question during a...,<string><algorithm>,I was asked the following question during a jo...
1,59549281,How to use a trait object that contains a meth...,<p>What's the correct way to use a trait objec...,<rust>,What's the correct way to use a trait object t...
2,59549471,Why a named function declaration isn't hoisted...,<p>is putting the function declaration beside ...,<javascript>,is putting the function declaration beside the...
3,59549568,Getting value of input causing flickering in R...,<p>I am attempting to track the value of two l...,<javascript><css><reactjs><react-hooks><react-...,I am attempting to track the value of two logi...
4,59549600,How to assign a vector of atomic types?,<p>How can I assign the members of a vector wi...,<c++><multithreading><c++11><stdvector><stdato...,How can I assign the members of a vector with ...
...,...,...,...,...,...
95,59552693,Handle Authentication in ajax call from MVC cl...,"<p>I use Identity server 4 for Authentication,...",<ajax><asp.net-core><asp.net-core-webapi><iden...,"I use Identity server 4 for Authentication, On..."
96,59552721,Unrecognized option or bad number of args for:...,"<p>When I run <code>redis-cli script load ""$(c...",<redis><lua><redis-cli>,When I run . I get a sha . How is this possibl...
97,59552867,Instagram login with tor,<p>i am trying to login to instagram through t...,<python-3.x><tor>,i am trying to login to instagram through tor\...
98,59552875,Nodejs 12 Callback not working for mysql conne...,<p>I'm writing this code to run in AWS Lambda....,<mysql><node.js><aws-lambda>,I'm writing this code to run in AWS Lambda. ....


In [4]:
print(spacy.__version__)

3.0.5


In [6]:
nlp = spacy.load("en_core_web_sm")

In [33]:
# doc = nlp(df_base.loc[51,'bf4'])
doc_title = nlp(df_base['Title'][:10000].str.cat(sep='. '))


In [8]:
nlp.max_length=2500000
doc_body = nlp(df_base['bs4'][:4000].str.cat(sep='. '))

In [32]:
doc_title[50].vector

array([-0.91, -0.64, -0.78, -0.07,  0.92,  1.59, -0.02, -0.31,  2.27,
        0.29, -0.74,  1.4 , -0.51, -0.34,  0.77, -0.23, -0.1 , -0.03,
        1.99, -0.27, -0.72, -0.6 , -1.19,  0.11, -0.47, -0.7 , -0.33,
       -0.05, -0.34, -0.65, -0.29,  0.17,  0.  ,  0.02, -0.71, -0.08,
        0.13, -0.35,  0.59,  0.14,  1.25,  1.34, -0.42, -0.12, -0.89,
        0.33, -1.06, -0.62, -0.67, -0.07, -0.15, -0.38,  0.07, -0.06,
       -1.16,  1.15, -0.27, -0.23, -0.82,  0.91, -0.74,  0.23,  0.99,
       -0.93,  1.03,  0.6 ,  1.06, -0.79, -0.69,  1.85,  0.82, -0.77,
       -0.63, -1.13,  0.29, -1.06, -1.09,  0.66, -0.75,  0.89,  0.16,
        0.27,  0.49,  1.62, -0.04, -1.46,  0.09,  0.48, -0.96, -0.98,
       -0.83,  1.54,  0.95, -0.59,  1.71, -0.11], dtype=float32)

In [9]:
doc_body[50].vector

array([-0.4 , -0.73,  1.05, -0.05, -0.53,  0.06,  0.17,  0.51, -1.07,
        0.  , -0.83,  0.41, -0.42, -0.71, -0.96, -0.17, -1.77,  1.46,
        0.56, -1.22, -0.69, -0.9 ,  0.56,  0.21,  0.48, -1.16, -0.24,
       -1.09, -0.05, -0.17,  0.28,  0.35,  0.98,  0.69, -0.48,  0.06,
        1.22, -1.19, -0.57,  0.91,  0.68,  1.24, -0.  , -0.7 ,  0.28,
       -0.84,  2.12, -0.59, -0.42, -0.44,  0.43,  0.69, -0.33, -0.11,
       -0.28,  1.11,  0.97, -0.28,  1.74, -0.44, -0.09, -0.26,  0.19,
        0.3 ,  1.14,  1.26, -0.43, -1.39, -0.88, -0.52,  0.64,  1.3 ,
       -0.74, -0.08, -0.46,  0.43, -0.17, -1.53, -0.  ,  0.55, -0.48,
       -0.01,  0.18,  0.8 , -0.59,  0.04, -0.99,  0.22,  1.15, -0.64,
       -0.18,  0.37, -1.28,  0.57,  0.24, -0.28], dtype=float32)

In [27]:
print("Noun phrases:", [chunk.text for chunk in doc_title.noun_chunks])



In [26]:
print("Verbs:", [token.lemma_ for token in doc_title if token.pos_ == "VERB"])

Verbs: ['determine', 'use', 'contain', 'return', 'name', 'hoist', 'put', 'get', 'cause', 'assign', 'get', 'use', 'get', 'store', 'import', 'have', 'get', 'work', 'use', 'filter', 'use', 'use', 'reject', 'capture', 'parse', 'move', 'work', 'fix', 'wrap', 'use', 'domnodeinserte', 'fopen', 'expand', 'use', 'set', 'get', 'understand', 'point', 'commit', 'use', 'nest', 'add', 'excel', 'use', 'handle', 'include', 'have', 'configure', 'login', 'react', 'convert', 'list', 'make', 'save', 'use', 'combine', 'separate', 'melt', 'define', 'specify', 'convert', 'move', 'get', 'give', 'get', 'match', 'use', 'roll', 'avoid', 'generate', 'use', 'intersect', 'draw', 'expect', 'attribute', 'use', 'check', 'reduce', 'return', 'pseudo', 'use', 'create', 'show', 'work', 'expect', 'store', 'join', 'use', 'df.mode', 'init', 'make', 'behave', 'hide', 'show', 'extract', 'contain', 'use', 'connect', 'send', 'use', 'use', 'select', 'recognize', 'remove', 're', '-', 'add', 'build', 'sort', 'get', 'include', 'work

In [27]:
print("Noun phrases:", [chunk.text for chunk in doc_body.noun_chunks])



In [28]:
print("Verbs:", [token.lemma_ for token in doc_body if token.pos_ == "VERB"])

Verbs: ['ask', 'stump', 'make', 'solve', 'think', 'realize', 'ask', 'solve', 'figure', 'contain', 'find', 'be', 'be', 'be', 'use', 'contain', 'return', 'give', 'resolve', 'modify', 'modify', 'say', 'understand', 'use', 'use', 'put', 'cause', 'hoist', 'put', 'turn', 'be', 'hoist', 'attempt', 'track', 'get', 'store', 'submit', 'get', 'result', 'type', 'type', 'jump', 'observe', 'seem', 'happen', 'type', 'follow', 'situate', 'display', 'use', 'go', 'try', 'use', 'seem', 'work', 'appreciate', 'assign', 'have', 'extract', 'extract', 'find', 'output', 'need', 'be', 'be', 'do', 'think', 'make', 'store', 'have', 'think', 'put', 'have', 'have', 'have', 'have', 'work', 'come', 'firestore', 'understand', 'check', 'be', 'check', 'change', 'let', 'say', 'assume', 'modify', 'be', 'prevent', 'replace', 'want', 'find', 'let', 'update', 'know', 'go', 'modify', 'know', 'have', 'be', 'hear', 'import', 'be', 'import', 'be', 'be', 'make', 'sit', 'be', 'use', 'try', 'get', 'work', 'follow', 'work', 'try', '

In [29]:
for entity in doc_title.ents:
    print(entity.text, entity.label_)

React GPE
first ORDINAL
CRAN ORG
CRAN ORG
SQL ORG
WHERE & ' ORG
Python3 GPE
20 CARDINAL
PHP ORG
Move Input ORG
Outputs PERSON
jQuery ORG
Google Test C. How LAW
Pandas NORP
Linkedlist ORG
Git PERSON
Microk8s GPE
SQL Server ORG
2014 DATE
1 CARDINAL
Flow<List<T>> to List<T> ORG
second ORDINAL
Kotlin GPE
HashMap ORG
JSON & GSON ORG
Two CARDINAL
One CARDINAL
FFMPEG ORG
PK GPE
Gradient Arc View Not Showing Correctly PRODUCT
Django PERSON
Kafka PERSON
Aurora PERSON
MSK ORG
first ORDINAL
each month DATE
Remove Git LFS PERSON
0.229 CARDINAL
Toolbar GPE
ViewModel ORG
Matplotlib PERSON
7 CARDINAL
8 CARDINAL
Firebase ORG
close to CARDINAL
20 CARDINAL
two CARDINAL
UTC ORG
NotificationCenter ORG
Laravel ORG
only one CARDINAL
3.1 CARDINAL
12h DATE
MVC ORG
4 CARDINAL
12 CARDINAL


In [30]:
for entity in doc_body.ents:
    print(entity.text, entity.label_)

two CARDINAL
first ORDINAL
one CARDINAL
second ORDINAL
at least one CARDINAL
two CARDINAL
two CARDINAL
two CARDINAL
0 CARDINAL
1 CARDINAL
24 CARDINAL
2 CARDINAL
35 CARDINAL
1 CARDINAL
24 CARDINAL
the Comprehensive R Archive Network ORG
CRAN ORG
CRAN ORG
R. NORP
1 MONEY
OUTPUT ORG
2 MONEY
2 MONEY
OUTPUT ORG
3 MONEY
3 MONEY
over 3 hours TIME
PHP ORG
2 CARDINAL
3 CARDINAL
Inputs NORP
Outputs of a Parent Component ORG
Typescript GPE
Parent GPE
Child Component ORG
Typescript GPE
Pandas NORP
two CARDINAL
7 days DATE
7 days DATE
the remaining days DATE
Pandas NORP
Datanovice ORG
Laravel GPE
2 CARDINAL
3 CARDINAL
& Value ORG
2 CARDINAL
SHELL ORG
1 CARDINAL
2 CARDINAL
Output 1 PRODUCT
1 CARDINAL
2 CARDINAL
C11 ORG
zero CARDINAL
zero CARDINAL
zero CARDINAL
second ORDINAL
SQL Server ORG
2014 DATE
A few days ago DATE
SQL Server Management Studio ORG
SQL Server Management Studio ORG
Login Failed PERSON
A. Reason PERSON
Only one CARDINAL
Microsoft SQL Server ORG
18461 DATE
two CARDINAL
75% PERCENT
2

In [44]:
doc = nlp(df_base['Title'].str.cat())

In [32]:
set([token.pos_ for token in doc_title ])

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB',
 'X'}

In [15]:
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])

Noun phrases: ['I', 'an array', 'elements', 'I', 'the following code', 'this form', 'parameter', 'dimension', 'being', 'what', 'I', 'large dimension', 'a lot', 'overhead', 'any other low complexity method', 'this matrix', 'Thanks']


In [16]:
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Verbs: ['try', 'create', 'be', 'place', 'write', 'follow', 'get', 'be', 'do', 'need', 'cause', 'get']
