In [1]:
import importlib
import os
from joblib import dump, load
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk import download as nltk_download
from nltk import tokenize, RegexpTokenizer, pos_tag
from nltk.corpus import stopwords, wordnet
'''Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.'''
from nltk.stem import WordNetLemmatizer
nltk_download('wordnet') 
nltk_download('punkt')
nltk_download('stopwords')
nltk_download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import ml.explo as mlexplo
import ml.prepare as mlprepare

from nltk.stem.snowball import SnowballStemmer

# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

data_path = "..\\data\\stackoverflow\\"
model_path = data_path + "models\\"

np.set_printoptions(precision=2, suppress=True)
pd.options.display.float_format = "{:,.2f}".format

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# data_filename = "OC_DS_P6.csv"
data_csv_1 = 'OC_ML_P5_2006_2009.csv'
data_csv_2 = 'OC_ML_P5_2010_2102.csv'
data_csv_test = 'OC_ML_P5_TEST.csv' 
tags_filename = "Stackoverflow_top_tags.csv"

# Parsing

In [3]:
def apply_bs4(raw_text):
#     return '. '.join(
#         [x.find(text=True)
#          for x in BeautifulSoup(raw_text, 'html.parser').find_all('p')
#          if x.find(text=True)]
    soup = BeautifulSoup(raw_text, 'html.parser')
    # getting rid of programming language identification for now,
    # could rely on guesslang package but conflict with tensorflow installed version and 
    # don't wanna downgrade, therefore code stored in dedicated variable for futur use.
    code = ""
    for s in soup.find_all('code'):
        code = code + s.get_text()
        s.extract()
    return re.sub(r'\n+', ' ', soup.get_text()).strip(), re.sub(r'\n+', ' ', code).strip()
        
def apply_tags(raw_text):
    return [ tag.name for tag in BeautifulSoup(raw_text, 'html.parser').find_all() if tag.name in valid_tags ]

In [4]:
df_tags = pd.read_csv(data_path + tags_filename)
valid_tags = df_tags['TagName'].tolist()
nltk_stop_words = set(stopwords.words('english') + 
    # Remove after stemming and lemmatization top words frequencies analysis
    ['use', 'try', 'way', 'want', 'would', 'need', 'one', '1', '2' ] +
    # Remove after NMF analysis
    ['something', 'desire', 'follow ', 'help', 'please', 'however', 'follow', 'code' ])


init_bs4 = False
if init_bs4:
    df_csv_1 = pd.read_csv(data_path + data_csv_1)
    df_csv_1['Bs4_Body'], df_csv_1['Code'] = zip(*df_csv_1['Body'].apply(apply_bs4))
    df_csv_1['Bs4_Tags'] = df_csv_1['Tags'].apply(apply_tags)

    df_csv_2 = pd.read_csv(data_path + data_csv_2)
    df_csv_2['Bs4_Body'], df_csv_2['Code'] = zip(*df_csv_2['Body'].apply(apply_bs4))
    df_csv_2['Bs4_Tags'] = df_csv_2['Tags'].apply(apply_tags)
    
    df_bs4_test = pd.read_csv(data_path + data_csv_test)
    df_bs4_test['Bs4_Body'], df_bs4_test['Code'] = zip(*df_bs4_test['Body'].apply(apply_bs4))
    df_bs4_test['Bs4_Tags'] = df_bs4_test['Tags'].apply(apply_tags)

    df_bs4 = pd.concat([df_csv_1,df_csv_2], ignore_index=True)
    df_bs4.to_pickle(data_path + 'bs4.pkl')
    df_bs4_test.to_pickle(data_path + 'bs4_test.pkl')
else:
    df_bs4 = pd.read_pickle(data_path + 'bs4.pkl')
    df_bs4_test = pd.read_pickle(data_path + 'bs4_test.pkl')

# Labelling

In [5]:
def filter_top_tags(df, column, count=10, default=None):
    top_tags = mlprepare.prepare_multi_label(df_full_tags,'Bs4_Tags')[:count]
    ret = df[column].apply(lambda cell: [x for x in cell if x in top_tags])
    if default:
        ret = ret.apply(lambda cell: cell if len(cell)>0 else ['other'])
    return ret

In [6]:
init_labels = False
df_bs4 = pd.read_pickle(data_path + 'bs4.pkl')
df_bs4_test = pd.read_pickle(data_path + 'bs4_test.pkl')
df_full_tags = pd.DataFrame(pd.concat([df_bs4['Bs4_Tags'],df_bs4_test['Bs4_Tags']]))

if(init_labels):
    df_top = pd.DataFrame()
    df_top['Tags_T100'] = filter_top_tags(df_bs4, 'Bs4_Tags', count=100, default='other')
    df_top['Tags_T50'] = filter_top_tags(df_bs4, 'Bs4_Tags', count=50, default='other')
    df_top['Tags_T10'] = filter_top_tags(df_bs4, 'Bs4_Tags', count=10, default='other')
    df_top.to_pickle(data_path + 'Top_Tags.pkl')

    df_top_test = pd.DataFrame()
    df_top_test['Tags_T100'] = filter_top_tags(df_bs4_test, 'Bs4_Tags', count=100, default='other')
    df_top_test['Tags_T50'] = filter_top_tags(df_bs4_test, 'Bs4_Tags', count=50, default='other')
    df_top_test['Tags_T10'] = filter_top_tags(df_bs4_test, 'Bs4_Tags', count=10, default='other')
    df_top_test.to_pickle(data_path + 'Top_Tags_test.pkl')

else:
    df_top = pd.read_pickle(data_path + 'Top_Tags.pkl')
    df_top_test = pd.read_pickle(data_path + 'Top_Tags_test.pkl')

# Stemming

In [7]:
def nltk_stop_stemmer(row):
#     title_tokens = tokenizer.tokenize(row['Title'].lower())
    title_tokens = [ stemmer.stem(w) for w in tokenizer.tokenize(row['Title'].lower()) if stemmer.stem(w) not in nltk_stop_words ] 
    body_tokens = [ stemmer.stem(w) for w in tokenizer.tokenize(row['Bs4_Body'].lower()) if stemmer.stem(w) not in nltk_stop_words ]
    return title_tokens, body_tokens

In [8]:
init_stem = False
if init_stem:
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")

    df_stop_stem = pd.DataFrame()
    df_stop_stem = df_bs4.apply(nltk_stop_stemmer, axis=1, result_type='expand')
    df_stop_stem.columns=['Title', 'Body']
    df_stop_stem.to_pickle(data_path + 'nltk_stop_stem.pkl')

    df_stop_stem_test = pd.DataFrame()
    df_stop_stem_test = df_bs4_test.apply(nltk_stop_stemmer, axis=1, result_type='expand')
    df_stop_stem_test.columns=['Title', 'Body']
    df_stop_stem_test.to_pickle(data_path + 'nltk_stop_stem_test.pkl')
else:
    df_stop_stem = pd.read_pickle(data_path + 'nltk_stop_stem.pkl')
    df_stop_stem_test = pd.read_pickle(data_path + 'nltk_stop_stem_test.pkl')

In [9]:
df_top.shape

(96420, 3)

In [12]:
[' '.join(map(str, w)) for w in df_stop_stem['Title'].tolist()]

['panda pivot datafram valu function two column',
 'use polymorph function paramet',
 'loop list dictionari condit key exist',
 'use cpqueri function sever pair dataset',
 'split number delimit non numer string',
 'increas clickabl area imageview',
 'make work function insid statement',
 'check row panda seri contain string list use appli',
 'zero enum valu indic invalid valu',
 'tri segment charact use opencv ilumin problem',
 'add new key dictionari base condit match two exist dict key valu python',
 'voxel cone trace defer pipelin',
 'assign empti initi vector unique_ptr',
 'custom legend marker size matplotlib use lambda function',
 'show proccess output text box frame use python tkinter',
 'restrict s3 sub folder level bucket polici privat public',
 'mousedown event trigger mouseup event',
 'python nan return panda resampl function',
 'wait ajax respons',
 'convert list insid tupl ad tupl',
 'find first dupe across axi base key row',
 'chang window size new window termin',
 'googl

# Lemmatization

In [10]:
wn = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

In [11]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def nltk_lemmatizer_stop(row):
#     title_tokens = tokenizer.tokenize(row['Title'].lower())
    title_tokens = [ lemma for lemma in 
            [ wn.lemmatize(w, get_wordnet_pos(w)) for w in tokenizer.tokenize(row['Title'].lower()) ]
            if lemma not in nltk_stop_words ] 
    body_tokens = [ lemma for lemma in 
            [ wn.lemmatize(w, get_wordnet_pos(w)) for w in tokenizer.tokenize(row['Bs4_Body'].lower()) ]
            if lemma not in nltk_stop_words ]
    return title_tokens, body_tokens

In [12]:
phrase = "js and css files extentions are naked leaving the washing room"
[ wn.lemmatize(w, get_wordnet_pos(w)) for w in tokenizer.tokenize(phrase.lower()) ]

['j',
 'and',
 'cs',
 'file',
 'extentions',
 'be',
 'naked',
 'leave',
 'the',
 'wash',
 'room']

In [13]:
init_lemma = False
if init_lemma:
    tokenizer = RegexpTokenizer(r'\w+')
#     stemmer = SnowballStemmer("english")
    wn = WordNetLemmatizer()

    df_lemma_stop = pd.DataFrame()
    df_lemma_stop = df_bs4.apply(nltk_lemmatizer_stop, axis=1, result_type='expand')
    df_lemma_stop.columns=['Title', 'Body']
    df_lemma_stop.to_pickle(data_path + 'nltk_lemma_stop.pkl')

    df_lemma_stop_test = pd.DataFrame()
    df_lemma_stop_test = df_bs4_test.apply(nltk_lemmatizer_stop, axis=1, result_type='expand')
    df_lemma_stop_test.columns=['Title', 'Body']
    df_lemma_stop_test.to_pickle(data_path + 'nltk_lemma_stop_test.pkl')
else:
    df_lemma_stop = pd.read_pickle(data_path + 'nltk_lemma_stop.pkl')
    df_lemma_stop_test = pd.read_pickle(data_path + 'nltk_lemma_stop_test.pkl')

In [12]:
c = 211
[ (i,j) for i,j in zip(df_stop_stem.loc[c,'Body'],df_lemma_stop.loc[c,'Body']) if i != j ]

[('veri', 'fast'),
 ('fast', 'threadsafe'),
 ('threadsaf', 'whatever'),
 ('whatev', 'reason'),
 ('reason', 'c'),
 ('c', 'seem'),
 ('seem', 'require'),
 ('requir', 'construct'),
 ('construct', 'functor'),
 ('functor', 'every'),
 ('everi', 'time'),
 ('time', 'hash'),
 ('hash', 'value'),
 ('valu', 'quite'),
 ('quit', 'concerned'),
 ('concern', 'overhead'),
 ('overhead', 'allocate'),
 ('alloc', 'entire'),
 ('entir', 'struct'),
 ('struct', 'every'),
 ('everi', 'time'),
 ('time', 'hash'),
 ('hash', 'value'),
 ('valu', 'understand'),
 ('understand', 'actual'),
 ('actual', 'reason'),
 ('reason', 'functor'),
 ('functor', 'even'),
 ('even', 'necessary'),
 ('necessari', 'context'),
 ('context', 'safe'),
 ('safe', 'correct'),
 ('correct', 'create'),
 ('creat', 'hash'),
 ('hash', 'struct'),
 ('struct', 'call'),
 ('call', 'multiple'),
 ('multipl', 'different'),
 ('differ', 'time'),
 ('time', 'hash'),
 ('hash', 'struct'),
 ('struct', 'go'),
 ('go', 'threadsafe'),
 ('threadsaf', 'make'),
 ('make', 'th

# Corpus build

In [14]:
build_corpus=False

In [15]:
if build_corpus:
    stem_title_corpus = [' '.join(map(str, w)) for w in df_stop_stem['Title'].tolist()]
    stem_title_corpus_test = [' '.join(map(str, w)) for w in df_stop_stem_test['Title'].tolist()]
    stem_body_corpus = [' '.join(map(str, w)) for w in df_stop_stem['Body'].tolist()]
    stem_body_corpus_test = [' '.join(map(str, w)) for w in df_stop_stem_test['Body'].tolist()]
    dump(stem_title_corpus, data_path + 'stem_title_corpus.joblib')
    dump(stem_title_corpus_test, data_path + 'stem_title_corpus_test.joblib')
    dump(stem_body_corpus, data_path + 'stem_body_corpus.joblib')
    dump(stem_body_corpus_test, data_path + 'stem_body_corpus_test.joblib')
else:
    stem_title_corpus = load(data_path + 'stem_title_corpus.joblib')
    stem_title_corpus_test =  load(data_path + 'stem_title_corpus_test.joblib')
    stem_body_corpus = load(data_path + 'stem_body_corpus.joblib')
    stem_body_corpus_test = load(data_path + 'stem_body_corpus_test.joblib')

In [16]:
if build_corpus:
    lemma_title_corpus = [' '.join(map(str, w)) for w in df_lemma_stop['Title'].tolist()]
    lemma_title_corpus_test = [' '.join(map(str, w)) for w in df_lemma_stop_test['Title'].tolist()]
    lemma_body_corpus = [' '.join(map(str, w)) for w in df_lemma_stop['Body'].tolist()]
    lemma_body_corpus_test = [' '.join(map(str, w)) for w in df_lemma_stop_test['Body'].tolist()]
    dump(lemma_title_corpus, data_path + 'lemma_title_corpus.joblib')
    dump(lemma_title_corpus_test, data_path + 'lemma_title_corpus_test.joblib')
    dump(lemma_body_corpus, data_path + 'lemma_body_corpus.joblib')
    dump(lemma_body_corpus_test, data_path + 'lemma_body_corpus_test.joblib')
else:
    lemma_title_corpus = load(data_path + 'lemma_title_corpus.joblib')
    lemma_title_corpus_test = load(data_path + 'lemma_title_corpus_test.joblib')
    lemma_body_corpus = load(data_path + 'lemma_body_corpus.joblib')
    lemma_body_corpus_test = load(data_path + 'lemma_body_corpus_test.joblib')
    

# Count Vectorizer 

In [17]:


title_count_vectorizer = CountVectorizer(ngram_range=(1,1), max_features=1000)
title_count_vectorizer.fit(lemma_title_corpus)
dump(title_count_vectorizer, model_path + 'lemma_title_count_vectorizer.joblib')

['..\\data\\stackoverflow\\models\\lemma_title_count_vectorizer.joblib']

In [18]:


body_count_vectorizer = CountVectorizer(ngram_range=(1,1), max_features=5000)
body_count_vectorizer.fit(lemma_body_corpus)
dump(body_count_vectorizer, model_path + 'lemma_body_count_vectorizer_11.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_count_vectorizer_11.joblib']

In [19]:
body_count_vectorizer2 = CountVectorizer(ngram_range=(1,2), max_features=5000)
body_count_vectorizer2.fit(lemma_body_corpus)
dump(body_count_vectorizer2, model_path + 'lemma_body_count_vectorizer_12.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_count_vectorizer_12.joblib']

In [20]:
body_count_vectorizer3 = CountVectorizer(ngram_range=(1,3), max_features=5000)
body_count_vectorizer3.fit(lemma_body_corpus)
dump(body_count_vectorizer3, model_path + 'lemma_body_count_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_count_vectorizer_13.joblib']

# TFIDF Stem Vectorization

In [21]:
title_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=1000)
title_vectorizer.fit(stem_title_corpus)
dump(title_vectorizer, model_path + 'stem_title_vectorizer.joblib')

['..\\data\\stackoverflow\\models\\stem_title_vectorizer.joblib']

In [22]:
body_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=5000)
body_vectorizer.fit(stem_body_corpus)
dump(body_vectorizer, model_path + 'stem_body_vectorizer_11.joblib')

['..\\data\\stackoverflow\\models\\stem_body_vectorizer_11.joblib']

In [23]:
body_vectorizer2 = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
body_vectorizer2.fit(stem_body_corpus)
dump(body_vectorizer2, model_path + 'stem_body_vectorizer_12.joblib')

['..\\data\\stackoverflow\\models\\stem_body_vectorizer_12.joblib']

In [24]:
body_vectorizer3 = TfidfVectorizer(ngram_range=(1,3), max_features=5000)
body_vectorizer3.fit(stem_body_corpus)
dump(body_vectorizer3, model_path + 'stem_body_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\stem_body_vectorizer_13.joblib']

In [18]:
df_bs4.loc[75,'Body']

'<p>I am having two dataframe like described below</p>\n\n<p><strong>Dataframe 1</strong></p>\n\n<pre><code>P_ID     P_Name     P_Description   P_Size\n\n100      Moto          Mobile         16\n\n200      Apple         Mobile         15\n\n300      Oppo          Mobile         18\n</code></pre>\n\n<p><strong>Dataframe 2</strong></p>\n\n<pre><code>P_ID     List_Code      P_Amount     \n\n100      ALPHA           20000         \n\n100      BETA            60000  \n\n300      GAMMA           15000    \n</code></pre>\n\n<p><strong>Requirement :</strong>\nNeed to join the two dataframe by P_ID. </p>\n\n<p><strong>Information about the dataframe :</strong>\nIn dataframe 1 P_ID is a primary key and dataframe 2 does\'t have any primary attribute.  </p>\n\n<p><strong>How to join the dataframe</strong>\nNeed to create new columns in dataframe 1 from the value of dataframe 2 List_Code appends with "_price". If dataframe 2 List_Code contains 20 unique values we need to create 20 column in datafr

In [19]:
df_bs4.loc[75,'Bs4_Body']

'I am having two dataframe like described below Dataframe 1 Dataframe 2 Requirement : Need to join the two dataframe by P_ID.  Information about the dataframe : In dataframe 1 P_ID is a primary key and dataframe 2 does\'t have any primary attribute.   How to join the dataframe Need to create new columns in dataframe 1 from the value of dataframe 2 List_Code appends with "_price". If dataframe 2 List_Code contains 20 unique values we need to create 20 column in dataframe 1. Then, we have fill the value in newly created column in dataframe 1 from the dataframe 2 P_Amount column based on P_ID if present else fills with zero. After creation of dataframe we need to join the dataframe based on the P_ID. If we add the column with the expected value in dataframe 1 we can join the dataframe. My problem is creating new columns with the expected value.  The expected dataframe is shown below   Expected dataframe Can you please help me to solve the problem, thanks in advance.'

In [20]:
df_bs4.loc[75,'Code']

'P_ID     P_Name     P_Description   P_Size 100      Moto          Mobile         16 200      Apple         Mobile         15 300      Oppo          Mobile         18 P_ID     List_Code      P_Amount      100      ALPHA           20000          100      BETA            60000   300      GAMMA           15000        P_ID     P_Name     P_Description   P_Size   ALPHA_price   BETA_price    GAMMA_price     100      Moto          Mobile         16       20000       60000           0     200      Apple         Mobile         15         0            0            0     300      Oppo          Mobile         18         0            0           15000'

In [20]:
df_base = df_bs4[:10][['Title','Bs4_Body']]
tokenizer = RegexpTokenizer(r'\w+')

# TFIDF Lemma Vectorization

In [25]:

title_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=1000)
title_vectorizer.fit(lemma_title_corpus)
dump(title_vectorizer, model_path + 'lemma_title_vectorizer.joblib')

['..\\data\\stackoverflow\\models\\lemma_title_vectorizer.joblib']

In [26]:
body_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=5000)
body_vectorizer.fit(lemma_body_corpus)
dump(body_vectorizer, model_path + 'lemma_body_vectorizer_11.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_vectorizer_11.joblib']

In [27]:
body_vectorizer2 = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
body_vectorizer2.fit(lemma_body_corpus)
dump(body_vectorizer2, model_path + 'lemma_body_vectorizer_12.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_vectorizer_12.joblib']

In [28]:
body_vectorizer3 = TfidfVectorizer(ngram_range=(1,3), max_features=5000)
body_vectorizer3.fit(lemma_body_corpus)
dump(body_vectorizer3, model_path + 'lemma_body_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_vectorizer_13.joblib']

In [19]:
df_bs4.loc[1500,'Body']

'<p>I am trying to improve the animation performance of my custom modal view.</p>\n\n<p>By using .drawingGroup(), I\'ve managed to do so, but my TextField got disabled. Like so:</p>\n\n<p><a href="https://i.stack.imgur.com/gGWJIm.jpg" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/gGWJIm.jpg" alt="enter image description here"></a></p>\n\n<p>And I get the error: \'Unable to render flattened version of PlatformViewRepresentableAdaptor&lt;>\'</p>\n\n<p>Is there a way to fix this? </p>\n\n<p>Note: I am using a standard TextField.</p>\n\n<p>Thanks!</p>\n'

In [20]:
df_bs4.loc[1500,'Bs4_Body']

"I am trying to improve the animation performance of my custom modal view. By using .drawingGroup(), I've managed to do so, but my TextField got disabled. Like so: And I get the error: 'Unable to render flattened version of PlatformViewRepresentableAdaptor<>' Is there a way to fix this?  Note: I am using a standard TextField. Thanks!"

In [18]:
' '.join(df_stop_stem.loc[1500,'Body'])

'tri improv anim perform custom modal view drawinggroup manag textfield got disabl like get error unabl render flatten version platformviewrepresentableadaptor fix note standard textfield thank'

In [21]:
' '.join(df_lemma_stop.loc[1500,'Body'])

'improve animation performance custom modal view drawinggroup manage textfield get disabled like get error unable render flatten version platformviewrepresentableadaptor fix note standard textfield thanks'

In [4]:
print(spacy.__version__)

3.0.5


In [6]:
nlp = spacy.load("en_core_web_sm")

In [33]:
# doc = nlp(df_base.loc[51,'bf4'])
doc_title = nlp(df_base['Title'][:10000].str.cat(sep='. '))


In [8]:
nlp.max_length=2500000
doc_body = nlp(df_base['bs4'][:4000].str.cat(sep='. '))

In [32]:
doc_title[50].vector

array([-0.91, -0.64, -0.78, -0.07,  0.92,  1.59, -0.02, -0.31,  2.27,
        0.29, -0.74,  1.4 , -0.51, -0.34,  0.77, -0.23, -0.1 , -0.03,
        1.99, -0.27, -0.72, -0.6 , -1.19,  0.11, -0.47, -0.7 , -0.33,
       -0.05, -0.34, -0.65, -0.29,  0.17,  0.  ,  0.02, -0.71, -0.08,
        0.13, -0.35,  0.59,  0.14,  1.25,  1.34, -0.42, -0.12, -0.89,
        0.33, -1.06, -0.62, -0.67, -0.07, -0.15, -0.38,  0.07, -0.06,
       -1.16,  1.15, -0.27, -0.23, -0.82,  0.91, -0.74,  0.23,  0.99,
       -0.93,  1.03,  0.6 ,  1.06, -0.79, -0.69,  1.85,  0.82, -0.77,
       -0.63, -1.13,  0.29, -1.06, -1.09,  0.66, -0.75,  0.89,  0.16,
        0.27,  0.49,  1.62, -0.04, -1.46,  0.09,  0.48, -0.96, -0.98,
       -0.83,  1.54,  0.95, -0.59,  1.71, -0.11], dtype=float32)

In [9]:
doc_body[50].vector

array([-0.4 , -0.73,  1.05, -0.05, -0.53,  0.06,  0.17,  0.51, -1.07,
        0.  , -0.83,  0.41, -0.42, -0.71, -0.96, -0.17, -1.77,  1.46,
        0.56, -1.22, -0.69, -0.9 ,  0.56,  0.21,  0.48, -1.16, -0.24,
       -1.09, -0.05, -0.17,  0.28,  0.35,  0.98,  0.69, -0.48,  0.06,
        1.22, -1.19, -0.57,  0.91,  0.68,  1.24, -0.  , -0.7 ,  0.28,
       -0.84,  2.12, -0.59, -0.42, -0.44,  0.43,  0.69, -0.33, -0.11,
       -0.28,  1.11,  0.97, -0.28,  1.74, -0.44, -0.09, -0.26,  0.19,
        0.3 ,  1.14,  1.26, -0.43, -1.39, -0.88, -0.52,  0.64,  1.3 ,
       -0.74, -0.08, -0.46,  0.43, -0.17, -1.53, -0.  ,  0.55, -0.48,
       -0.01,  0.18,  0.8 , -0.59,  0.04, -0.99,  0.22,  1.15, -0.64,
       -0.18,  0.37, -1.28,  0.57,  0.24, -0.28], dtype=float32)

In [27]:
print("Noun phrases:", [chunk.text for chunk in doc_title.noun_chunks])



In [26]:
print("Verbs:", [token.lemma_ for token in doc_title if token.pos_ == "VERB"])

Verbs: ['determine', 'use', 'contain', 'return', 'name', 'hoist', 'put', 'get', 'cause', 'assign', 'get', 'use', 'get', 'store', 'import', 'have', 'get', 'work', 'use', 'filter', 'use', 'use', 'reject', 'capture', 'parse', 'move', 'work', 'fix', 'wrap', 'use', 'domnodeinserte', 'fopen', 'expand', 'use', 'set', 'get', 'understand', 'point', 'commit', 'use', 'nest', 'add', 'excel', 'use', 'handle', 'include', 'have', 'configure', 'login', 'react', 'convert', 'list', 'make', 'save', 'use', 'combine', 'separate', 'melt', 'define', 'specify', 'convert', 'move', 'get', 'give', 'get', 'match', 'use', 'roll', 'avoid', 'generate', 'use', 'intersect', 'draw', 'expect', 'attribute', 'use', 'check', 'reduce', 'return', 'pseudo', 'use', 'create', 'show', 'work', 'expect', 'store', 'join', 'use', 'df.mode', 'init', 'make', 'behave', 'hide', 'show', 'extract', 'contain', 'use', 'connect', 'send', 'use', 'use', 'select', 'recognize', 'remove', 're', '-', 'add', 'build', 'sort', 'get', 'include', 'work

In [27]:
print("Noun phrases:", [chunk.text for chunk in doc_body.noun_chunks])



In [28]:
print("Verbs:", [token.lemma_ for token in doc_body if token.pos_ == "VERB"])

Verbs: ['ask', 'stump', 'make', 'solve', 'think', 'realize', 'ask', 'solve', 'figure', 'contain', 'find', 'be', 'be', 'be', 'use', 'contain', 'return', 'give', 'resolve', 'modify', 'modify', 'say', 'understand', 'use', 'use', 'put', 'cause', 'hoist', 'put', 'turn', 'be', 'hoist', 'attempt', 'track', 'get', 'store', 'submit', 'get', 'result', 'type', 'type', 'jump', 'observe', 'seem', 'happen', 'type', 'follow', 'situate', 'display', 'use', 'go', 'try', 'use', 'seem', 'work', 'appreciate', 'assign', 'have', 'extract', 'extract', 'find', 'output', 'need', 'be', 'be', 'do', 'think', 'make', 'store', 'have', 'think', 'put', 'have', 'have', 'have', 'have', 'work', 'come', 'firestore', 'understand', 'check', 'be', 'check', 'change', 'let', 'say', 'assume', 'modify', 'be', 'prevent', 'replace', 'want', 'find', 'let', 'update', 'know', 'go', 'modify', 'know', 'have', 'be', 'hear', 'import', 'be', 'import', 'be', 'be', 'make', 'sit', 'be', 'use', 'try', 'get', 'work', 'follow', 'work', 'try', '

In [29]:
for entity in doc_title.ents:
    print(entity.text, entity.label_)

React GPE
first ORDINAL
CRAN ORG
CRAN ORG
SQL ORG
WHERE & ' ORG
Python3 GPE
20 CARDINAL
PHP ORG
Move Input ORG
Outputs PERSON
jQuery ORG
Google Test C. How LAW
Pandas NORP
Linkedlist ORG
Git PERSON
Microk8s GPE
SQL Server ORG
2014 DATE
1 CARDINAL
Flow<List<T>> to List<T> ORG
second ORDINAL
Kotlin GPE
HashMap ORG
JSON & GSON ORG
Two CARDINAL
One CARDINAL
FFMPEG ORG
PK GPE
Gradient Arc View Not Showing Correctly PRODUCT
Django PERSON
Kafka PERSON
Aurora PERSON
MSK ORG
first ORDINAL
each month DATE
Remove Git LFS PERSON
0.229 CARDINAL
Toolbar GPE
ViewModel ORG
Matplotlib PERSON
7 CARDINAL
8 CARDINAL
Firebase ORG
close to CARDINAL
20 CARDINAL
two CARDINAL
UTC ORG
NotificationCenter ORG
Laravel ORG
only one CARDINAL
3.1 CARDINAL
12h DATE
MVC ORG
4 CARDINAL
12 CARDINAL


In [30]:
for entity in doc_body.ents:
    print(entity.text, entity.label_)

two CARDINAL
first ORDINAL
one CARDINAL
second ORDINAL
at least one CARDINAL
two CARDINAL
two CARDINAL
two CARDINAL
0 CARDINAL
1 CARDINAL
24 CARDINAL
2 CARDINAL
35 CARDINAL
1 CARDINAL
24 CARDINAL
the Comprehensive R Archive Network ORG
CRAN ORG
CRAN ORG
R. NORP
1 MONEY
OUTPUT ORG
2 MONEY
2 MONEY
OUTPUT ORG
3 MONEY
3 MONEY
over 3 hours TIME
PHP ORG
2 CARDINAL
3 CARDINAL
Inputs NORP
Outputs of a Parent Component ORG
Typescript GPE
Parent GPE
Child Component ORG
Typescript GPE
Pandas NORP
two CARDINAL
7 days DATE
7 days DATE
the remaining days DATE
Pandas NORP
Datanovice ORG
Laravel GPE
2 CARDINAL
3 CARDINAL
& Value ORG
2 CARDINAL
SHELL ORG
1 CARDINAL
2 CARDINAL
Output 1 PRODUCT
1 CARDINAL
2 CARDINAL
C11 ORG
zero CARDINAL
zero CARDINAL
zero CARDINAL
second ORDINAL
SQL Server ORG
2014 DATE
A few days ago DATE
SQL Server Management Studio ORG
SQL Server Management Studio ORG
Login Failed PERSON
A. Reason PERSON
Only one CARDINAL
Microsoft SQL Server ORG
18461 DATE
two CARDINAL
75% PERCENT
2

In [44]:
doc = nlp(df_base['Title'].str.cat())

In [32]:
set([token.pos_ for token in doc_title ])

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB',
 'X'}

In [15]:
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])

Noun phrases: ['I', 'an array', 'elements', 'I', 'the following code', 'this form', 'parameter', 'dimension', 'being', 'what', 'I', 'large dimension', 'a lot', 'overhead', 'any other low complexity method', 'this matrix', 'Thanks']


In [16]:
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Verbs: ['try', 'create', 'be', 'place', 'write', 'follow', 'get', 'be', 'do', 'need', 'cause', 'get']


In [28]:
df_bs4.loc[70258]

Id                                                       65198394
Title           SciSpacy equivalent of Gensim's functions/para...
Body            <p>With Gensim, there are three functions I us...
Tags                                 <python><nlp><spacy><gensim>
CreationDate                                  2020-12-08 11:52:07
Bs4_Body        With Gensim, there are three functions I use r...
Code            model = gensim.models.Word2Vec(corpus,size=100...
Bs4_Tags                                            [python, nlp]
Name: 70258, dtype: object

In [18]:
df_bs4.loc[56967][['Title','Bs4_Body','Bs4_Tags']].values

array(['Parsing dictionaries in a JSON file with Python',
       'I\'m fairly new to Python and JSON, and I\'m having a bit of trouble parsing through this data: This is what\'s inside of the file, and if I understand correctly, this is a dictionary ("tests":) with dictionaries inside of it that are comma-separated, where each dictionary has a kvp of array:list, target:int. Please correct me if I am wrong on this part. Now, what I\'m trying to do is loop through each of the dictionaries and print the list then integer of each. So far, this is what I have in Python: but all I\'m printing out is this: array target array target array target array target array target array target array target array target array target array target array target array target I guess what I\'m trying to ask is how to print the values of this instead of the keys. Any help is appreciated, sorry for the noob question.',
       list(['python', 'json', 'loops', 'dictionary'])], dtype=object)