# Ingest
- [Parsing](#bs4-parsing)<br>
- [Labelling](#Labelling)<br>
- [Stemming](#Stemming)<br>
- [Lemmatization](#Lemmatization)<br>
- [Corpus](#Corpus)<br>
- [Count vectorizer](#Count-vectorizer)<br>
- [Tfidf vectorizer](#Tfidf-vectorizer)<br>

In [1]:
import importlib
import os
from joblib import dump, load
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk import download as nltk_download
from nltk import tokenize, RegexpTokenizer, pos_tag
from nltk.corpus import stopwords, wordnet
'''Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.'''
from nltk.stem import WordNetLemmatizer
nltk_download('wordnet') 
nltk_download('punkt')
nltk_download('stopwords')
nltk_download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import ml.explo as mlexplo
import ml.prepare as mlprepare

from nltk.stem.snowball import SnowballStemmer

# Root folder of the Stack Overflow dataset and sub-folder of the persisted models.
# NOTE(review): the hard-coded backslash separators look Windows-specific — confirm
# before running this notebook on another OS.
data_path = "..\\data\\stackoverflow\\"
model_path = data_path + "models\\"

# Display settings only: 2 decimals without scientific notation for numpy,
# thousands separator and 2 decimals for pandas floats.
np.set_printoptions(precision=2, suppress=True)
pd.options.display.float_format = "{:,.2f}".format

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Input files, all resolved relative to data_path.
# data_filename = "OC_DS_P6.csv"
# The two files below are concatenated into a single frame by the bs4 parsing cell.
# NOTE(review): "2102" in the second name looks like a typo — confirm against the file on disk.
data_csv_1 = 'OC_ML_P5_2006_2009.csv'
data_csv_2 = 'OC_ML_P5_2010_2102.csv'
# Separate file parsed into its own test frame.
data_csv_test = 'OC_ML_P5_TEST.csv' 
# CSV whose 'TagName' column lists the tags accepted by apply_tags.
tags_filename = "Stackoverflow_top_tags.csv"

# bs4 parsing
- [Toc](#Ingest)

In [3]:
def apply_bs4(raw_text):
    """Split the HTML of a post into its prose and its code.

    Every ``<code>`` element is detached from the parsed document and its text
    is collected separately, so the prose no longer contains code. The code is
    only kept for future use: programming-language identification with the
    ``guesslang`` package was set aside because it conflicts with the
    installed tensorflow version.

    Args:
        raw_text: HTML markup of one post body.

    Returns:
        A ``(text, code)`` tuple of stripped strings in which every run of
        newlines has been replaced by a single space.
    """
    soup = BeautifulSoup(raw_text, 'html.parser')
    # extract() removes the element from the tree and hands it back, so the
    # soup.get_text() call below only sees what is left outside <code>.
    snippets = [element.extract().get_text() for element in soup.find_all('code')]
    code = ''.join(snippets)
    return re.sub(r'\n+', ' ', soup.get_text()).strip(), re.sub(r'\n+', ' ', code).strip()
        
def apply_tags(raw_text):
    """Return the tags of a post that belong to ``valid_tags``.

    ``raw_text`` is the raw ``Tags`` cell, e.g. ``'<ios><swift><swiftui>'``.
    The names are read with a regular expression instead of an HTML parser:
    ``html.parser`` only recognises a tag whose name starts with an ASCII
    letter, so entries such as ``<.net>`` or ``<3d>`` were handled as plain
    text and silently lost.

    Args:
        raw_text: string made of ``<name>`` entries.

    Returns:
        List of tag names, in order of appearance, restricted to the
        module-level ``valid_tags``.
    """
    # Lower-casing mirrors the normalisation the HTML parser applied to names.
    names = (name.lower() for name in re.findall(r'<([^<>]+)>', raw_text))
    return [name for name in names if name in valid_tags]

In [4]:
# Tags accepted by apply_tags: the 'TagName' column of the tags file.
df_tags = pd.read_csv(data_path + tags_filename)
valid_tags = df_tags['TagName'].tolist()

# Words dropped by the stemming / lemmatization steps: NLTK's English list
# extended with corpus-specific noise words. The former 'follow ' entry
# (trailing space) is gone: tokens come from a \w+ tokenizer, so they can
# never contain a space, and 'follow' itself is already listed.
nltk_stop_words = set(stopwords.words('english')) | {
    # Added after the top-frequency analysis of stems and lemmas
    'use', 'try', 'way', 'want', 'would', 'need', 'one', '1', '2',
    # Added after the NMF analysis
    'something', 'desire', 'help', 'please', 'however', 'follow', 'code',
}


# Set to True to rebuild the parsed frames from the raw CSV files;
# otherwise the previously pickled frames are loaded.
init_bs4 = False
if not init_bs4:
    df_bs4 = pd.read_pickle(data_path + 'bs4.pkl')
    df_bs4_test = pd.read_pickle(data_path + 'bs4_test.pkl')
else:
    df_csv_1 = pd.read_csv(data_path + data_csv_1)
    df_csv_2 = pd.read_csv(data_path + data_csv_2)
    df_bs4_test = pd.read_csv(data_path + data_csv_test)

    # Same three derived columns for every frame: prose, code and filtered tags.
    for frame in (df_csv_1, df_csv_2, df_bs4_test):
        parsed = frame['Body'].apply(apply_bs4)
        frame['Bs4_Body'], frame['Code'] = zip(*parsed)
        frame['Bs4_Tags'] = frame['Tags'].apply(apply_tags)

    # The two training files become one frame with a fresh index.
    df_bs4 = pd.concat([df_csv_1, df_csv_2], ignore_index=True)
    df_bs4.to_pickle(data_path + 'bs4.pkl')
    df_bs4_test.to_pickle(data_path + 'bs4_test.pkl')

# Labelling
- [Toc](#Ingest)

In [5]:
def filter_top_tags(df, column, count=10, default=None):
    """Restrict the tag list of every row to the top ``count`` tags.

    Args:
        df: frame holding the tag lists.
        column: name of the column of ``df`` whose cells are lists of tags.
        count: number of leading entries kept from the tag ranking.
        default: label given to rows left without any tag. When falsy
            (the default ``None``), such rows keep an empty list.

    Returns:
        A Series with one filtered list of tags per row of ``df``.
    """
    # The ranking comes from the module-level df_full_tags, not from `df`, so
    # every frame is filtered against the same tag set.
    # NOTE(review): presumably prepare_multi_label returns tags ordered by
    # decreasing frequency — confirm in ml.prepare.
    top_tags = mlprepare.prepare_multi_label(df_full_tags,'Bs4_Tags')[:count]
    ret = df[column].apply(lambda cell: [x for x in cell if x in top_tags])
    if default:
        # Use the caller's label; 'other' used to be hard-coded here, which
        # made the `default` argument ineffective.
        ret = ret.apply(lambda cell: cell if len(cell)>0 else [default])
    return ret

In [6]:
# Set to True to recompute the label columns; otherwise load the pickled ones.
init_labels = False
df_bs4 = pd.read_pickle(data_path + 'bs4.pkl')
df_bs4_test = pd.read_pickle(data_path + 'bs4_test.pkl')
# Tags of the training and test posts together; filter_top_tags ranks tags on this frame.
df_full_tags = pd.DataFrame(pd.concat([df_bs4['Bs4_Tags'],df_bs4_test['Bs4_Tags']]))

if not init_labels:
    df_top = pd.read_pickle(data_path + 'Top_Tags.pkl')
    df_top_test = pd.read_pickle(data_path + 'Top_Tags_test.pkl')
else:
    # One label column per ranking size, created in this order.
    top_sizes = (100, 50, 10)

    df_top = pd.DataFrame()
    for size in top_sizes:
        df_top['Tags_T%d' % size] = filter_top_tags(df_bs4, 'Bs4_Tags', count=size, default='other')
    df_top.to_pickle(data_path + 'Top_Tags.pkl')

    df_top_test = pd.DataFrame()
    for size in top_sizes:
        df_top_test['Tags_T%d' % size] = filter_top_tags(df_bs4_test, 'Bs4_Tags', count=size, default='other')
    df_top_test.to_pickle(data_path + 'Top_Tags_test.pkl')

# Stemming
- [Toc](#Ingest)

In [7]:
def nltk_stop_stemmer(row):
    """Tokenize, stem and stop-word-filter the title and body of one post.

    Relies on the module-level ``tokenizer``, ``stemmer`` and
    ``nltk_stop_words``.

    Args:
        row: mapping / Series with a ``'Title'`` and a ``'Bs4_Body'`` string.

    Returns:
        A ``(title_tokens, body_tokens)`` tuple of lists of stems.
    """
    def stem_tokens(text):
        kept = []
        for word in tokenizer.tokenize(text.lower()):
            # Stem once and reuse the result for the test and the output.
            stem = stemmer.stem(word)
            # The stop list holds plain words, so the raw token is tested too:
            # testing only the stem let through listed words whose stem
            # differs from the word itself ('any' -> 'ani', 'why' -> 'whi').
            if word not in nltk_stop_words and stem not in nltk_stop_words:
                kept.append(stem)
        return kept

    return stem_tokens(row['Title']), stem_tokens(row['Bs4_Body'])

In [8]:
# Set to True to recompute the stemmed tokens; otherwise load the pickled frames.
init_stem = False
if init_stem:
    # Globals read by nltk_stop_stemmer.
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")

    # result_type='expand' turns the returned (title, body) tuple into two columns.
    # The empty pd.DataFrame() placeholders that were overwritten right away are gone.
    df_stop_stem = df_bs4.apply(nltk_stop_stemmer, axis=1, result_type='expand')
    df_stop_stem.columns=['Title', 'Body']
    df_stop_stem.to_pickle(data_path + 'nltk_stop_stem.pkl')

    df_stop_stem_test = df_bs4_test.apply(nltk_stop_stemmer, axis=1, result_type='expand')
    df_stop_stem_test.columns=['Title', 'Body']
    df_stop_stem_test.to_pickle(data_path + 'nltk_stop_stem_test.pkl')
else:
    df_stop_stem = pd.read_pickle(data_path + 'nltk_stop_stem.pkl')
    df_stop_stem_test = pd.read_pickle(data_path + 'nltk_stop_stem_test.pkl')

In [9]:
# Sanity check: (rows, columns) of the label frame.
df_top.shape

(96420, 3)

In [10]:
# Preview: stemmed title tokens of every post, joined back into one string per post.
[' '.join(map(str, w)) for w in df_stop_stem['Title'].tolist()]

['panda pivot datafram valu function two column',
 'polymorph function paramet',
 'loop list dictionari condit key exist',
 'cpqueri function sever pair dataset',
 'split number delimit ani non numer string',
 'increas clickabl area imageview',
 'make work function insid statement',
 'check row panda seri contain string list appli',
 'zero enum valu indic invalid valu',
 'tri segment charact opencv ilumin problem',
 'add new key dictionari base condit match two exist dict key valu python',
 'voxel cone trace defer pipelin',
 'assign empti initi vector unique_ptr',
 'custom legend marker size matplotlib lambda function',
 'show proccess output text box frame python tkinter',
 'restrict s3 sub folder level bucket polici privat public',
 'whi mousedown event trigger mouseup event',
 'python nan return panda resampl function',
 'wait ajax respons',
 'convert list insid tupl ad tupl',
 'find first dupe across axi base key row',
 'chang window size new window termin',
 'googl sheet dget func

In [11]:
# Compare the stemmed and lemmatized body tokens of post `c`, position by position,
# keeping only the pairs that differ. zip() stops at the shorter of the two lists.
# NOTE(review): both df_stop_stem and df_lemma_stop must be in memory; df_lemma_stop
# is only created in the Lemmatization section further down, and the recorded run
# of this cell ended with a NameError.
c = 211
[ (i,j) for i,j in zip(df_stop_stem.loc[c,'Body'],df_lemma_stop.loc[c,'Body']) if i != j ]

NameError: name 'df_stop_stem' is not defined

# Lemmatization
- [Toc](#Ingest)

In [7]:
# Module-level lemmatizer and word tokenizer read by nltk_lemmatizer_stop.
wn = WordNetLemmatizer()
# Every run of \w characters is one token.
tokenizer = RegexpTokenizer(r'\w+')

In [8]:
def get_wordnet_pos(word):
    """Return the WordNet part of speech to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    letter of the tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    _, tag = pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag are treated as nouns.
    return wordnet.NOUN

def nltk_lemmatizer_stop(row):
    """Tokenize, lemmatize and stop-word-filter the title and body of one post.

    Relies on the module-level ``tokenizer``, ``wn`` and ``nltk_stop_words``.
    The stop-word test is applied to the lemma, not to the raw token.

    Args:
        row: mapping / Series with a ``'Title'`` and a ``'Bs4_Body'`` string.

    Returns:
        A ``(title_tokens, body_tokens)`` tuple of lists of lemmas.
    """
    def lemmatize_text(text):
        kept = []
        for word in tokenizer.tokenize(text.lower()):
            lemma = wn.lemmatize(word, get_wordnet_pos(word))
            if lemma not in nltk_stop_words:
                kept.append(lemma)
        return kept

    return lemmatize_text(row['Title']), lemmatize_text(row['Bs4_Body'])

In [9]:
# Quick check of the lemmatizer on a sample sentence.
# The recorded output shows short technical tokens being mangled ('js' -> 'j', 'css' -> 'cs').
phrase = "js and css files extentions are naked leaving the washing room"
[ wn.lemmatize(w, get_wordnet_pos(w)) for w in tokenizer.tokenize(phrase.lower()) ]

['j',
 'and',
 'cs',
 'file',
 'extentions',
 'be',
 'naked',
 'leave',
 'the',
 'wash',
 'room']

In [10]:
# Set to True to recompute the lemmatized tokens; otherwise load the pickled frames.
init_lemma = False
if init_lemma:
    # Globals read by nltk_lemmatizer_stop.
    tokenizer = RegexpTokenizer(r'\w+')
    wn = WordNetLemmatizer()

    # result_type='expand' turns the returned (title, body) tuple into two columns.
    # The empty pd.DataFrame() placeholders that were overwritten right away are gone.
    df_lemma_stop = df_bs4.apply(nltk_lemmatizer_stop, axis=1, result_type='expand')
    df_lemma_stop.columns=['Title', 'Body']
    df_lemma_stop.to_pickle(data_path + 'nltk_lemma_stop.pkl')

    df_lemma_stop_test = df_bs4_test.apply(nltk_lemmatizer_stop, axis=1, result_type='expand')
    df_lemma_stop_test.columns=['Title', 'Body']
    df_lemma_stop_test.to_pickle(data_path + 'nltk_lemma_stop_test.pkl')
else:
    df_lemma_stop = pd.read_pickle(data_path + 'nltk_lemma_stop.pkl')
    df_lemma_stop_test = pd.read_pickle(data_path + 'nltk_lemma_stop_test.pkl')

# Corpus
- [Toc](#Ingest)

In [13]:
# True: rebuild the corpora from the token frames and dump them; False: load the dumped corpora.
build_corpus=False

In [14]:
# Stem corpora: one space-joined string per post, for titles and bodies, train and test.
if not build_corpus:
    stem_title_corpus = load(data_path + 'stem_title_corpus.joblib')
    stem_title_corpus_test = load(data_path + 'stem_title_corpus_test.joblib')
    stem_body_corpus = load(data_path + 'stem_body_corpus.joblib')
    stem_body_corpus_test = load(data_path + 'stem_body_corpus_test.joblib')
else:
    stem_title_corpus = [' '.join(str(token) for token in tokens)
                         for tokens in df_stop_stem['Title'].tolist()]
    stem_title_corpus_test = [' '.join(str(token) for token in tokens)
                              for tokens in df_stop_stem_test['Title'].tolist()]
    stem_body_corpus = [' '.join(str(token) for token in tokens)
                        for tokens in df_stop_stem['Body'].tolist()]
    stem_body_corpus_test = [' '.join(str(token) for token in tokens)
                             for tokens in df_stop_stem_test['Body'].tolist()]

    dump(stem_title_corpus, data_path + 'stem_title_corpus.joblib')
    dump(stem_title_corpus_test, data_path + 'stem_title_corpus_test.joblib')
    dump(stem_body_corpus, data_path + 'stem_body_corpus.joblib')
    dump(stem_body_corpus_test, data_path + 'stem_body_corpus_test.joblib')

In [15]:
# Lemma corpora: one space-joined string per post, for titles and bodies, train and test.
if not build_corpus:
    lemma_title_corpus = load(data_path + 'lemma_title_corpus.joblib')
    lemma_title_corpus_test = load(data_path + 'lemma_title_corpus_test.joblib')
    lemma_body_corpus = load(data_path + 'lemma_body_corpus.joblib')
    lemma_body_corpus_test = load(data_path + 'lemma_body_corpus_test.joblib')
else:
    lemma_title_corpus = [' '.join(str(token) for token in tokens)
                          for tokens in df_lemma_stop['Title'].tolist()]
    lemma_title_corpus_test = [' '.join(str(token) for token in tokens)
                               for tokens in df_lemma_stop_test['Title'].tolist()]
    lemma_body_corpus = [' '.join(str(token) for token in tokens)
                         for tokens in df_lemma_stop['Body'].tolist()]
    lemma_body_corpus_test = [' '.join(str(token) for token in tokens)
                              for tokens in df_lemma_stop_test['Body'].tolist()]

    dump(lemma_title_corpus, data_path + 'lemma_title_corpus.joblib')
    dump(lemma_title_corpus_test, data_path + 'lemma_title_corpus_test.joblib')
    dump(lemma_body_corpus, data_path + 'lemma_body_corpus.joblib')
    dump(lemma_body_corpus_test, data_path + 'lemma_body_corpus_test.joblib')
    

# Count vectorizer
- [Toc](#Ingest)

In [16]:
# When True, the cells that test this flag dump their fitted vectorizer under model_path.
save_vectorizer = False

In [17]:
# Term counts for titles: unigrams, at most 1000 features, fitted on the lemma title corpus.
title_count_vectorizer = CountVectorizer(
    max_features=1000, ngram_range=(1, 1)
).fit(lemma_title_corpus)
if save_vectorizer:
    dump(title_count_vectorizer, model_path + 'lemma_title_count_vectorizer.joblib')

In [19]:
# Term counts for bodies: unigrams, at most 5000 features, fitted on the lemma body corpus.
body_count_vectorizer = CountVectorizer(
    max_features=5000, ngram_range=(1, 1)
).fit(lemma_body_corpus)
if save_vectorizer:
    dump(body_count_vectorizer, model_path + 'lemma_body_count_vectorizer_11.joblib')

In [19]:
# Term counts for bodies: 1- and 2-grams, at most 5000 features, fitted on the lemma body corpus.
body_count_vectorizer2 = CountVectorizer(
    max_features=5000, ngram_range=(1, 2)
).fit(lemma_body_corpus)
if save_vectorizer:
    dump(body_count_vectorizer2, model_path + 'lemma_body_count_vectorizer_12.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_count_vectorizer_12.joblib']

In [20]:
# Term counts for bodies: 1- to 3-grams, at most 5000 features, fitted on the lemma body corpus.
body_count_vectorizer3 = CountVectorizer(
    max_features=5000, ngram_range=(1, 3)
).fit(lemma_body_corpus)
if save_vectorizer:
    dump(body_count_vectorizer3, model_path + 'lemma_body_count_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_count_vectorizer_13.joblib']

In [24]:
# Print "column index, count, term" for every feature present in post 1500.
sample_docs = df_lemma_stop.loc[1500:1500,'Body'].apply(lambda row: ' '.join(row))
arr = body_count_vectorizer.transform(sample_docs).toarray()[0]
# Invert the term -> column mapping once, instead of scanning the whole
# vocabulary again for each non-zero column.
terms_by_column = {column: term for term, column in body_count_vectorizer.vocabulary_.items()}
for cpt in np.flatnonzero(arr > .01):
    print(cpt, arr[cpt], terms_by_column[cpt])

366 1 animation
1181 1 custom
1398 1 disabled
1623 1 error
1818 1 fix
1825 1 flatten
1963 2 get
2228 1 improve
2577 1 like
2682 1 manage
2816 1 modal
2973 1 note
3221 1 performance
3691 1 render
4181 1 standard
4432 2 textfield
4440 1 thanks
4629 1 unable
4794 1 version
4803 1 view


# Tfidf vectorizer
- [Toc](#Ingest)

## Stem vectorization

In [None]:
# When True, the stem vectorizer cells below dump their fitted model under model_path.
save_vectorizer = False

In [21]:
# Tfidf weights for titles: unigrams, at most 1000 features, fitted on the stem title corpus.
title_vectorizer = TfidfVectorizer(
    max_features=1000, ngram_range=(1, 1)
).fit(stem_title_corpus)
if save_vectorizer:
    dump(title_vectorizer, model_path + 'stem_title_vectorizer.joblib')

['..\\data\\stackoverflow\\models\\stem_title_vectorizer.joblib']

In [22]:
# Tfidf weights for bodies: unigrams, at most 5000 features, fitted on the stem body corpus.
body_vectorizer = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 1)
).fit(stem_body_corpus)
if save_vectorizer:
    dump(body_vectorizer, model_path + 'stem_body_vectorizer_11.joblib')

['..\\data\\stackoverflow\\models\\stem_body_vectorizer_11.joblib']

In [None]:
# Tfidf weights for bodies: 1- and 2-grams, at most 5000 features, fitted on the stem body corpus.
body_vectorizer2 = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 2)
).fit(stem_body_corpus)
if save_vectorizer:
    dump(body_vectorizer2, model_path + 'stem_body_vectorizer_12.joblib')

In [24]:
# Tfidf weights for bodies: 1- to 3-grams, at most 5000 features, fitted on the stem body corpus.
body_vectorizer3 = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 3)
).fit(stem_body_corpus)
if save_vectorizer:
    dump(body_vectorizer3, model_path + 'stem_body_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\stem_body_vectorizer_13.joblib']

In [18]:
# Raw HTML body of post 75, as read from the CSV.
df_bs4.loc[75,'Body']

'<p>I am having two dataframe like described below</p>\n\n<p><strong>Dataframe 1</strong></p>\n\n<pre><code>P_ID     P_Name     P_Description   P_Size\n\n100      Moto          Mobile         16\n\n200      Apple         Mobile         15\n\n300      Oppo          Mobile         18\n</code></pre>\n\n<p><strong>Dataframe 2</strong></p>\n\n<pre><code>P_ID     List_Code      P_Amount     \n\n100      ALPHA           20000         \n\n100      BETA            60000  \n\n300      GAMMA           15000    \n</code></pre>\n\n<p><strong>Requirement :</strong>\nNeed to join the two dataframe by P_ID. </p>\n\n<p><strong>Information about the dataframe :</strong>\nIn dataframe 1 P_ID is a primary key and dataframe 2 does\'t have any primary attribute.  </p>\n\n<p><strong>How to join the dataframe</strong>\nNeed to create new columns in dataframe 1 from the value of dataframe 2 List_Code appends with "_price". If dataframe 2 List_Code contains 20 unique values we need to create 20 column in datafr

In [19]:
# Same post after apply_bs4: the prose with the <code> elements removed.
df_bs4.loc[75,'Bs4_Body']

'I am having two dataframe like described below Dataframe 1 Dataframe 2 Requirement : Need to join the two dataframe by P_ID.  Information about the dataframe : In dataframe 1 P_ID is a primary key and dataframe 2 does\'t have any primary attribute.   How to join the dataframe Need to create new columns in dataframe 1 from the value of dataframe 2 List_Code appends with "_price". If dataframe 2 List_Code contains 20 unique values we need to create 20 column in dataframe 1. Then, we have fill the value in newly created column in dataframe 1 from the dataframe 2 P_Amount column based on P_ID if present else fills with zero. After creation of dataframe we need to join the dataframe based on the P_ID. If we add the column with the expected value in dataframe 1 we can join the dataframe. My problem is creating new columns with the expected value.  The expected dataframe is shown below   Expected dataframe Can you please help me to solve the problem, thanks in advance.'

In [20]:
# Same post: the text of the <code> elements set aside by apply_bs4.
df_bs4.loc[75,'Code']

'P_ID     P_Name     P_Description   P_Size 100      Moto          Mobile         16 200      Apple         Mobile         15 300      Oppo          Mobile         18 P_ID     List_Code      P_Amount      100      ALPHA           20000          100      BETA            60000   300      GAMMA           15000        P_ID     P_Name     P_Description   P_Size   ALPHA_price   BETA_price    GAMMA_price     100      Moto          Mobile         16       20000       60000           0     200      Apple         Mobile         15         0            0            0     300      Oppo          Mobile         18         0            0           15000'

In [20]:
# Title and parsed body of the first ten posts.
# NOTE(review): df_base is not used anywhere else in the visible notebook — confirm it is still needed.
df_base = df_bs4[:10][['Title','Bs4_Body']]
# Word tokenizer: every run of \w characters is one token.
tokenizer = RegexpTokenizer(r'\w+')

## Lemma vectorization

In [25]:

# Tfidf weights for titles: unigrams, at most 1000 features, fitted on the lemma title corpus.
# This rebinds title_vectorizer, which held the stem-based model until now.
# NOTE(review): unlike the stem cells, the dump below does not test save_vectorizer
# and always overwrites the file — confirm this is intended.
title_vectorizer = TfidfVectorizer(
    max_features=1000, ngram_range=(1, 1)
).fit(lemma_title_corpus)
dump(title_vectorizer, model_path + 'lemma_title_vectorizer.joblib')

['..\\data\\stackoverflow\\models\\lemma_title_vectorizer.joblib']

In [26]:
# Tfidf weights for bodies: unigrams, at most 5000 features, fitted on the lemma body corpus.
# This rebinds body_vectorizer, which held the stem-based model until now.
# NOTE(review): the dump below does not test save_vectorizer and always
# overwrites the file — confirm this is intended.
body_vectorizer = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 1)
).fit(lemma_body_corpus)
dump(body_vectorizer, model_path + 'lemma_body_vectorizer_11.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_vectorizer_11.joblib']

In [None]:
# Tfidf weights for bodies: 1- and 2-grams, at most 5000 features, fitted on the lemma body corpus.
# This rebinds body_vectorizer2, which held the stem-based model until now.
# NOTE(review): the dump below does not test save_vectorizer and always
# overwrites the file — confirm this is intended.
body_vectorizer2 = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 2)
).fit(lemma_body_corpus)
dump(body_vectorizer2, model_path + 'lemma_body_vectorizer_12.joblib')

In [28]:
# Tfidf weights for bodies: 1- to 3-grams, at most 5000 features, fitted on the lemma body corpus.
# This rebinds body_vectorizer3, which held the stem-based model until now.
# NOTE(review): the dump below does not test save_vectorizer and always
# overwrites the file — confirm this is intended.
body_vectorizer3 = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 3)
).fit(lemma_body_corpus)
dump(body_vectorizer3, model_path + 'lemma_body_vectorizer_13.joblib')

['..\\data\\stackoverflow\\models\\lemma_body_vectorizer_13.joblib']

In [19]:
# Raw HTML body of post 1500, the example followed through every preprocessing step below.
df_bs4.loc[1500,'Body']

'<p>I am trying to improve the animation performance of my custom modal view.</p>\n\n<p>By using .drawingGroup(), I\'ve managed to do so, but my TextField got disabled. Like so:</p>\n\n<p><a href="https://i.stack.imgur.com/gGWJIm.jpg" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/gGWJIm.jpg" alt="enter image description here"></a></p>\n\n<p>And I get the error: \'Unable to render flattened version of PlatformViewRepresentableAdaptor&lt;>\'</p>\n\n<p>Is there a way to fix this? </p>\n\n<p>Note: I am using a standard TextField.</p>\n\n<p>Thanks!</p>\n'

In [20]:
# Post 1500 after apply_bs4: prose only.
df_bs4.loc[1500,'Bs4_Body']

"I am trying to improve the animation performance of my custom modal view. By using .drawingGroup(), I've managed to do so, but my TextField got disabled. Like so: And I get the error: 'Unable to render flattened version of PlatformViewRepresentableAdaptor<>' Is there a way to fix this?  Note: I am using a standard TextField. Thanks!"

In [18]:
# Post 1500 body after stemming and stop-word removal, joined into one string.
' '.join(df_stop_stem.loc[1500,'Body'])

'tri improv anim perform custom modal view drawinggroup manag textfield got disabl like get error unabl render flatten version platformviewrepresentableadaptor fix note standard textfield thank'

In [21]:
# Post 1500 body after lemmatization and stop-word removal, joined into one string.
' '.join(df_lemma_stop.loc[1500,'Body'])

'improve animation performance custom modal view drawinggroup manage textfield get disabled like get error unable render flatten version platformviewrepresentableadaptor fix note standard textfield thanks'

In [28]:
# Every column of post 1500 in the parsed frame.
df_bs4.loc[1500].values

array([62187654, 'SwiftUI - Using drawingGroup() disables TextField',
       '<p>I am trying to improve the animation performance of my custom modal view.</p>\n\n<p>By using .drawingGroup(), I\'ve managed to do so, but my TextField got disabled. Like so:</p>\n\n<p><a href="https://i.stack.imgur.com/gGWJIm.jpg" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/gGWJIm.jpg" alt="enter image description here"></a></p>\n\n<p>And I get the error: \'Unable to render flattened version of PlatformViewRepresentableAdaptor&lt;>\'</p>\n\n<p>Is there a way to fix this? </p>\n\n<p>Note: I am using a standard TextField.</p>\n\n<p>Thanks!</p>\n',
       '<ios><swift><swiftui><metal>', '2020-06-04 05:48:15',
       "I am trying to improve the animation performance of my custom modal view. By using .drawingGroup(), I've managed to do so, but my TextField got disabled. Like so: And I get the error: 'Unable to render flattened version of PlatformViewRepresentableAdaptor<>' Is there a way to fi

In [22]:
# Title, parsed body and filtered tags of post 1500.
df_bs4.loc[1500][['Title','Bs4_Body','Bs4_Tags']].values

array(['SwiftUI - Using drawingGroup() disables TextField',
       "I am trying to improve the animation performance of my custom modal view. By using .drawingGroup(), I've managed to do so, but my TextField got disabled. Like so: And I get the error: 'Unable to render flattened version of PlatformViewRepresentableAdaptor<>' Is there a way to fix this?  Note: I am using a standard TextField. Thanks!",
       list(['ios', 'swift', 'swiftui'])], dtype=object)

In [26]:
# Stemmed title and body token lists of post 1500.
df_stop_stem.loc[1500].values

array([list(['swiftui', 'drawinggroup', 'disabl', 'textfield']),
       list(['tri', 'improv', 'anim', 'perform', 'custom', 'modal', 'view', 'drawinggroup', 'manag', 'textfield', 'got', 'disabl', 'like', 'get', 'error', 'unabl', 'render', 'flatten', 'version', 'platformviewrepresentableadaptor', 'fix', 'note', 'standard', 'textfield', 'thank'])],
      dtype=object)

In [25]:
# Lemmatized title and body token lists of post 1500.
df_lemma_stop.loc[1500].values

array([list(['swiftui', 'drawinggroup', 'disables', 'textfield']),
       list(['improve', 'animation', 'performance', 'custom', 'modal', 'view', 'drawinggroup', 'manage', 'textfield', 'get', 'disabled', 'like', 'get', 'error', 'unable', 'render', 'flatten', 'version', 'platformviewrepresentableadaptor', 'fix', 'note', 'standard', 'textfield', 'thanks'])],
      dtype=object)

In [61]:
# Reload the lemma body Tfidf vectorizer with ngram_range=(1,2) from its joblib dump.
body_vectorizer2 = load(model_path + 'lemma_body_vectorizer_12.joblib')

In [65]:
# Print "column index, Tfidf weight, term" for every feature of post 1500 weighted above 0.01.
sample_docs = df_lemma_stop.loc[1500:1500,'Body'].apply(lambda row: ' '.join(row))
arr = body_vectorizer2.transform(sample_docs).toarray()[0]
# Invert the term -> column mapping once, instead of scanning the whole
# vocabulary again for each selected column.
terms_by_column = {column: term for term, column in body_vectorizer2.vocabulary_.items()}
for cpt in np.flatnonzero(arr > .01):
    print(cpt, arr[cpt], terms_by_column[cpt])

298 0.2247638118007204 animation
1090 0.16772431423318251 custom
1303 0.2461088308155791 disabled
1469 0.09669143036839227 error
1776 0.14742098783593938 fix
1784 0.26438561991765835 flatten
1919 0.14708056130015343 get
1936 0.13432187054233213 get error
2217 0.21706951414720943 improve
2502 0.07138535086166545 like
2533 0.18274425691290203 like get
2728 0.1836641143424463 manage
2854 0.24960412964013937 modal
3002 0.15404651255352478 note
3233 0.19808481334881126 performance
3643 0.18046725103997097 render
4146 0.18575552857328104 standard
4383 0.559684803779218 textfield
4392 0.11562409550372246 thanks
4578 0.19011867181300837 unable
4775 0.14615244843873879 version
4784 0.15593227619314137 view
