# Tag vectors

To run UMAP and clustering, we need a data set that has the tags as its index and their vector representations as its columns. This notebook generates that `.csv` file. In the process, each tag word is mapped to a 300-dimensional vector.

In [1]:
from babylon.preprocess import GloveVectorizer
import pandas as pd
from collections import defaultdict
from keras.preprocessing.text import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Build the project's GloveVectorizer, requesting the 'en_core_web_lg' model by name.
# NOTE(review): presumably this loads spaCy's large English model with 300-d
# vectors -- confirm against babylon.preprocess.GloveVectorizer.
glove_vectorizer = GloveVectorizer(model_name='en_core_web_lg')

In [3]:
# Raw input: one row per company/funding record, including the tag text.
df1 = pd.read_csv('companies_funding_tag_reshape_description_short_description_ver2.csv')

# Collapse to one row per (investor, clean_url, tag_reshape) combination;
# `short_description` ends up holding the number of non-null descriptions
# in each group.
df_main = (
    df1
    .groupby(by=['investor', 'clean_url', 'tag_reshape'], as_index=False)[['short_description']]
    .count()
)

# Length of each tag value as returned by len() (character count for strings).
df_main['tag_len'] = df_main['tag_reshape'].apply(len)

# Independent working copy, so later column additions leave df_main untouched.
df_index = df_main.copy()

In [4]:
# Fit a Keras Tokenizer on the tag text to build the token -> integer-id vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_index.tag_reshape)

# Encode every row exactly once and reuse the result for both the DataFrame
# column and the standalone `sequences` list. The original called
# texts_to_sequences twice on the same input, doing the same work twice.
# Note: the column and `sequences` now refer to the same inner lists.
sequences = tokenizer.texts_to_sequences(df_index.tag_reshape)
df_index['tag_token'] = sequences

# Vocabulary learned by the tokenizer and its size.
word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % vocab_size)


Found 3323 unique tokens.


In [5]:
# Keep only the vocabulary entries whose token is not a spaCy English stop
# word; each surviving token keeps the integer id it had in `word_index`.
word_index_nostop = defaultdict(
    float,
    (
        (token, token_id)
        for token, token_id in word_index.items()
        if token not in STOP_WORDS
    ),
)
# Number of tokens left after stop-word removal.
len(word_index_nostop)

3271

In [6]:
# Reverse lookup for the filtered vocabulary: integer id -> token.
idx2word = dict(
    (token_id, token) for token, token_id in word_index_nostop.items()
)
# Sanity check: show the first row's token ids as one comma-separated string.
', '.join(str(token_id) for token_id in sequences[0])

'18, 125, 2133, 2134, 977, 633, 135, 3, 39, 1295, 28, 20, 98, 9, 1, 26, 6'

In [7]:
# Map every non-stop-word token to its embedding vector.
# `dic` is kept as a defaultdict(float) for compatibility with the original;
# be aware that reading a missing key would silently insert 0.0.
dic = defaultdict(float)
for word in idx2word.values():
    try:
        # transform() receives a one-element list; keep its first (only) result.
        dic[word] = glove_vectorizer.transform([word])[0]
    except Exception:
        # Best effort: report the token that could not be vectorized and carry
        # on. Catching Exception (not a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate, so the cell can still be interrupted.
        print (word)

In [8]:
# Turn the token -> vector mapping into a table: with orient='index' the dict
# keys (tokens) become the row index and each vector component becomes a column.
df_index_wordim = pd.DataFrame.from_dict(dic, orient='index')
# Display (n_tokens, vector_length) as a quick check.
df_index_wordim.shape

(3271, 300)

In [9]:
# Preview the first rows of the token-vector table.
df_index_wordim.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
software,-0.56298,0.030181,0.18973,-0.37291,-0.13457,0.33238,0.094127,-0.54725,0.4374,1.0045,...,-0.6708,0.005053,-0.046027,1.1469,0.22121,-0.16935,0.11114,0.09371,-0.10934,-0.14476
mobile,-0.14581,0.36688,0.31404,0.33013,0.55531,-0.57337,0.42209,-0.33195,0.24277,1.1408,...,-0.60999,-0.62099,-0.028642,0.34581,-0.17122,-0.069529,0.15257,0.024808,-0.58463,0.50071
health,-0.32881,0.21108,0.043552,0.13979,-0.52884,-0.051644,-0.33082,-0.12381,-0.027482,3.2725,...,0.092175,0.054948,-0.003367,-0.076881,-0.05531,0.057904,0.016996,-0.23823,0.15785,0.24124
social,0.13362,0.47258,0.19699,0.04076,-0.46243,-0.060541,0.011001,0.23503,0.4257,3.0092,...,-0.52795,-0.005432,0.474,0.50459,0.27856,0.23862,-0.036539,-0.075791,-0.45408,0.16939
media,-0.28111,0.23404,0.42548,0.037192,-0.33655,0.041839,0.043703,-0.27556,0.090093,2.2483,...,-0.66212,-0.26457,0.17922,-0.14027,0.43472,-0.12136,-0.44727,-0.1057,-0.21601,0.062703


In [10]:
# Persist the token-vector table; the index (the tokens) is written as the
# first CSV column. NOTE: to_csv does not create directories, so ./data must
# already exist.
df_index_wordim.to_csv('./data/tag_glove_word2vec_ver2.csv')