# Final_app_df.csv creation process

In [1]:
import pandas as pd
import numpy as np
import umap
import json

from gensim.models import KeyedVectors

In [2]:
# Load the account-id -> metadata lookup used to annotate Account nodes.
# JSON is UTF-8 by specification; pin the encoding so that non-ASCII text
# decodes the same way regardless of the platform's default locale.
with open('user_lookup.json', 'r', encoding='utf-8') as f:
    user_lookup = json.load(f)

In [3]:
def get_visualization_df(embeddings, user_lookup, node_ids=None):
    """Build the per-node table consumed by the visualization app.

    Args:
        embeddings: 2-D array-like with exactly three columns per node; they
            become the ``x``, ``y`` and ``z`` columns.
        user_lookup: dict keyed by account id (the part of a node label after
            the first ``:``). Each value is expected to be a dict of account
            metadata; the keys ``Name``, ``Followers`` and ``Estimated reach``
            receive defaults when absent.
        node_ids: node labels of the form ``"<node_type>:<id>"``, one per row
            of ``embeddings`` and in the same order. When omitted, the
            notebook-level ``words`` list is used (the original behavior).

    Returns:
        A DataFrame with one row per node and the columns ``index`` (the node
        label), ``x``, ``y``, ``z``, ``size``, ``node_type``, followed by the
        metadata columns. Rows without metadata get ``''`` for ``Name`` and
        ``0`` for ``Followers`` / ``Estimated reach``.
    """
    if node_ids is None:
        # Backward compatibility: callers that pass only two arguments rely
        # on the module-level ``words`` list defined by the notebook.
        node_ids = words

    # list(...) drops any index name so reset_index() always yields 'index'.
    visuals = pd.DataFrame(embeddings, index=list(node_ids), columns=['x', 'y', 'z'])
    visuals['size'] = 0.01
    visuals['node_type'] = [label.split(':')[0] for label in visuals.index]
    visuals = visuals.reset_index()

    is_account = visuals['node_type'] == 'Account'
    records = []
    for label in visuals.loc[is_account, 'index']:
        # Split on the first ':' only, so ids that contain ':' stay intact.
        account_id = label.partition(':')[2]
        meta = user_lookup.get(account_id)
        if not isinstance(meta, dict):
            # Unknown account: keep the row and let the defaults below apply
            # instead of failing while building the metadata frame.
            meta = {}
        # Key each record by the full node label so the merge matches exactly.
        records.append({**meta, 'index': label})

    if records:
        final = visuals.merge(pd.DataFrame(records), on='index', how='left')
    else:
        # No Account nodes at all: there is nothing to merge.
        final = visuals

    defaults = {'Name': '', 'Followers': 0, 'Estimated reach': 0}
    for column, default in defaults.items():
        if column in final.columns:
            final[column] = final[column].fillna(default)
        else:
            # Column never appeared in the metadata; create it with the default.
            final[column] = default

    return final

In [4]:
# Load the saved node vectors. ``load_word2vec_format`` already returns a
# KeyedVectors object, so it is queried directly below rather than through
# the legacy ``.wv`` attribute (deprecated on KeyedVectors in gensim 3.x and
# not available in gensim 4).
model = KeyedVectors.load_word2vec_format("GGvec_model_v1.bin")

# The vocabulary mapping is ``key_to_index`` in gensim >= 4 and ``vocab`` in
# gensim 3.x; iterating either one yields the node labels.
vocab = getattr(model, "key_to_index", None)
if vocab is None:
    vocab = model.vocab
words = list(vocab)

# One row per node label, holding that node's embedding vector.
g2v_embeds = pd.DataFrame([model[word] for word in words], index=words)

# Reduce the embeddings to three dimensions for plotting.
# NOTE(review): no random_state is passed, so the layout is presumably not
# reproducible between runs -- confirm whether that matters for the app.
reducer = umap.UMAP(n_components=3, min_dist=0.25, n_neighbors=100)
embeddings = reducer.fit_transform(g2v_embeds)

  
  [truncated cell output] This is separate from the ipykernel package so we can avoid doing imports until ...


In [5]:
# Assemble the per-node table and write it out for the app.
final = get_visualization_df(embeddings, user_lookup)
# ``index`` is documented as a bool; False states explicitly that the row
# index is not written (the previous ``None`` only worked by being falsy).
final.to_csv('final_app_input_df_1.csv', index=False)