In [1]:
import umap
import numpy as np
import pandas as pd

import networkx as nx
from gensim.models import KeyedVectors
from nodevectors import Node2Vec, GGVec

import plotly.express as px

In [2]:
## Loding the data from various sources

In [3]:
brand_ids = pd.read_csv('./SocialTalk/Clean/brand_ids.csv', low_memory=False)
account_brands = pd.read_csv('./SocialTalk/Clean/accounts-brands.csv', low_memory=False)

interest_ids = pd.read_csv('./SocialTalk/Clean/interest_ids.csv', low_memory=False)
account_interests = pd.read_csv('./SocialTalk/Clean/accounts-interests.csv', low_memory=False)

city_ids = pd.read_csv('./SocialTalk/Clean/city_ids.csv', low_memory=False)
country_ids = pd.read_csv('./SocialTalk/Clean/country_ids.csv', low_memory=False)

users = pd.read_csv('./SocialTalk/Clean/users.csv', low_memory=False)

In [4]:
## adding a label or brand <column name>:value

In [5]:
brand_ids['label'] = brand_ids['Name'].apply(lambda x: f'Brand:{x}')
account_brands['Account Id'] = account_brands['Account Id'].apply(lambda x: f'Account:{x}')
account_brands['Brand Id'] = account_brands['Brand Id'].map(brand_ids.set_index('Id')['label'].to_dict())

interest_ids['label'] = interest_ids['Name'].apply(lambda x: f'Interest:{x}')
account_interests['Account Id'] = account_interests['Account Id'].apply(lambda x: f'Account:{x}')
account_interests['Interest Id'] = account_interests['Interest Id'].map(interest_ids.set_index('Id')['label'].to_dict())

# city_ids['label'] = city_ids['Name'].apply(lambda x: f'City:{x}')
# country_ids['label'] = country_ids['Name'].apply(lambda x: f'Country:{x}')

users['label'] = users['Id'].apply(lambda x: f'Account:{x}')

In [6]:
## converting zero values to 0.1 and taking log scale

In [7]:
numeric_data = users.iloc[:, 4:29]

take_the_log = [
    'Followers', 'Following', 'Posts', 
    'Engagement', 'Estimated reach', 'Estimated impressions', 
    'Avg. likes per post', 'Avg. engagement per post',
    'Avg. comments per post', 'Avg. views per video',
    'Cost per story (MIN)', 'Cost per story (MAX)'
    ]
for column in take_the_log:
    numeric_data[column] = numeric_data[column].apply(lambda x: 0.1 if (x == 0) else x)
    numeric_data[column] = numeric_data[column].apply(np.log)

In [8]:
## converted all the values into 3 bins and assigned label as low, average & high

In [9]:
numeric_as_categorical = numeric_data.apply(lambda x: pd.cut(x, 3, labels=["low", "average", "high"], duplicates='drop'))

In [10]:
## adding names for catagorical values by mapping it their code

In [11]:
clean_df = users.iloc[:, :4]
# clean_df = pd.concat([users.iloc[:, :4], numeric_as_categorical], axis=1)
clean_df['Id'] = clean_df['Id'].apply(lambda x: f'Account:{x}')
clean_df = clean_df.rename(columns={'Country id': 'Country', 'City id': 'City'}).set_index('Id')

clean_df['City'] = clean_df['City'].map(city_ids.set_index('Id')['Name'].to_dict())
clean_df['Country'] = clean_df['Country'].map(country_ids.set_index('Id')['Name'].to_dict())

In [12]:
## creating edges from users data

In [13]:
edges = []
for column in clean_df.columns:
    series_tuples = [
        (account, f'{column}:{value}') for account, value in clean_df[column].items()
        ]
    edges.extend(series_tuples)

In [14]:
## creating edges from country to city

In [15]:
location_hierarchy = [
    (f'Country:{country}', f'City:{city}') for country, city in clean_df[['Country', 'City']].values
    if (not country.endswith('UNKNOWN') and not city.endswith('UNKNOWN'))
]
edges.extend(location_hierarchy)

In [16]:
## creating edges from brands data

In [17]:
edges += [i for i in account_brands.set_index('Account Id')['Brand Id'].items()]

In [18]:
## creating edges from Interests

In [19]:
edges += [i for i in account_interests.set_index('Account Id')['Interest Id'].items()]

In [20]:
## Adding edges from account

In [21]:
temp = users.set_index('label').iloc[:, 29:-2]
for column in temp:
    series_tuples = [
        (account, f'{column}:True') for account, value in temp[column].items()
        ]
    edges.extend(series_tuples)

In [22]:
## removing all the edges that have value as low (numerical columns)

In [23]:
e2 = [i for i in edges if not i[1].endswith('low')]
len(e2)

454023

### Generate node embeddings

In [24]:
G = nx.from_pandas_edgelist(pd.DataFrame(e2, columns=['source', 'target']))

In [None]:
g2v = Node2Vec()
n2v_embeddings = g2v.fit(G)

# Save model to gensim.KeyedVector format
g2v.save_vectors("wheel_model.bin")

# # load in gensim
# model = KeyedVectors.load_word2vec_format("wheel_model.bin")

Making walks... 

In [1]:
## word:embedding  stored in key value pair

In [1]:
words = list(g2v.model.wv.vocab)
g2v_embeds = pd.DataFrame([g2v.model.wv[i] for i in words], index=words)

NameError: name 'g2v' is not defined