In [None]:
import os
import pickle

import google.auth
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
def authenticate_to_gcp(project='research-prototypes', path_to_creds='~/creds/ssikdar-creds.json'):
    """Resolve Google Cloud credentials through Application Default Credentials.

    Args:
        project: Value exported as ``GOOGLE_CLOUD_PROJECT`` before the lookup.
        path_to_creds: Optional path to a service-account key file. A leading
            ``~`` is expanded. When the file exists it is exported as
            ``GOOGLE_APPLICATION_CREDENTIALS``; when it is falsy or missing,
            the lookup falls back to whatever credentials the environment
            already provides.

    Returns:
        The ``(credentials, project_id)`` tuple returned by ``google.auth.default()``.
    """
    if path_to_creds:
        # google.auth reads GOOGLE_APPLICATION_CREDENTIALS (not GOOGLE_APP_CREDENTIALS)
        # and does not expand "~" itself.
        key_file = os.path.expanduser(path_to_creds)
        # Only point ADC at the key file when it is really there, so machines
        # without this file keep using their ambient credentials.
        if os.path.isfile(key_file):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_file
    os.environ['GOOGLE_CLOUD_PROJECT'] = project
    creds, project_id = google.auth.default()
    return creds, project_id

In [None]:
# Resolve credentials and project id once; the BigQuery reads in this notebook reuse them.
creds, proj = authenticate_to_gcp()

Dataset details before cleaning

| Name         | Users | Links |
|--------------|-------|-------|
| Grenoble, FR | 44k   | 350k  |
| Waterloo, CA | 47k   | 650k  |
| Uppsala, SE  | 65k   | 1.6M  |
| Vancouver, CA| 500k  |  14M  |



In [None]:
# Dataset selector: every candidate is listed, and the LAST assignment is the
# one that takes effect. Reorder the lines to process a different city.
dataset, loc = 'grenoble', 'FR'
dataset, loc = 'waterloo', 'CA'
dataset, loc = 'uppsala', 'SE'

In [None]:
# Pull the full user-attribute table for the selected dataset from BigQuery.
query = (
    'SELECT * FROM '
    f'`research-prototypes.attributed_user_graphs.{dataset}_{loc}_user_attributes_20200720`'
)
print(query)
user_attr_df = pd.read_gbq(query, project_id=proj, credentials=creds)

In [None]:
# Preview the first rows of the raw user-attribute table.
user_attr_df.head()

In [None]:
# Gender distribution in the raw table (horizontal bars; ';' hides the Axes repr).
user_attr_df['gender'].value_counts().plot.barh();

In [None]:
# Keep users whose gender is known AND whose phone is verified.
cleaned_user_attr_df = user_attr_df[
    (user_attr_df['gender'] != 'UNKNOWN')        # drop the UNKNOWN gender
    & (user_attr_df['isPhoneVerified'] == True)  # only verified phones
]

In [None]:
# Gender distribution after cleaning.
cleaned_user_attr_df['gender'].value_counts().plot.barh();

In [None]:
# Device-type distribution after cleaning.
cleaned_user_attr_df['deviceType'].value_counts().plot.barh();

In [None]:
# Email-verification distribution after cleaning.
cleaned_user_attr_df['isEmailVerified'].value_counts().plot.barh();

In [None]:
# Row counts before and after cleaning, plus the fraction of users retained.
l1, l2 = len(user_attr_df), len(cleaned_user_attr_df)
print(l1, l2, l2 / l1)

Dataset details after dropping users with unknown gender and keeping only users with verified phones

| Name          | Users       | Directed Links | LCC nodes | LCC Links |
|---------------|-------------|----------------|-----------|-----------|
| Grenoble, FR  | 33.8k (77%) | 304k           | 17k       | 191k      |
| Waterloo, CA  | 36k (75%)   |                | 23k       | 188k      |
| Uppsala, SE   | 53k (80%)   |                | 44k       | 593k      |
| Vancouver, CA |             |                |           |           |



In [None]:
# Persist the cleaned user table. Create the output directory first so the
# export does not fail when ./cleaned-data is absent (e.g. on a fresh checkout).
os.makedirs('./cleaned-data', exist_ok=True)
cleaned_user_attr_df.to_csv(f'./cleaned-data/{dataset}_users_cleaned_all.csv', index=False)

 -------

In [None]:
# Read the edge list for the selected dataset from BigQuery.
query = (
    'SELECT * FROM '
    f'`research-prototypes.attributed_user_graphs.{dataset}_{loc}_edgelist_20200720`'
)
print(query)
edges_df = pd.read_gbq(query, project_id=proj, credentials=creds)

In [None]:
# Show the first edges and the (rows, columns) shape of the raw edge table.
display(edges_df.head(), edges_df.shape)

In [None]:
# Remove exact duplicate rows so each edge is listed once (drops parallel edges).
edges_df = edges_df.drop_duplicates()

In [None]:
# Shape of the edge table after removing duplicate rows.
edges_df.shape

In [None]:
# Build an undirected simple graph from the 'src' -> 'dst' edge table.
whole_g = nx.from_pandas_edgelist(edges_df, 'src', 'dst', create_using=nx.Graph)

In [None]:
# nx.info() was removed in NetworkX 3.0, so report the headline numbers with
# methods that exist in every NetworkX release.
print(f'{whole_g.number_of_nodes()} nodes, {whole_g.number_of_edges()} edges')
print(nx.number_connected_components(whole_g), 'components')

In [None]:
# Drop self-loops; the loop edges are collected into a list before removal.
whole_g.remove_edges_from(list(nx.selfloop_edges(whole_g)))

In [None]:
# Node/edge counts after removing self-loops. nx.info() was removed in
# NetworkX 3.0, so the counts are printed with version-independent methods.
print(f'{whole_g.number_of_nodes()} nodes, {whole_g.number_of_edges()} edges')

In [None]:
# Keep only the cleaned users that actually appear as nodes in the graph.
cleaned_user_attr_df = cleaned_user_attr_df.loc[
    cleaned_user_attr_df['ghost_user_id'].isin(whole_g.nodes)
]

In [None]:
# Graph nodes that have no row in the cleaned attribute table.
nodes_wo_attrs = set(whole_g.nodes).difference(cleaned_user_attr_df['ghost_user_id'])

In [None]:
# Report how many graph nodes lack a row in the cleaned attribute table.
print(f'{len(nodes_wo_attrs)} nodes dont have user attrs')

In [None]:
# Work on a copy so whole_g stays intact, then drop every node without attributes.
filtered_g = whole_g.copy()
filtered_g.remove_nodes_from(nodes_wo_attrs)

In [None]:
# Pick the connected component with the most nodes and materialise it as an
# independent graph (subgraph() alone returns a view onto filtered_g).
lcc = max(nx.connected_components(filtered_g), key=len)
g_lcc = filtered_g.subgraph(lcc).copy()

In [None]:
# Compare the attribute-filtered graph with its largest connected component.
# nx.info() was removed in NetworkX 3.0, so the counts are printed with
# methods available in every NetworkX release.
for label, graph in (('filtered graph', filtered_g), ('largest connected component', g_lcc)):
    print(f'{label}: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges')

In [None]:
# Attribute rows for the LCC nodes. Select from the CLEANED table rather than
# the raw user_attr_df: g_lcc was built from cleaned users only, and going back
# to the raw table could re-admit rows removed by cleaning (UNKNOWN gender,
# unverified phone) if a ghost_user_id appears more than once in the raw data.
lcc_node_attrs_df = cleaned_user_attr_df[cleaned_user_attr_df.ghost_user_id.isin(set(g_lcc.nodes))]

In [None]:
# Sanity check: number of LCC nodes next to the (rows, columns) of its attribute table.
g_lcc.order(), lcc_node_attrs_df.shape

In [None]:
# Gender counts among the LCC nodes.
lcc_node_attrs_df.gender.value_counts()

In [None]:
# Node/edge counts of the LCC. nx.info() was removed in NetworkX 3.0, so the
# counts are printed with version-independent methods.
print(f'{g_lcc.number_of_nodes()} nodes, {g_lcc.number_of_edges()} edges')

In [None]:
# Save the LCC as a plain edge list; data=False writes only the node pairs.
nx.write_edgelist(g_lcc, f'./cleaned-data/{dataset}_lcc.g', data=False)

In [None]:
# Save the attribute rows of the LCC nodes next to the edge list (no index column).
lcc_node_attrs_df.to_csv(f'./cleaned-data/{dataset}_lcc_node_attrs.csv', index=False)

In [None]:
# add attributes to the graph -- get the attribute values as dictionaries
# NOTE(review): `d` is not read by any of the cells visible after this one; the
# per-attribute lookups are built separately from the DataFrame rows. Confirm
# whether this conversion is still needed.
d = lcc_node_attrs_df.to_dict()

In [None]:
# Peek at the first five LCC nodes together with their attribute dicts.
list(g_lcc.nodes(data=True))[: 5]

In [None]:
# Build one {ghost_user_id: value} lookup per attribute to attach to the LCC nodes.
attrs = ('gender', 'age_bucket', 'deviceType')
node_attr_dicts = {
    attr: dict(zip(lcc_node_attrs_df['ghost_user_id'], lcc_node_attrs_df[attr]))
    for attr in attrs
}

In [None]:
# Number of node ids that received a gender value.
len(node_attr_dicts['gender'])

In [None]:
# Attach each attribute lookup to the LCC nodes (same order as before:
# gender, deviceType, age_bucket).
for attr_name in ('gender', 'deviceType', 'age_bucket'):
    nx.set_node_attributes(g_lcc, node_attr_dicts[attr_name], name=attr_name)

In [None]:
# Peek at the first five LCC nodes again to see the attributes set on them.
list(g_lcc.nodes(data=True))[: 5]

In [None]:
# Persist the attributed LCC. nx.write_gpickle() was removed in NetworkX 3.0;
# pickling the graph directly works on every NetworkX release.
with open(f'./cleaned-data/{dataset}_lcc_attr.gpickle', 'wb') as gpickle_file:
    pickle.dump(g_lcc, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the graph to check the round trip. nx.read_gpickle() was removed in
# NetworkX 3.0, so load it with pickle directly.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open(f'./cleaned-data/{dataset}_lcc_attr.gpickle', 'rb') as gpickle_file:
    g_lcc_unpickled = pickle.load(gpickle_file)

In [None]:
# Inspect the RELOADED graph (not the in-memory g_lcc) so this cell actually
# verifies that node attributes survived the pickle round trip.
list(g_lcc_unpickled.nodes(data=True))[: 4]

In [None]:
# Node and edge counts of the reloaded graph.
g_lcc_unpickled.order(), g_lcc_unpickled.size()