In [71]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np
import pickle
import dgl
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

# Load data

In [None]:
#https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file/32216025

def save_list_or_dict_to_pkl(obj, obj_name):
    """Pickle ``obj`` to ``obj/<obj_name>.pkl`` under the current working directory.

    Args:
        obj: Any picklable object (typically a list or dict).
        obj_name: File stem; ``".pkl"`` is appended to build the file name.

    Returns:
        The status string ``"saved <obj_name>.pkl"``.
    """
    pkl_output_file_name = obj_name + ".pkl"
    # Create the output directory on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs('obj', exist_ok=True)
    with open('obj/' + pkl_output_file_name, 'wb') as f:
        pickle.dump(obj, f)
    return "saved " + pkl_output_file_name

def load_pkl_list_or_dict(obj_name):
    """Return the object stored in ``obj/<obj_name>.pkl``.

    The path is relative to the current working directory.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source.
    """
    source_path = 'obj/' + (obj_name + ".pkl")
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Network to DGLGraph

In [34]:
# `path` is assigned here as well as in the "Rename labels" cell: this cell
# comes first in the notebook, so relying on the later assignment raises
# NameError on Restart & Run All. Keep the two values identical.
path = '/home/sam/neu/gml/data/facebook'

# Edge list (columns id_1, id_2) -> undirected NetworkX graph.
edges_df = pd.read_csv(os.path.join(path, 'facebook_large/musae_facebook_edges.csv'))
G_facebook = nx.from_pandas_edgelist(edges_df, 'id_1', 'id_2')
# Converting an undirected graph to a DiGraph stores every edge in both directions.
digraph = nx.DiGraph(G_facebook)
# NOTE(review): DGLGraph() followed by .from_networkx() is the legacy DGL API;
# newer releases expose dgl.from_networkx() instead. Confirm the installed DGL
# version before changing this call.
K = dgl.DGLGraph()
K.from_networkx(digraph)
# The graph is pickled wrapped in a single-element list.
dgl_g = [K]
with open(os.path.join(path, 'graph_dgl.pkl'), 'wb') as f:
    pickle.dump(dgl_g, f)



# Rename labels

In [19]:
path = '/home/sam/neu/gml/data/facebook'
target_csv = os.path.join(path, 'facebook_large/musae_facebook_target.csv')
labels_df = pd.read_csv(target_csv)

# The position of a page type in this list is its integer class id.
labels_key = ['tvshow', 'government', 'company', 'politician']
labels_df['page_type_int'] = labels_df['page_type'].map(labels_key.index)

# Node names are the page id prefixed with '0_'.
labels_df['subgraph_node'] = labels_df['id'].map(lambda node_id: '0_' + str(node_id))

# node name -> integer class id, persisted next to the raw data.
labels_dict = {
    node_name: class_id
    for node_name, class_id in zip(labels_df['subgraph_node'], labels_df['page_type_int'])
}
with open(path + '/label.pkl', 'wb') as f:
    pickle.dump(labels_dict, f)

Unnamed: 0,id,facebook_id,page_name,page_type,page_type_int
0,0,145647315578475,The Voice of China 中国好声音,tvshow,0
1,1,191483281412,U.S. Consulate General Mumbai,government,1
2,2,144761358898518,ESET,company,2
3,3,568700043198473,Consulate General of Switzerland in Montreal,government,1
4,4,1408935539376139,Mark Bailey MP - Labor for Miller,politician,3


# Create network features

In [67]:
# Text clean-up used to build the bag-of-words feature vectors
stops = stopwords.words('english')
# Set copy for O(1) membership tests; `stops` itself stays the original list.
_stop_set = frozenset(stops)

def text_preprocess(text: str) -> str:
    """Lower-case ``text``, blank out non-letters and drop English stop words.

    Every character outside a-z / A-Z is replaced by a single space, so
    digits, punctuation and non-Latin script are removed. The result is split
    on single spaces, which means runs of replaced characters leave empty
    tokens that re-join as repeated spaces.

    Args:
        text: Raw string to clean.

    Returns:
        The remaining words joined by single-space separators.
    """
    # Ignoring case
    text = text.lower()
    # Ignoring punctuation (and anything else that is not an ASCII letter)
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Ignoring frequent words that carry little information (stop words such as "a", "of")
    words = [word for word in text.split(' ') if word not in _stop_set]
    return ' '.join(words)


In [114]:
clean_title_text = labels_df['page_name'].apply(text_preprocess)
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
# max_features caps the vocabulary at the 256 most frequent terms.
vectorizer = CountVectorizer(analyzer = 'word',
                            tokenizer = None,
                            preprocessor = None,
                            stop_words = None,
                            max_features = 256)
# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms the input text into feature vectors.
# The input to fit_transform should be an iterable of strings.
train_data_features = vectorizer.fit_transform(clean_title_text)

# Numpy arrays are easy to work with, so convert the sparse result to an array
train_data_features = train_data_features.toarray()
# Row-wise normalization (scikit-learn's default norm is L2)
train_data_features_normed = normalize(train_data_features)
print('Bag of words completed')
# One feature row per page, stored alongside the labels
labels_df['vector'] = list(train_data_features_normed)

# The normalized matrix already is the 2-D feature array, so copy it directly
# instead of rebuilding it row by row from the DataFrame column.
features = np.array(train_data_features_normed)
np.save(path + '/features.npy', features)

Bag of words completed


# Create train.csv, test.csv, and val.csv

In [43]:
# Print the number of pages per page type, in order of first appearance.
page_types = labels_df['page_type']
for page_type in page_types.unique():
    n_pages = (page_types == page_type).sum()
    print(page_type, n_pages)

tvshow 3327
government 6880
company 6495
politician 5768


In [52]:
# labels_dict maps node name -> integer label; reshape it into a two-column frame.
df = (
    pd.DataFrame.from_dict(labels_dict, orient='index')
    .reset_index()
    .rename(columns={"index": "name", 0: "label"})
)

# Labels 2 and 3 are used for training; labels 0 and 1 are held out.
train_label = [2, 3]
other_label = [0, 1]

train_df = df[df['label'].isin(train_label)]
other_df = df[df['label'].isin(other_label)]

# Held-out rows are cut in file order: first half -> validation, second half -> test.
half = len(other_df) // 2
val_df = other_df.iloc[:half]
test_df = other_df.iloc[half:]

# Each split is written with a fresh 0..n-1 index.
for split_df, csv_name in ((train_df, '/train.csv'), (val_df, '/val.csv'), (test_df, '/test.csv')):
    split_df.reset_index(drop = True).to_csv(path + csv_name)

In [45]:

# Display the validation split for a quick visual check.
val_df

Unnamed: 0,name,label
0,0_0,0
1,0_1,1
3,0_3,1
8,0_8,1
9,0_9,1
...,...,...
22456,0_22456,1
22460,0_22460,1
22463,0_22463,1
22467,0_22467,1
