In [124]:
#-- read data
# http://mlg.ucd.ie/datasets/bbc.html
import pandas as pd
import numpy as np
# Load the raw BBC articles; the printed shape is (rows, columns).
data = pd.read_feather("data/raw_text_DF.feather")
# Store the label column as a categorical so the one-hot encoding below
# gets one column per category.
data = data.assign(category=pd.Categorical(data["category"]))
print(data.shape)
# X: raw article text, Y: one-hot encoded category labels (row-aligned with X).
X, Y = data["text"], pd.get_dummies(data["category"])


(2225, 3)


In [125]:
#-- create graph from text
from nltk import word_tokenize 
from nltk.util import ngrams
import networkx as nx 
import pandas as pd 

def text2graph(text):
    """Build an undirected word co-occurrence graph from a raw text.

    The text is split into tokens with NLTK's ``word_tokenize``; every pair
    of consecutive tokens (bigram) becomes an edge, except pairs in which
    either token is a single character.

    Pre-processing steps that are planned but not implemented yet:
    lemmatization, named entity recognition, insertion of correct entity
    names.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    tuple
        ``(A, nodes, G)`` where ``G`` is the ``networkx.Graph``, ``nodes`` is
        the list of its nodes (tokens) and ``A`` is the result of
        ``nx.adjacency_matrix(G, nodes)``, i.e. rows/columns follow the order
        of ``nodes``.

    Raises
    ------
    ValueError
        If the text yields no usable bigram (e.g. empty text or only
        single-character tokens), so no graph can be built.
    """
    # split text into tokens
    tokens = word_tokenize(text)
    # extract bigrams, skipping those where either word has only one character
    edges = [(a, b) for a, b in ngrams(tokens, 2) if len(a) > 1 and len(b) > 1]
    if not edges:
        # Fail with a clear message instead of an opaque KeyError raised
        # while building the graph from an empty edge table.
        raise ValueError("text2graph: text contains no bigram of multi-character tokens")
    # create graph directly from the edge list; duplicate bigrams collapse
    # into a single undirected edge
    G = nx.Graph()
    G.add_edges_from(edges)
    nodes = list(G.nodes)
    A = nx.adjacency_matrix(G, nodes)
    return (A, nodes, G)


#-- graph visualization with matplotlib
# Example plot for a single document: the entry of X labelled 1.
# NOTE(review): nx_agraph is networkx's pygraphviz interface, so this import
# presumably needs pygraphviz/Graphviz installed -- confirm in the environment.
from networkx.drawing.nx_agraph import graphviz_layout
# The adjacency matrix is discarded here; V (node list) and G (graph) are kept.
_, V, G = text2graph(X[1])
# Node positions are computed once and reused by both plots below.
pos = graphviz_layout(G)
nx.draw_networkx(G,pos)

#-- graph visualization with HoloViews
# NOTE(review): networkx and numpy are already imported in earlier cells;
# the re-imports below are redundant but harmless.
import holoviews as hv
import networkx as nx
import numpy as np
from holoviews import opts
# Select the bokeh plotting backend for HoloViews.
hv.extension("bokeh")

# Common size/padding options applied to edge paths, graphs and nodes.
defaults = dict(width=800, height=400, padding=0.1)
hv.opts.defaults(
    opts.EdgePaths(**defaults), opts.Graph(**defaults), opts.Nodes(**defaults))

# Same graph and layout as the matplotlib plot, with a hover tool enabled.
hv_graph = hv.Graph.from_networkx(G, pos).opts(tools=['hover'])
# Text labels built from the graph's node table: 'x'/'y' are the key
# dimensions and 'index' is the value dimension
# (presumably the node name -- TODO confirm).
labels = hv.Labels(hv_graph.nodes, ['x', 'y'], 'index')

# Overlay of graph and labels; as the last expression of the cell it is
# what the notebook displays.
(hv_graph * labels.opts(text_font_size='8pt', text_color='white', bgcolor='grey'))

In [126]:
#-- feature matrix
from gensim.models import KeyedVectors, Word2Vec
_W2V_MODEL_PATH = 'models/simple_word2vec.model'
# Models already loaded in this kernel session, keyed by file path.
_W2V_MODEL_CACHE = {}


def _load_word2vec(path=_W2V_MODEL_PATH):
    """Load the Word2Vec model stored at ``path`` once and reuse it afterwards."""
    if path not in _W2V_MODEL_CACHE:
        _W2V_MODEL_CACHE[path] = Word2Vec.load(path)
    return _W2V_MODEL_CACHE[path]


def featureMat(NodeLabels, model=None):
    """Build the node feature matrix for a list of node labels (words).

    Each label is mapped to its word2vec embedding. Labels missing from the
    model vocabulary get a random vector drawn uniformly from [0, 1) with the
    same dimensionality; these vectors are NOT seeded, so they differ between
    calls and between occurrences of the same unknown word.

    Parameters
    ----------
    NodeLabels : iterable of str
        Node labels, in the order the rows should appear.
    model : optional
        Object exposing word vectors as ``model.wv`` (supporting ``in``,
        ``[]`` and ``vector_size``). When omitted, the Word2Vec model at
        'models/simple_word2vec.model' is loaded on first use and cached, so
        repeated calls do not reload it from disk.

    Returns
    -------
    pandas.DataFrame
        One row per label, one column per embedding dimension.
    """
    # Alternative source (not used): pre-trained GoogleNews vectors loaded
    # with KeyedVectors.load_word2vec_format(..., binary=True).
    if model is None:
        model = _load_word2vec()
    # Go through the keyed vectors for both the membership test and the
    # lookup: indexing the model object itself (model[x]) is deprecated.
    vectors = model.wv
    rows = [
        vectors[label] if label in vectors
        else np.random.uniform(0, 1, vectors.vector_size)
        for label in NodeLabels
    ]
    return pd.DataFrame(rows)


In [None]:
#-- SAVE adj, features, label
# Writes one pickle file per document. Each file holds three objects dumped
# one after another: adjacency matrix, feature DataFrame, one-hot label row.
# Reading a file back therefore takes three successive pickle.load() calls
# on the same handle, in that order.
import pickle
from datetime import datetime as dt

# A single timestamp for the whole run, so all files share the same prefix.
now = dt.now()
run_stamp = now.strftime("%H%M_%d%m%Y")

for ind in range(X.shape[0]):
    # Build everything before opening the output file, so a failure here
    # does not leave an empty .pkl behind. X and Y are both indexed by
    # position to guarantee text and label come from the same row.
    Adj, vertex_labels, Graph = text2graph(X.iloc[ind])
    featureMat_DF = featureMat(vertex_labels)

    out_path = "data/txt_graph" + run_stamp + "_" + str(ind) + ".pkl"
    # The context manager closes the file even if a dump raises.
    with open(out_path, 'wb') as out_file:
        pickle.dump(Adj, out_file)
        pickle.dump(featureMat_DF, out_file)
        pickle.dump(Y.iloc[ind], out_file)
print("All files saved")



































In [None]:
# Number of documents in X (the save loop writes one file per document).
len(X)