In [1]:
#-- read data
# http://mlg.ucd.ie/datasets/bbc.html
import pandas as pd
import numpy as np
# Load the article table and store the "category" column as a pandas categorical.
data = pd.read_feather("data/raw_text_DF.feather").assign(
    category=lambda frame: pd.Categorical(frame["category"])
)
print(data.shape)
# X holds the raw article text, Y the one-hot encoded category labels.
X, Y = data["text"], pd.get_dummies(data["category"])


(2225, 3)


In [2]:
#-- create graph from text
from nltk import word_tokenize 
from nltk.util import ngrams
import nltk
import networkx as nx 
import pandas as pd 
# Fetch NLTK's "punkt" tokenizer data; presumably needed by word_tokenize below.
# The call only reports "already up-to-date" when the package is installed locally.
nltk.download('punkt')

def text2graph(text):
    """Build an undirected word-adjacency graph from a text.

    The text is tokenized with ``word_tokenize``; every pair of adjacent
    tokens (bigram) becomes an edge, except pairs in which either token is a
    single character (e.g. punctuation marks).

    Parameters
    ----------
    text : str
        Raw text to convert.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are tokens and whose edges join tokens that occur
        next to each other in ``text``. The graph is empty when no bigram
        survives the filter (empty text, one-word text, ...).
    """
    #-- pre-processing text (steps below are not implemented yet)
    # TODO: lemmatization
    # TODO: named entity recognition
    # TODO: insertion of correct entity names
    # split text into tokens
    token = word_tokenize(text)
    # extract bigrams, skipping those where either word has only one character;
    # filtering happens after pairing, so neighbours of a dropped token are
    # NOT joined to each other
    cor_bigrams = [(a, b) for a, b in ngrams(token, 2) if len(a) > 1 and len(b) > 1]
    # create graph directly from the edge list: unlike a DataFrame round trip
    # this also works when the edge list is empty
    G = nx.Graph()
    G.add_edges_from(cor_bigrams)
    return G

[nltk_data] Downloading package punkt to /home/robert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#-- graph visualization with matplotlib
from networkx.drawing.nx_agraph import graphviz_layout
# text2graph returns a single networkx graph, so bind it directly
# (unpacking it into three names would iterate over the graph's nodes).
G = text2graph(X[1])
# node positions computed by Graphviz, reused by the HoloViews plot below
pos = graphviz_layout(G)
nx.draw_networkx(G,pos)

#-- graph visualization with HoloViews
import holoviews as hv
import networkx as nx
import numpy as np
from holoviews import opts
# Render with the Bokeh backend.
hv.extension("bokeh")

# One size/padding setting shared by the graph, its edges and its nodes.
defaults = {"width": 800, "height": 400, "padding": 0.1}
hv.opts.defaults(
    *[element(**defaults) for element in (opts.EdgePaths, opts.Graph, opts.Nodes)]
)

# Wrap the networkx graph using the precomputed layout and enable the hover tool.
hv_graph = hv.Graph.from_networkx(G, pos).opts(tools=['hover'])
# Text labels positioned by the nodes' x/y coordinates, showing the 'index' value.
labels = hv.Labels(hv_graph.nodes, ['x', 'y'], 'index')

# Overlay the styled labels on the graph; the last expression is the cell output.
label_style = {"text_font_size": '8pt', "text_color": 'white', "bgcolor": 'grey'}
hv_graph * labels.opts(**label_style)

In [3]:
#-- feature matrix
from gensim.models import KeyedVectors, Word2Vec
def featureMat(NodeLabels, model=None):
    """Look up a word-embedding vector for every node label.

    Parameters
    ----------
    NodeLabels : iterable of str
        Words (graph node labels) to embed.
    model : optional
        Pre-loaded embedding model supporting ``word in model``,
        ``model[word]`` and ``model.vector_size`` (e.g. gensim
        ``KeyedVectors``). When omitted, the GoogleNews word2vec binary is
        loaded from ``./models`` on every call, so pass an already loaded
        model when calling this function repeatedly.

    Returns
    -------
    pandas.DataFrame
        One row per label, in input order, one column per embedding
        dimension. Labels missing from the model get a vector drawn
        uniformly from [0, 1).

    Notes
    -----
    The fallback vectors come from NumPy's global random state; call
    ``np.random.seed(...)`` beforehand if reproducible features are needed.
    """
    if model is None:
        # Load vectors directly from the file
        model = KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin.gz', binary=True)

    # Query the KeyedVectors object itself: the `.wv` indirection is deprecated
    # on KeyedVectors and `.vocab` no longer exists in gensim 4.
    dim = model.vector_size
    X_list = [
        model[x] if x in model else np.random.uniform(0, 1, dim)
        for x in NodeLabels
    ]

    X_df = pd.DataFrame(X_list)
    return X_df


In [48]:
#-- SAVE adj, features, label
import pickle
from datetime import datetime as dt
# Timestamp embedded in every file name written during this run.
now = dt.now()
stamp = now.strftime("%H%M_%d%m%Y")

# Node labels of each document's graph, one list per document.
all_words = []

for ind in range(X.shape[0]):
    # Build the graph before opening the file so that a failure does not
    # leave an empty pickle behind. Positional indexing keeps the text
    # aligned with the label row read via Y.iloc below.
    Graph = text2graph(X.iloc[ind])
    all_words.append(list(Graph.nodes))
    #featureMat_DF = featureMat(list(Graph.nodes))

    # Each file holds two consecutive pickles: the graph, then its one-hot label.
    with open("data/txt_graph" + stamp + "_" + str(ind) + ".pkl", 'wb') as file:
        pickle.dump(Graph, file)
        #pickle.dump(featureMat_DF, file)
        pickle.dump(Y.iloc[ind], file)
print("All files saved")


All files saved


In [49]:
import itertools

# Flatten the per-document word lists into one sorted vocabulary of unique words.
# NOTE: this rebinds `all_words`; re-running this cell without re-running the
# graph-saving cell first would iterate over the words' characters instead.
all_words = list(np.unique(list(itertools.chain.from_iterable(all_words))))

# One embedding row per vocabulary word, indexed by the word itself.
features = featureMat(all_words)
features_df = features.set_index(pd.Index(all_words))

# `now` and `ind` still hold the values left by the graph-saving cell, so the
# file name carries that run's timestamp and the index of the last document.
with open("data/txt_features"+ now.strftime("%H%M_%d%m%Y") + "_" + str(ind) + ".pkl",'wb') as file:
    pickle.dump(features_df, file)

  if x in model.wv.vocab:
  X_list.append(np.random.uniform(0,1,model.wv.vector_size))


In [50]:
# Display the feature table: rows are indexed by vocabulary word, columns are embedding dimensions.
features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
'',0.134238,0.614712,0.229754,0.787038,0.738351,0.622969,0.561182,0.093135,0.794703,0.647377,...,0.363498,0.686550,0.290047,0.435609,0.464822,0.371547,0.578090,0.292221,0.200488,0.335411
''Even,0.079450,0.850494,0.551948,0.178366,0.390620,0.712151,0.241915,0.688053,0.869095,0.987329,...,0.429693,0.993799,0.366470,0.458229,0.320827,0.370254,0.387786,0.026508,0.006633,0.562966
''He,0.715784,0.873069,0.402127,0.014309,0.964218,0.923781,0.176768,0.073993,0.770096,0.795799,...,0.900870,0.654848,0.539773,0.672799,0.919793,0.742262,0.285095,0.599295,0.061122,0.946930
''In,0.154379,0.255390,0.931025,0.851989,0.639176,0.468131,0.596269,0.521741,0.523098,0.236744,...,0.772684,0.136805,0.444482,0.799542,0.592306,0.554817,0.681949,0.886949,0.908571,0.269454
''It,0.837654,0.080038,0.460388,0.600121,0.176164,0.041295,0.442117,0.915124,0.365624,0.444456,...,0.848127,0.578103,0.557958,0.947494,0.645317,0.103456,0.834517,0.859539,0.333570,0.966699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
£947m,0.716746,0.786566,0.116657,0.459093,0.242848,0.945598,0.604611,0.745379,0.932881,0.109141,...,0.452819,0.140867,0.184504,0.159926,0.270947,0.876330,0.278408,0.803102,0.252672,0.449429
£960m,0.525947,0.618147,0.158790,0.348821,0.325491,0.896572,0.432771,0.250209,0.516744,0.249320,...,0.658306,0.568092,0.072483,0.797568,0.066129,0.526903,0.122358,0.296979,0.846190,0.798663
£99,0.445855,0.645774,0.075657,0.461442,0.600318,0.634374,0.103850,0.248687,0.742835,0.163335,...,0.076406,0.653440,0.386991,0.610227,0.090448,0.297715,0.000125,0.698327,0.572891,0.615582
£9m,0.439667,0.893191,0.385765,0.725628,0.010089,0.805906,0.149038,0.513043,0.656461,0.262018,...,0.721255,0.581445,0.745674,0.456768,0.903641,0.565707,0.007451,0.172798,0.052545,0.378954
