## Texts to one huge graph

In [1]:
#-- read data
# http://mlg.ucd.ie/datasets/bbc.html
import itertools
import pandas as pd
import numpy as np
import networkx as nx 
import sys
import scipy
#-- create graph from text
import nltk
from nltk import word_tokenize 
from nltk.util import ngrams
nltk.download('punkt')

data = pd.read_feather("../data/raw_text_DF.feather")
data["category"] = pd.Categorical(data["category"])
print(data.shape)
X = data["text"]
X_tokenized = [word_tokenize(text.lower()) for text in X]

Y = pd.get_dummies(data["category"])


[nltk_data] Downloading package punkt to /home/robert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(2225, 3)


In [2]:
Y.iloc[2093,:]

business         0
entertainment    1
politics         0
sport            0
tech             0
Name: 2093, dtype: uint8

## Create word-word matrix

In [3]:
def word_word_matrix(tokenized_texts, words_order):
    unigram_freq = pd.value_counts(list(itertools.chain.from_iterable(tokenized_texts)))
    # unigram_freq = unigram_freq[:10]
    unigram_prob = unigram_freq /float(sum(unigram_freq))
    unigram_prob_matrix = np.matmul(np.expand_dims(unigram_prob, 1), np.expand_dims(unigram_prob, 1).T)

    bigrams = [list(ngrams(text, 2)) for text in tokenized_texts]
    bigrams = list(itertools.chain.from_iterable(bigrams))

    bigram_freq = pd.value_counts(bigrams)

    bigram_freq_set = {}
    for a,b in zip(bigram_freq.index, bigram_freq):
        bigram_freq_set[a] = int(b)


    bigram_matrix = np.zeros((len(words_order),len(words_order)))

    for i in range(len(words_order)):
        word1 = words_order[i]
        for j in range(i+1, len(words_order)):
            word2 = words_order[j]
            try: 
                bi_coo1 = bigram_freq_set[(word1, word2)] 
            except:
                bi_coo1 = 0
            try:    
                bi_coo2 = bigram_freq_set[(word2, word1)] + bi_coo1
            except:
                bi_coo2 = 0 + bi_coo1

            bigram_matrix[i,j] = bi_coo2
            bigram_matrix[j,i] = bi_coo2

    bigram__prob_matrix = bigram_matrix / float(sum(bigram_freq)) 

    adj_matrix = np.log(bigram__prob_matrix / unigram_prob_matrix)
    adj_matrix[adj_matrix < 0] = 0
    adj_matrix
    return(adj_matrix)

In [4]:
def freq_words(tokenized_texts):
    unique_freq_tokens = pd.value_counts(list(itertools.chain.from_iterable(tokenized_texts)))
    unique_freq_tokens = unique_freq_tokens[unique_freq_tokens>10][10:]
    unique_freq_tokens = list(unique_freq_tokens.index)
    unique_freq_tokens = [t for t in unique_freq_tokens if t.isalnum()]
    return(unique_freq_tokens)

In [5]:
useful_tokens = freq_words(X_tokenized)
X_tokenized_filtered = []
for text in X_tokenized: X_tokenized_filtered.append([t for t in text if t in useful_tokens])
word_word_adj = word_word_matrix(X_tokenized_filtered, useful_tokens)

  adj_matrix = np.log(bigram__prob_matrix / unigram_prob_matrix)


In [6]:
for i in range(word_word_adj.shape[0]):
    word_word_adj[i,i] = 0 

## Create word-document matrix

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(vocabulary=useful_tokens)
filtered_text = [" ".join(text) for text in X_tokenized_filtered]
doc_word_adj = tfidf.fit_transform(filtered_text)
word_doc_adj = doc_word_adj.T

## Create document-document matrix

In [8]:
doc_doc_adj = np.zeros((Y.shape[0],Y.shape[0]))

## Create Graph Adjacency matrix

In [9]:
# word_word_adj word_doc_adj 
# doc_word_adj doc_doc_adj
col1 = np.row_stack((word_word_adj, doc_word_adj.todense()))
col2 = np.row_stack((word_doc_adj.todense(), doc_doc_adj))
graph_adj = np.column_stack((col1,col2))
graph_adj_df = pd.DataFrame(graph_adj, columns= useful_tokens + list(range(Y.shape[0])), index = useful_tokens + list(range(Y.shape[0])))

In [10]:
Graph = nx.from_pandas_adjacency(graph_adj_df)

In [11]:
nx.write_graphml(Graph, "./Textgraph.graphml")

In [12]:
import pickle
from datetime import datetime as dt
now = dt.now() 
file = open("./"+ now.strftime("%H%M_%d%m%Y") + "_" + str("global_adj_graph") + ".pkl",'wb')

pickle.dump(Graph, file)
#pickle.dump(featureMat_DF, file)
pickle.dump(Y, file)
file.close()