In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from collections import OrderedDict
from tqdm import tqdm
import math
import networkx as nx
import os
import pickle

# Fetch the Punkt tokenizer data needed by the nltk.word_tokenize calls made
# later in this notebook; when the package is already present locally the call
# just reports that it is up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sanshrav1311/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def save_as_pickle(filename, data):
    """Serialise ``data`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    """
    with open(filename, mode='wb') as handle:
        writer = pickle.Pickler(handle)
        writer.dump(data)

In [3]:
def nCr(n,r):
    """Return the binomial coefficient "n choose r" as an exact integer.

    The value is built with integer arithmetic only, so it stays exact for
    arbitrarily large results (a float division would round anything above
    2**53 and overflow for very large coefficients).

    Parameters
    ----------
    n : int
        Size of the set; must be non-negative.
    r : int
        Number of items chosen; must be non-negative.

    Returns
    -------
    int
        Number of ways to choose ``r`` items out of ``n``; ``0`` when
        ``r > n`` (there is no such selection).

    Raises
    ------
    ValueError
        If ``n`` or ``r`` is negative.
    """
    if n < 0 or r < 0:
        raise ValueError("nCr() not defined for negative values")
    if r > n:
        return 0
    # C(n, r) == C(n, n - r): iterate over the smaller of the two.
    r = min(r, n - r)
    result = 1
    for k in range(1, r + 1):
        # After this step result == C(n - r + k, k), so the floor division
        # is always exact.
        result = result * (n - r + k) // k
    return result

In [4]:
def word_word_edges(p_ij):
    """Build the weighted word-word edge list from a pairwise score matrix.

    Parameters
    ----------
    p_ij : pandas.DataFrame
        Square frame of pairwise scores whose row and column labels are the
        words. Rows are matched to columns by label.

    Returns
    -------
    list of tuple
        One ``(w1, w2, {"weight": score})`` entry for every unordered pair of
        distinct columns whose score at row ``w1`` / column ``w2`` is strictly
        positive. ``w1`` and ``w2`` are the column labels converted with
        ``str``; pairs are produced in column order. A frame with fewer than
        two columns yields an empty list.
    """
    labels = [str(w) for w in p_ij.columns]
    n_words = len(labels)

    # Scores are read by position below, so make sure row order follows the
    # column order. A label missing from the index becomes NaN after the
    # reindex and is therefore never emitted as an edge.
    if not p_ij.index.equals(p_ij.columns):
        p_ij = p_ij.reindex(index=p_ij.columns)
    # One array extraction up front replaces two label-based scalar lookups
    # per pair.
    scores = p_ij.to_numpy()

    # Number of unordered pairs, n * (n - 1) / 2; this is 0 (not an error)
    # when there are fewer than two words.
    total_pairs = n_words * (n_words - 1) // 2

    word_word = []
    for (i, w1), (j, w2) in tqdm(combinations(enumerate(labels), 2), total=total_pairs):
        score = scores[i, j]
        if score > 0:
            word_word.append((w1, w2, {"weight": score}))
    return word_word

In [6]:
# Load the cleaned PubMed table; index_col=False tells pandas not to use the
# first column as the index.
df = pd.read_csv(
    'PubMedDataClean.csv',
    index_col=False,
)

In [7]:
# Order the rows by the 'Contextual' column (ascending, the pandas default).
df = df.sort_values(by='Contextual')

In [8]:
def _sliding_window_pmi(token_docs, names, window):
    """Estimate the word-word PMI matrix from sliding windows over documents.

    Parameters
    ----------
    token_docs : iterable of list of str
        Tokenised documents; every token must be present in ``names``.
    names : list of str
        Vocabulary; fixes the row/column order of the result.
    window : int
        Number of consecutive tokens per window.

    Returns
    -------
    pandas.DataFrame
        ``len(names) x len(names)`` frame holding
        ``log(p_ij / (p_i * p_j) + 1E-9)``, where the probabilities are the
        fractions of windows containing the pair / the single word.

    Raises
    ------
    ValueError
        If no window could be formed (every document is empty).
    """
    word2index = {name: index for index, name in enumerate(names)}
    n_words = len(names)
    occurrences = np.zeros((n_words, n_words), dtype=np.int32)
    window_counts = np.zeros(n_words, dtype=np.int64)
    no_windows = 0

    for tokens in token_docs:
        if not tokens:
            continue
        # A document no longer than the window forms exactly one window;
        # a longer one yields len - window + 1 windows, so the trailing
        # tokens are covered as well.
        n_starts = max(len(tokens) - window, 0) + 1
        for start in range(n_starts):
            no_windows += 1
            # Distinct words of this window, as vocabulary positions.
            ids = sorted({word2index[w] for w in tokens[start:start + window]})
            window_counts[ids] += 1
            for i1, i2 in combinations(ids, 2):
                occurrences[i1, i2] += 1
                occurrences[i2, i1] += 1

    if no_windows == 0:
        raise ValueError("no sliding window could be built: all documents are empty")

    p_i = window_counts / no_windows
    pmi = occurrences / no_windows
    del occurrences
    # Divide every column, then every row, by the marginal probability.
    pmi /= p_i[np.newaxis, :]
    pmi /= p_i[:, np.newaxis]
    # The small offset keeps the logarithm finite for pairs that never
    # share a window.
    pmi += 1E-9
    np.log(pmi, out=pmi)
    return pd.DataFrame(pmi, index=names, columns=names)


def generate_text_graph(df, columnName, window=10):
    """Build a document/word graph for one text column and pickle it.

    Nodes are the documents (integers ``0 .. len(df) - 1`` in the row order
    of ``df``) and the vocabulary tokens produced by ``nltk.word_tokenize``.
    A document is linked to each word it contains with the TF-IDF value as
    ``weight``; two words are linked when their sliding-window PMI is
    positive, with the PMI as ``weight``. The graph is written to
    ``"text_graph_<columnName>.pkl"`` in the working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the documents.
    columnName : str
        Name of the column of ``df`` that contains the text of each document.
    window : int, optional
        Sliding-window length (in tokens) used for the co-occurrence
        statistics. Defaults to 10.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.

    Notes
    -----
    Document-word pairs whose TF-IDF value is zero (the word does not occur
    in the document) get no edge; adding them would create one edge for
    every (document, word) combination.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    documents = df[columnName]

    vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=nltk.word_tokenize,lowercase=False)
    # Keep the TF-IDF matrix sparse; COO form exposes the non-zero entries
    # as parallel (row, col, value) arrays.
    tfidf = vectorizer.fit_transform(documents).tocoo()
    names = vectorizer.get_feature_names_out().tolist()

    # Tokenise lazily so only one document's tokens are held at a time.
    token_docs = (nltk.word_tokenize(text) for text in tqdm(documents, total=len(documents)))
    pmi = _sliding_window_pmi(token_docs, names, window)

    G = nx.Graph()
    G.add_nodes_from(range(tfidf.shape[0])) ## document nodes
    G.add_nodes_from(names) ## word nodes
    ### build edges between document-word pairs
    document_word = [(int(doc), names[col], {"weight": weight})
                     for doc, col, weight in zip(tfidf.row, tfidf.col, tfidf.data)
                     if weight != 0]
    word_word = word_word_edges(pmi)
    G.add_edges_from(document_word)
    G.add_edges_from(word_word)
    save_as_pickle("text_graph_%s.pkl" % columnName, G)

In [None]:
# Build and pickle one text graph per cleaned text field.
TEXT_COLUMNS = ("TITLE_CLEAN", "KEYWORDS_CLEAN", "ABSTRACT_CLEAN")
for column in TEXT_COLUMNS:
    generate_text_graph(df=df, columnName=column)

100%|█████████████████████████████████████| 8833/8833 [00:04<00:00, 1972.92it/s]
