Take a variable `word_data` as input.  This is a list of text documents.  Then tokenize, stem and TF-IDF vectorize.  The output should be a set of TF-IDF vectors, one for each DOI.

This all takes place in-memory.  This may need to be edited if we want to grow the dataset to a larger size.  For now, it should give better speed.

In [1]:
# Announce this pipeline step and remember when it started; the last cell
# uses t_start (and the dt alias) to report the total runtime.
from datetime import datetime as dt

print('------------------------------------------------------',
      'Step 1:  Calculating TF-IDF data',
      sep='\n')
t_start = dt.now()
print(t_start,
      '------------------------------------------------------',
      sep='\n')

------------------------------------------------------
Step 1:  Calculating TF-IDF data
2018-02-21 21:10:11.838369
------------------------------------------------------


In [2]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import pickle
from bs4 import BeautifulSoup as bs
import string

### Import configuration
A separate file, `config.py`, defines the shared variables used by the various notebooks in this project.

In [3]:
# Pull the shared project settings into short local names. Every path used in
# this notebook comes from the Config object.
# NOTE(review): the inputs/outputs grouping below does not fully match how the
# names are used in this notebook (see the per-line notes).
from config import Config as c

# inputs

years = c.years  # not referenced in the cells of this notebook


word_datapath = c.word_datapath  # pickle read into word_data
tfidf_datapath = c.tfidf_datapath  # written (not read) when the TF-IDF matrix is pickled
labels_path = c.labels_path  # not referenced in the cells of this notebook

vectorizer_datapath = c.vectorizer_datapath  # written (not read) when the fitted vectorizer is pickled
filepaths_pkl = c.filepaths_pkl  # not referenced in the cells of this notebook

# outputs
dois_pkl = c.dois_pkl  # read (not written) here: pickle loaded into dois
all_dois_pkl = c.all_dois_pkl  # not referenced in the cells of this notebook
working_data = c.working_data  # CSV that is both read into `data` and written back
vocab_p = c.vocab_p  # pickle destination for the vectorizer vocabulary
idf_p = c.idf_p  # pickle destination for the IDF weights

In [4]:
# Restore the document list and the DOI list from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted source (presumably an earlier notebook in this project).
# NOTE(review): `dois` is reassigned from the working DataFrame a few cells
# later, so the value loaded here is only used until then.
with open(word_datapath, 'rb') as word_file:
    word_data = pickle.load(word_file)
with open(dois_pkl, 'rb') as doi_file:
    dois = pickle.load(doi_file)

### Create the template dataframe that will eventually take the data for visualisation

In [5]:
# Load the working table from CSV, using its first column as the index,
# and display its (rows, columns) shape as the cell output.
data = pd.read_csv(working_data, index_col=0)
data.shape

(18355, 27)

In [6]:
# Define the DOI list from the 'DI' column of the working table.
# This replaces the `dois` value loaded from the pickle earlier.
dois = data['DI'].tolist()

In [7]:
# Sanity check: there must be exactly one document per DOI. Presumably the two
# lists are paired by position downstream, so fail loudly on a mismatch rather
# than only displaying the two lengths.
assert len(dois) == len(word_data), (
    'dois and word_data differ in length: {} DOIs vs {} documents'.format(
        len(dois), len(word_data)))
len(dois), len(word_data)

(18355, 18355)

In [8]:
# write to file
data.to_csv(working_data)
print('DataFrame initialised')

DataFrame initialised


### Calculate TF-IDF data

In [9]:
print('Starting TF-IDF indexing.')
# Stop-word handling: scikit-learn's built-in English list plus project extras.
from sklearn.feature_extraction import text

# Boilerplate tokens to drop. 'nan' is included because there are 'NaN's in
# the WoS data.
my_words = ['et','al','use','article','introduction','abstract','title', 'nan']

# Add custom stopwords here.  E.g. ['sensor','network','data','node'] are so
# common in DSN that they appear in almost every paper and make it hard to
# differentiate between clusters.
custom_stops = []
my_words = my_words + custom_stops

# Final stop-word collection as a plain (mutable) set.
# NOTE(review): documents are passed through strip_stem before vectorizing;
# confirm these unstemmed stop words still match the stemmed tokens.
my_stop_words = set(text.ENGLISH_STOP_WORDS) | set(my_words)

Starting TF-IDF indexing.


In [10]:

# strip_stem comes from the project's own tools module.
from tools import strip_stem

# Apply strip_stem to every document, replacing word_data in place.
# NOTE(review): because word_data is overwritten, re-running this cell applies
# strip_stem to already-processed text; reload word_data first if re-running.
word_data = list(map(strip_stem, word_data))

In [11]:
# tf-idf vectorizer
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and accept only 'english', a list or None, so a set is rejected.
# Sorting makes the stored parameter deterministic.
#
# Optional tuning knobs, currently disabled:
#   max_df = 0.99        # ignore terms that occur in more than 99% of documents
#   min_df = 2           # ignore terms that occur in fewer than 2 documents
#   ngram_range = (1,2)  # also index bigrams; causes slow-down
vectorizer = TfidfVectorizer(
    stop_words=sorted(my_stop_words),
    use_idf=True,
    max_features=5000,  # save some memory and reduce dimensionality.
)

# Fit on the documents and build the sparse document-term TF-IDF matrix.
tfidf = vectorizer.fit_transform(word_data)
vocab = vectorizer.vocabulary_  # term -> column index mapping
idf = vectorizer.idf_           # learned IDF weight per column

shp = tfidf.shape
print('Shape of TF-IDF matrix:', shp)
print('I.e. {} documents with {} unique words indexed in them'.format(shp[0],shp[1]))

Shape of TF-IDF matrix: (18355, 5000)
I.e. 18355 documents with 5000 unique words in them


In [12]:
# vocab

In [13]:
# Pairwise cosine similarities between documents (disabled, kept for reference).
# Note these are similarities rather than distances: the product of the TF-IDF
# matrix with its transpose is a similarity (hence the name cosine_sims).
# sim = tfidf*tfidf.T
# cosine_sims = sim.todense() # this can be memory-hungry and isn't needed for the visualisation.
print('TF-IDF indexing complete. Writing to file.')

TF-IDF indexing complete. Writing to file.


In [14]:
print('Writing data to file')
# Pickle each artefact to its configured path. Each file is opened in a `with`
# block so the handle is flushed and closed even if pickling raises; the
# original passed bare open(...) calls and never closed them.
for artifact, artifact_path in (
        (tfidf, tfidf_datapath),            # TF-IDF matrix
        (vectorizer, vectorizer_datapath),  # fitted TfidfVectorizer
        (vocab, vocab_p),                   # vocabulary mapping
        (idf, idf_p),                       # IDF weights
):
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)

Writing data to file


In [15]:
# Report the finish time and the total runtime since t_start was recorded.
t = dt.now()
elapsed = t - t_start
print('Done at ', t, 'in ', elapsed)

Done at  2018-02-21 21:18:20.592957 in  0:08:08.754588
