Take a variable `word_data` as input: a list of text documents. Tokenize, stem and TF-IDF vectorize them. The output should be a set of TF-IDF vectors, one for each DOI.

This all takes place in memory. This may need to be changed if we want to grow the dataset to a larger size. For now, it should give better speed.

In [24]:
from datetime import datetime as dt

# Announce the step and record when it began; the final cell of the notebook
# subtracts t_start from the current time to report the total runtime.
rule = '------------------------------------------------------'
print(rule)
print('Step 1:  Calculating TF-IDF data')
t_start = dt.now()
print(t_start)
print(rule)

------------------------------------------------------
Step 1:  Calculating TF-IDF data
2018-02-16 13:16:36.531119
------------------------------------------------------


In [25]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import pickle
from bs4 import BeautifulSoup as bs
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

### Import configuration
A separate file, config.py, is used to create shared variables for the various notebooks in this project.

In [26]:
from config import Config as c

# Local aliases for the values shared across notebooks via Config.

# Pickles this notebook reads: the document texts and the matching DOI list.
word_datapath = c.word_datapath
dois_pkl = c.dois_pkl

# CSV this notebook reads into a DataFrame and then writes back.
working_data = c.working_data

# Pickles this notebook writes: the TF-IDF matrix and the fitted vectorizer.
tfidf_datapath = c.tfidf_datapath
vectorizer_datapath = c.vectorizer_datapath

# Shared settings that are not referenced by the cells visible in this notebook.
years = c.years
labels_path = c.labels_path
filepaths_pkl = c.filepaths_pkl
all_dois_pkl = c.all_dois_pkl

In [27]:
# Text-normalisation helpers shared by strip_stem.
tokenizer = RegexpTokenizer(r'\w+')   # tokens are runs of word characters, so punctuation is dropped
stemmer = SnowballStemmer("english")

def strip_stem(s):
    """Return ``s`` with punctuation removed and every word stemmed.

    The text is split into tokens by the module-level ``tokenizer``, each
    token is passed through the module-level ``stemmer``, and the stems are
    joined back together with single spaces.
    """
    stems = map(stemmer.stem, tokenizer.tokenize(s))
    return ' '.join(stems)

In [28]:
# Read the pickled document texts and the corresponding DOI list.
# NOTE(review): pickle.load can execute code embedded in the file, so these
# paths should only ever point at files produced by this project.
# `dois` is rebound further down from the 'DI' column of the working data.
with open(word_datapath, 'rb') as word_file:
    word_data = pickle.load(word_file)
with open(dois_pkl, 'rb') as dois_file:
    dois = pickle.load(dois_file)

### Create the template dataframe that will eventually take the data for visualisation

In [29]:
# Load the working-data table, taking the first CSV column as the index,
# then display its (rows, columns) shape as a quick check.
data = pd.read_csv(working_data, index_col=0)
data.shape

(7710, 11)

In [30]:
# Define the DOI list from the 'DI' column of the working data (presumably the
# DOI field - confirm against the data source). This replaces any earlier
# binding of `dois`, so the list follows the DataFrame's row order.
doi_column = data['DI']
dois = list(doi_column)

In [31]:
# The TF-IDF matrix gets one row per entry of word_data, and those rows are
# meant to correspond to the entries of dois. Fail loudly on a length mismatch
# instead of relying on someone noticing it in the displayed tuple.
assert len(dois) == len(word_data), (
    'dois has {} entries but word_data has {}'.format(len(dois), len(word_data)))
len(dois), len(word_data)

(7710, 7710)

In [32]:
# Write the DataFrame back to the same CSV path it was read from.
data.to_csv(path_or_buf=working_data)
print('DataFrame initialised')

DataFrame initialised


### Calculate TF-IDF data

In [33]:
print('Starting TF-IDF indexing.')
from sklearn.feature_extraction import text

# Boilerplate tokens to ignore. 'nan' is listed because there are 'NaN's in
# the WoS data.
my_words = ['et', 'al', 'use', 'article', 'introduction', 'abstract', 'title', 'nan']

# Add custom stopwords here.  E.g. ['sensor','network','data','node'] are so
# common in DSN that they appear in almost every paper and make it hard to
# differentiate between clusters.
custom_stops = ['4th']
my_words.extend(custom_stops)

# Final stop-word set: scikit-learn's built-in English list plus the words above.
my_stop_words = set(text.ENGLISH_STOP_WORDS).union(my_words)

Starting TF-IDF indexing.


In [34]:
# TF-IDF vectorizer.
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and accept only 'english', a list or None, so a set is rejected.
# Sorting keeps the list order deterministic between runs.
vectorizer = TfidfVectorizer(
    stop_words=sorted(my_stop_words),
    max_df=0.99,         # drop terms that occur in more than 99% of the documents
    min_df=2,            # drop terms that occur in fewer than 2 documents
    ngram_range=(1, 2),  # unigrams and bigrams: improves model for small datasets, but causes slow-down
)
tfidf = vectorizer.fit_transform(word_data)
shp = tfidf.shape  # (number of documents, number of vocabulary terms)
print('Shape of TF-IDF matrix:', shp)
# With ngram_range=(1, 2) the "words" counted below include two-word phrases.
print('I.e. {} documents with {} unique words in them'.format(shp[0],shp[1]))

Shape of TF-IDF matrix: (7710, 84)
I.e. 7710 documents with 84 unique words in them


In [35]:
# Pairwise cosine similarities between documents (similarities, not distances).
# The vectorizer is built without a `norm` argument, so its default L2 row
# normalisation applies and the product of tfidf with its transpose gives the
# cosine of each document pair.
# The result is kept sparse: sim.todense() can be memory-hungry and isn't
# needed for the visualisation.
sim = tfidf @ tfidf.T
print('TF-IDF indexing complete. Writing to file.')

TF-IDF indexing complete. Writing to file.


In [36]:
# Persist the TF-IDF matrix and the fitted vectorizer. Each file is opened in
# a `with` block so the handle is flushed and closed even if pickling fails.
# The dense cosine-similarity matrix is deliberately not saved.
with open(tfidf_datapath, 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open(vectorizer_datapath, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [37]:
# Report the wall-clock time for the whole step, measured from t_start.
elapsed = dt.now() - t_start
print('Step 1 completed in ', elapsed)

Step 1 completed in  0:00:17.742964
