In [76]:
import gensim
import pandas
import nltk

import os

# Directories (relative to the notebook's working directory); both are created
# with os.makedirs(..., exist_ok=True) further down in this cell.
dataDir = 'data'
modelsDir = 'models'

# Names of the three tab-separated input files expected inside dataDir.
littleFile = 'little.tsv'
someFile = 'some.tsv'
mostlyFile = 'mostly.tsv'

# File name, inside modelsDir, that the Word2Vec model is saved to / loaded from.
w2vFname = 'word2vec.bin'

# True  -> retrain Word2Vec from the loaded frames and overwrite the saved model.
# False -> load the previously saved model instead of retraining.
regenW2V = False

def tokenizer(target):
    """Lower-case `target`, word-tokenize it with NLTK and drop bare '.' tokens."""
    words = nltk.word_tokenize(target.lower())
    return list(filter(lambda word: word != '.', words))

def sentinizer(sent):
    """Split `sent` into sentences with NLTK and run `tokenizer` on each one.

    Returns a list with one token list per sentence.
    """
    sentences = nltk.sent_tokenize(sent)
    return list(map(tokenizer, sentences))

def genVecSeq(target, model):
    """Turn a text into the sequence of word vectors of its known tokens.

    `target` is tokenized with `tokenizer`; each token is looked up in
    `model.wv`. Tokens whose lookup raises KeyError are skipped, so the
    returned list may be shorter than the token list (or empty).
    """
    found = []
    for token in tokenizer(target):
        try:
            vector = model.wv[token]
        except KeyError:
            # Token not known to the model: leave it out of the sequence.
            continue
        else:
            found.append(vector)
    return found

def genWord2Vec(*dfs):
    """Train a Word2Vec model on the titles and abstracts of the given frames.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames with a 'title' and an 'abstract' text column.

    Returns
    -------
    gensim.models.Word2Vec
        Model trained on one token list per title plus one per abstract sentence.

    Notes
    -----
    Titles are tokenized with `tokenizer`, the same function `genVecSeq` uses
    for lookups. Splitting on whitespace instead leaves punctuation attached
    to words (e.g. 'copulas:'), producing vocabulary entries that a
    `tokenizer`-based lookup can never hit.
    """
    vocab = []
    for df in dfs:
        # Each title is treated as a single sentence.
        vocab.extend(df['title'].apply(tokenizer))
        # Each abstract yields a list of sentences; flatten with extend rather
        # than Series.sum(), which concatenates lists quadratically and returns
        # 0 (not a list) for an empty frame.
        for sentences in df['abstract'].apply(sentinizer):
            vocab.extend(sentences)

    model = gensim.models.Word2Vec(vocab,
        hs = 1, #Hierarchical softmax is better for infrequent words
        size = 200, #Dim
        window = 5, #Might want to increase this
        min_count = 0, #Keep every token, however rare
        max_vocab_size = None,
        workers = 8, #My machine has 8 hyperthreads
        )
    return model

os.makedirs(dataDir, exist_ok = True)
os.makedirs(modelsDir, exist_ok = True)

# Load the three corpora using the file-name constants defined above instead of
# repeating the paths as string literals.
dfs = {
    'little' : pandas.read_csv(os.path.join(dataDir, littleFile), sep='\t'),
    'some' : pandas.read_csv(os.path.join(dataDir, someFile), sep='\t'),
    'most' : pandas.read_csv(os.path.join(dataDir, mostlyFile), sep='\t'),
}

# Single source of truth for where the Word2Vec model lives on disk.
w2vPath = os.path.join(modelsDir, w2vFname)

if regenW2V:
    print("Generating Word2Vec")
    w2v = genWord2Vec(*dfs.values())
    w2v.save(w2vPath)
else:
    w2v = gensim.models.Word2Vec.load(w2vPath)
print(w2v)

# Attach, per row, the sequence of word vectors for the title and the abstract.
# Note: this adds columns to the frames stored in `dfs` in place.
for name, df in dfs.items():
    print("Generating vecs for: {}".format(name))
    df['title_tokenize'] = df['title'].apply(lambda x : genVecSeq(x, w2v))
    df['abstract_tokenize'] = df['abstract'].apply(lambda x : genVecSeq(x, w2v))

#print(dfs['little'])
# `df` is used by later exploratory cells as the default frame.
df = dfs['little']

Word2Vec(vocab=86676, size=200, alpha=0.025)
Generating vecs for: little
Generating vecs for: some
Generating vecs for: most


In [79]:
import torch 
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable


# Hyper Parameters
# Later cells reshape each batch with .view(-1, sequence_length, input_size),
# i.e. every image is presented to the RNN as sequence_length steps of
# input_size features.
sequence_length = 28
input_size = 28
hidden_size = 128
num_layers = 2
# NOTE(review): num_classes is not passed to BiRNN anywhere in this notebook.
num_classes = 10
batch_size = 100
num_epochs = 2
learning_rate = 0.003

# MNIST Dataset
# Stored under ./data/; download=True is only set on the training split.
train_dataset = dsets.MNIST(root='./data/',
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='./data/',
                           train=False, 
                           transform=transforms.ToTensor())

# Data Loader (Input Pipeline)
# Training batches are shuffled; test batches keep dataset order.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, 
                                          shuffle=False)

In [84]:
# Pull a single (images, labels) batch from the training loader.
i, l = next(iter(train_loader))

In [94]:
i.view(-1, sequence_length, input_size)


( 0 ,.,.) = 
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 1 ,.,.) = 
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 2 ,.,.) = 
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           ...         

In [116]:
m = Variable(i[0].view(-1, sequence_length, input_size))

In [117]:
m.data.size()

torch.Size([1, 28, 28])

In [102]:
rnn(Variable(i[0].view(-1, sequence_length, input_size)))

Variable containing:
1.00000e-02 *
  8.8333  6.6464
[torch.FloatTensor of size 1x2]

In [171]:
import neuralnet

eta = 0.001
def trainModel(dfPostive, dfNegative, nSteps = 100):
    """Train a bidirectional RNN to separate two sets of abstracts.

    Parameters
    ----------
    dfPostive : pandas.DataFrame
        Rows labelled 1. Must have an 'abstract_tokenize' column holding, per
        row, a list of word vectors.
    dfNegative : pandas.DataFrame
        Rows labelled 0, same layout.
    nSteps : int, optional
        Number of single-example optimisation steps (default 100, the value
        that used to be hard-coded).

    Returns
    -------
    The trained `neuralnet.BiRNN` instance (previously the model was discarded).

    Side effects
    ------------
    Adds a 'vals' label column to both input frames in place and prints the
    loss after every step.
    """
    import numpy as np
    from sklearn.utils import shuffle

    # Explicit int64 so torch.from_numpy yields a LongTensor on every platform;
    # cross_entropy requires integer class-index targets.
    dfPostive['vals'] = [np.array([1], dtype=np.int64) for _ in range(len(dfPostive))]
    dfNegative['vals'] = [np.array([0], dtype=np.int64) for _ in range(len(dfNegative))]

    # pandas.concat replaces DataFrame.append, which newer pandas removed.
    df = pandas.concat([dfPostive, dfNegative], ignore_index=True)
    df = shuffle(df)

    # Only move to the GPU when one is actually available.
    useCuda = torch.cuda.is_available()

    # presumably (input_size, hidden_size, num_layers), mirroring the BiRNN
    # class in this notebook; verify against neuralnet.py
    N = neuralnet.BiRNN(200, 128, 2)
    if useCuda:
        N.cuda()

    #criterion = nn.CrossEntropyLoss()
    # Optimise the model being trained here. The original passed the unrelated
    # global `rnn`, so N's weights were never updated.
    optimizer = torch.optim.Adam(N.parameters(), lr=eta)

    for _ in range(nSteps):
        row = np.random.randint(0, len(df))

        vectors = df['abstract_tokenize'].iloc[row]
        if len(vectors) == 0:
            # No token of this abstract is in the Word2Vec vocabulary;
            # np.stack cannot build a sequence from nothing.
            continue

        xVec = Variable(torch.from_numpy(np.stack(vectors)).unsqueeze(0))
        yVec = Variable(torch.from_numpy(df['vals'].iloc[row]))
        if useCuda:
            xVec = xVec.cuda()
            yVec = yVec.cuda()

        optimizer.zero_grad()
        outputs = N(xVec)
        loss = torch.nn.functional.cross_entropy(outputs, yVec)
        loss.backward()
        optimizer.step()
        # Pre-0.4 PyTorch idiom, kept to match the Variable API used in this
        # notebook; on newer PyTorch use loss.item() instead.
        print(loss.data[0])

    return N

In [172]:
trainModel(dfs['little'], dfs['most'])

KeyError: 'torch.FloatTensor'

In [129]:
torch.from_numpy(np.stack(vecs[0])).unsqueeze(0)


( 0 ,.,.) = 
  9.1753e-02  1.5142e-01 -5.8268e-01  ...   6.3124e-01  2.1273e-01 -4.8980e-01
  1.0616e+00  8.1551e-02 -3.5336e-03  ...   5.2598e-01 -1.6134e-01  4.7960e-01
 -1.7612e-01  5.1657e-01  7.1201e-02  ...   1.7485e+00 -8.2418e-02  8.3827e-01
                 ...                   ⋱                   ...                
 -6.4270e-01 -2.1115e+00 -4.4894e-02  ...   1.2405e+00 -7.2301e-01 -3.2357e-01
  1.0127e+00 -1.1702e+00  1.8223e+00  ...   8.2142e-01 -4.4484e-01  3.1254e-01
  7.3804e-01 -4.6319e-01  1.4026e-01  ...  -1.3889e+00  6.9481e-01 -2.1855e+00
[torch.FloatTensor of size 1x235x200]

In [144]:
Variable(torch.from_numpy(np.stack(df['abstract_tokenize'][100])).unsqueeze(0))

Variable containing:
( 0 ,.,.) = 
 -0.6951  0.6775  0.0112  ...   0.3588 -1.0843 -1.9157
  0.3185 -1.2010 -1.8983  ...  -1.0472 -1.3813  1.3460
  1.0616  0.0816 -0.0035  ...   0.5260 -0.1613  0.4796
           ...             ⋱             ...          
 -0.7610 -0.1022  0.9891  ...  -0.0507  0.0101  1.0804
  0.8129 -0.5254  0.4741  ...   1.4561 -1.5147 -1.1715
  0.1559 -0.4288  0.7449  ...   0.5813  0.2296  1.4754
[torch.FloatTensor of size 1x43x200]

In [130]:
v = Variable(torch.from_numpy(np.stack(vecs[0])).unsqueeze(0))

In [131]:
v.data.size()

torch.Size([1, 235, 200])

In [137]:
from sklearn.utils import shuffle


In [142]:
np.random.randint(0, 100)

60

In [135]:
df.append(dfs['some'], ignore_index=True)

Unnamed: 0,id,wos_id,accession_no,issn,issn_int,eissn,isbn,eisbn,art_no,doi,...,pubday,language,source,page_range,page_count,has_abstract,wos_id.1,abstract,title_tokenize,abstract_tokenize
0,7424,WOS:000274469400009,554XK,1436-3240,1436324,,,,,10.1007/s00477-009-0331-1,...,1,English,STOCHASTIC ENVIRONMENTAL RESEARCH AND RISK ASS...,425-444,20,1,WOS:000274469400009,This study aims to model the joint probability...,"[[0.0236096, 0.0032635, -0.0518586, -0.121607,...","[[2.77366, 0.652774, -0.440765, 0.29811, 0.181..."
1,10882,WOS:000279040500008,614FJ,0040-1706,40170,1537-2723,,,,10.1198/TECH.2010.08093,...,1,English,TECHNOMETRICS,231-242,12,1,WOS:000279040500008,Motivated by two industrial experiments in whi...,"[[1.16123, 1.63286, 0.34318, 0.380847, 0.25513...","[[1.11575, 0.482034, -0.0183253, -0.291243, 0...."
2,18436,WOS:000290231500005,759HO,0090-5364,90536,,,,,10.1214/09-AOS786,...,1,English,ANNALS OF STATISTICS,3458-3486,29,1,WOS:000290231500005,We demonstrate that the processes underlying o...,"[[-1.0149, 0.822729, 0.662489, -1.04985, 0.319...","[[-0.695066, 0.677472, 0.0111698, 0.209141, -0..."
3,18572,WOS:000274388800009,553TD,0012-9682,12968,,,,,10.3982/ECTA6131,...,1,English,ECONOMETRICA,285-308,24,1,WOS:000274388800009,This paper studies partnerships that employ a ...,"[[0.750476, 0.41266, 0.283005, -0.132385, -0.4...","[[2.77366, 0.652774, -0.440765, 0.29811, 0.181..."
4,24742,WOS:000280638400002,635EM,1369-7412,1369741,,,,,,...,1,English,JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIE...,405-416,12,1,WOS:000280638400002,I describe the background for the paper 'Contr...,"[[0.406431, -1.19213, -0.848863, -0.782029, 0....","[[0.624336, 0.337605, 0.939134, 0.760823, -0.7..."
5,32712,WOS:000275920200007,573NI,0040-1706,40170,,,,,10.1198/TECH.2009.08134,...,1,English,TECHNOMETRICS,67-79,13,1,WOS:000275920200007,Multivariate receptor modeling is used to esti...,"[[0.16723, -1.33395, -1.38288, -0.0678202, 0.1...","[[0.950022, 0.07348, -0.855757, 0.17769, 0.601..."
6,40209,WOS:000280359400009,631PH,0090-5364,90536,,,,,10.1214/09-AOS769,...,1,English,ANNALS OF STATISTICS,2187-2217,31,1,WOS:000280359400009,The paper considers nonparametric specificatio...,"[[-0.18386, -1.14652, 0.134341, 0.534507, 1.11...","[[1.06158, 0.0815509, -0.0035336, 0.283464, -0..."
7,40939,WOS:000282875200002,663HY,1369-7412,1369741,,,,,10.1111/j.1467-9868.2010.00742.x,...,1,English,JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIE...,609-630,22,1,WOS:000282875200002,Testing the equality of two survival distribut...,"[[0.666772, -0.210443, -0.0184195, 0.323417, 0...","[[0.0917527, 0.151416, -0.58268, -1.28097, -1...."
8,50946,WOS:000276508500002,581GB,0012-9682,12968,,,,,10.3982/ECTA8061,...,1,English,ECONOMETRICA,509-537,29,1,WOS:000276508500002,We examine the labor market effects of incompl...,"[[2.26076, 0.179787, 1.55793, 1.42612, 0.05444...","[[-0.695066, 0.677472, 0.0111698, 0.209141, -0..."
9,58883,WOS:000282402800011,657HC,0090-5364,90536,,,,,10.1214/10-AOS799,...,1,English,ANNALS OF STATISTICS,2916-2957,42,1,WOS:000282402800011,We present a new adaptive kernel density estim...,"[[-0.246178, 0.639709, 0.395433, 0.421159, -0....","[[-0.695066, 0.677472, 0.0111698, 0.209141, -0..."


In [132]:
rnn2(v)

Variable containing:
1.00000e-02 *
  6.0034 -3.0117
[torch.FloatTensor of size 1x2]

In [96]:
rnn2 = BiRNN(200, hidden_size, num_layers)


In [81]:
rnn = BiRNN(input_size, hidden_size, num_layers)
rnn

BiRNN (
  (lstm): LSTM(28, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear (256 -> 2)
)

In [56]:
# Build a zero-padded batch from the first 10 abstracts, longest first
# (pack_padded_sequence is later called on this batch with `lens`).
vecs = sorted(df['abstract_tokenize'][:10], key=len, reverse=True)
maxlen = max(len(v) for v in vecs)
# float32 to match the Word2Vec vectors and the LSTM weights. The np.zeros
# default (float64) would turn the whole batch into a DoubleTensor.
# 200 is the Word2Vec dimensionality (size = 200 in genWord2Vec).
result = np.zeros((len(vecs), maxlen, 200), dtype=np.float32)
lens = []
for i, v in enumerate(vecs):
    unpad = np.stack(v)
    lens.append(unpad.shape[0])
    #print(unpad.shape)
    # Copy the real vectors into the top-left corner; the rest stays zero.
    result[i, :unpad.shape[0],:unpad.shape[1]] = unpad

In [62]:
retT = Variable(torch.from_numpy(result))

In [77]:
#retT

In [66]:
padded = torch.nn.utils.rnn.pack_padded_sequence(retT, lens, batch_first=True)

In [78]:
#rnn(padded)

In [34]:
np.pad(np.stack(vecs[0]), (200, 300), 'constant').shape

(695, 700)

In [41]:
result = np.zeros((maxlen, 200))
unpad = np.stack(vecs[0])

In [42]:
result[:unpad.shape[0],:unpad.shape[1]] = unpad


In [46]:
result.shape

(10, 235, 200)

In [31]:
torch.nn.functional.pad(ten, 300)

NotImplementedError: Only 4D and 5D padding is supported for now

In [24]:
t = torch.zeros(10, maxlen, 200)

In [20]:
for i, v in enumerate(vecs):
    t[i] += torch.from_numpy(np.stack(v))

RuntimeError: inconsistent tensor size at /Users/Reid/Desktop/pytorch/torch/lib/TH/generic/THTensorMath.c:831

In [3]:
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable


# Hyper Parameters
sequence_length = 28
input_size = 28
hidden_size = 128
num_layers = 2
num_classes = 10
batch_size = 100
num_epochs = 2
learning_rate = 0.003

# MNIST Dataset
train_dataset = dsets.MNIST(root='./data/',
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='./data/',
                           train=False, 
                           transform=transforms.ToTensor())

In [42]:
# Fetch one batch from the training loader.
t = next(iter(train_loader))

In [6]:
#t[0].view(-1, sequence_length, input_size)

In [39]:
# Data Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, 
                                          shuffle=False)

In [4]:
import numpy as np
import torch.nn
import torch.nn.functional


import torch
import torch.nn as nn
#import torchvision.datasets as dsets
#import torchvision.transforms as transforms
from torch.autograd import Variable

class BiRNN(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classifier.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Hidden units per direction.
    num_layers : int
        Number of stacked LSTM layers.
    num_classes : int, optional
        Size of the output layer (default 2, the previously hard-coded value).

    The forward pass expects a batch-first input of shape
    (batch, seq_len, input_size) and returns (batch, num_classes) scores.
    """
    #Mostly from pytorch example
    def __init__(self, input_size, hidden_size, num_layers, num_classes = 2):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 2 for bidirection

    def forward(self, x):
        # Initial states are zeros allocated from the input's own tensor type,
        # so they follow x to the GPU and match its dtype. Plain torch.zeros
        # always produced CPU float tensors.
        stateShape = (self.num_layers * 2, x.size(0), self.hidden_size)  # 2 for bidirection
        h0 = Variable(x.data.new(*stateShape).zero_())
        c0 = Variable(x.data.new(*stateShape).zero_())

        # Forward propagate RNN
        out, _ = self.lstm(x, (h0, c0))

        # Decode hidden state of last time step
        # NOTE(review): at the last time step the backward direction has only
        # processed one element of the sequence.
        out = self.fc(out[:, -1, :])
        return out

In [5]:
rnn = BiRNN(input_size, hidden_size, num_layers)

In [54]:
var = Variable(t[0].view(-1, sequence_length, input_size))

RuntimeError: size '[-1 x 29 x 28]' is invalid for input of with 78400 elements at /Users/Reid/Desktop/pytorch/torch/lib/TH/THStorage.c:55

In [52]:
df = dfs['']

Variable containing:
1.00000e-02 *
 -4.8613  2.9809
 -4.6015  3.0765
 -4.5531  3.0600
 -4.7129  2.9618
 -4.9513  3.1830
 -4.4397  3.0848
 -4.5265  3.0862
 -4.5155  3.0755
 -4.5753  3.0152
 -4.6473  3.2008
 -4.4394  3.0768
 -4.5466  3.0591
 -4.3964  3.1822
 -4.4455  3.0576
 -4.4507  2.6725
 -4.6340  3.0555
 -4.6107  3.0134
 -4.3867  3.1557
 -4.5146  3.0258
 -4.3053  3.0793
 -4.4365  3.1178
 -4.5079  2.9617
 -4.4378  2.9990
 -4.4171  3.0907
 -4.3732  3.1572
 -4.6967  3.0112
 -4.5943  3.1336
 -4.5344  2.8000
 -4.6115  3.0256
 -4.7264  3.0784
 -4.5196  3.1054
 -4.2958  3.1021
 -4.2825  3.1628
 -4.5096  3.1085
 -4.3982  3.0940
 -4.5625  3.1059
 -4.8034  3.0256
 -4.5028  3.0618
 -4.3602  3.0877
 -4.3708  3.0836
 -4.5198  2.9863
 -4.6345  3.0136
 -4.5054  3.0775
 -4.5420  3.0830
 -4.5252  3.0393
 -4.5971  3.1681
 -4.3901  3.0506
 -4.3719  3.0904
 -4.5617  3.0351
 -4.3903  3.0761
 -4.3854  3.1220
 -4.4411  3.0520
 -4.6094  3.0832
 -4.6056  3.0931
 -4.6074  3.2343
 -4.4951  3.0924
 -4.7021  2.9

In [30]:
vec = np.stack(df['abstract_tokenize'][0])

In [34]:
var = torch.from_numpy(vec)

In [36]:
var = Variable(var)

In [37]:
RNN(var)

RuntimeError: matrices expected, got 1D, 2D tensors at /Users/Reid/Desktop/pytorch/torch/lib/TH/generic/THTensorMath.c:1232

In [23]:
df['abstract'][0]

"This study aims to model the joint probability distribution of periodic hydrologic data using meta-elliptical copulas. Monthly precipitation data from a gauging station (410120) in Texas, US, was used to illustrate parameter estimation and goodness-of-fit for univariate drought distributions using chi-square test, Kolmogorov-Smirnov test, Cramer-von Mises statistic, Anderson-Darling statistic, modified weighted Watson statistic, and Liao and Shimokawa statistic. Pearson's classical correlation coefficient r (n) , Spearman's rho (n), Kendall's tau, Chi-Plots, and K-Plots were employed to assess the dependence of drought variables. Several meta-elliptical copulas and Gumbel-Hougaard, Ali-Mikhail-Haq, Frank and Clayton copulas were tested to determine the best-fit copula. Based on the root mean square error and the Akaike information criterion, meta-Gaussian and t copulas gave a better fit. A bootstrap version based on Rosenblatt's transformation was employed to test the goodness-of-fit 

In [None]:
dfLittle = pandas.read_csv('data/little.tsv', sep='\t')
dfSome = pandas.read_csv('data/some.tsv', sep='\t')
dfMost = pandas.read_csv('data/mostly.tsv', sep='\t')

In [None]:
def tokenizer(target):
    """Lower-case `target` and word-tokenize it with NLTK (punctuation kept)."""
    lowered = target.lower()
    return nltk.word_tokenize(lowered)

In [None]:
def sentinizer(sent):
    """Lower-case `sent`, split it into sentences and whitespace-split each one.

    If anything fails, the offending input is printed and the exception is
    re-raised unchanged.
    """
    try:
        sentences = nltk.sent_tokenize(sent.lower())
        return list(map(str.split, sentences))
    except:
        # Show which value broke tokenization before propagating the error.
        print(sent)
        raise

In [None]:
def getVocab(df):
    """Collect Word2Vec training sentences from a frame's titles and abstracts.

    Returns a list of token lists: one per title (lower-cased, whitespace
    split) followed by one per abstract sentence (as produced by `sentinizer`).
    An empty frame yields an empty list.
    """
    vocabTitles = [title.lower().split() for title in df['title']]
    # Flatten per-abstract sentence lists explicitly. Series.sum() concatenates
    # lists quadratically and returns 0 instead of a list for an empty Series.
    vocabAbs = [sentence
                for sentences in df['abstract'].apply(sentinizer)
                for sentence in sentences]
    return vocabTitles + vocabAbs

In [None]:
vocabTrain = getVocab(dfLittle) + getVocab(dfSome) + getVocab(dfMost)
model = gensim.models.Word2Vec(vocabTrain, size=200, window=5, min_count=1, workers=8)

In [None]:
nltk.word_tokenize(' the: quick brown')

In [None]:
gensim.models.Word2Vec.load('temp')

In [None]:
model.wv['test']

In [None]:
dfLittle[:10]['abstract'].apply(tokenizer)[0]

In [None]:
# The original line was incomplete (no path, no value) and did not parse.
# Completed to match the os.makedirs calls used in the first cell.
os.makedirs(dataDir, exist_ok=True)

In [None]:
model.save('temp')

In [None]:
model.wv['daks:']

In [None]:
vocabTrain[:1000]

In [None]:
# x.lower must be called; passing the bound method itself to sent_tokenize
# fails because it is not a string.
dfLittle['abstract'].apply(lambda x: nltk.sent_tokenize(x.lower())).sum()

In [None]:
' '.join(vocabTrain[0])

In [None]:
[(i, s) for i, s in enumerate(vocabTrain) if len(s) < 1]

In [None]:
dfMost['abstract']

In [None]:
model.wv['we']

In [None]:
#!/usr/local/bin/python3

import metaknowledge as mk
import numpy as np
import pandas
import gensim
import nltk #For POS tagging
import sklearn #For generating some matrices
import pandas #For DataFrames
import numpy as np #For arrays
import matplotlib.pyplot as plt #For plotting
import seaborn #Makes the plots look nice
import IPython.display #For displaying images

import os #For looking through files
import os.path #For managing file paths
import re
import tarfile

mk.VERBOSE_MODE = False

#w2v = gensim.models.word2vec.Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary = True)

dataDir = 'data'
outputDir = 'outputs'

outputCSV = 'entries.csv'

targetTags = ['title', 'journal', 'keywords', 'abstract', 'id', 'year']

loadData = True

stop_words_nltk = nltk.corpus.stopwords.words('english')
snowball = nltk.stem.snowball.SnowballStemmer('english')

def normalizeTokens(tokenLst, stopwordLst = None, stemmer = None, lemmer = None, vocab = None):
    """Normalize a list of tokens into a cleaned list of words.

    Steps, in order:
      1. keep only purely alphabetic tokens and lower-case them;
      2. stem each word with `stemmer.stem` if a stemmer is given;
      3. lemmatize each word with `lemmer.lemmatize` if a lemmer is given;
      4. drop words contained in `stopwordLst` if given;
      5. keep only words contained in `vocab` if given.

    Parameters
    ----------
    tokenLst : iterable of str
    stopwordLst : iterable of str, optional
    stemmer : object with a `stem(word)` method, optional
    lemmer : object with a `lemmatize(word)` method, optional
    vocab : iterable of str, optional
        Allowed words, matched exactly. The previous implementation joined the
        vocabulary into a regular expression and used `re.match`, which
        matched prefixes ('cat' let 'category' through), interpreted regex
        metacharacters, and let everything through for an empty vocabulary.

    Returns
    -------
    list of str
    """
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the stemmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)

    #And the lemmer
    if lemmer is not None:
        workingIter = (lemmer.lemmatize(w) for w in workingIter)

    #And remove the stopwords (set lookup instead of scanning a list per word)
    if stopwordLst is not None:
        stopwords = frozenset(stopwordLst)
        workingIter = (w for w in workingIter if w not in stopwords)

    #And restrict to the vocabulary, by exact membership
    if vocab is not None:
        allowed = frozenset(vocab)
        workingIter = (w for w in workingIter if w in allowed)

    return list(workingIter)

def trainTestSplit(df, holdBackFraction = .2):
    """Randomly split `df` into a training and a held-back test frame.

    The rows are shuffled with `np.random.permutation`; the first
    int(holdBackFraction * len(df)) shuffled rows become the test set and the
    remainder the training set. Both results are independent copies.

    Returns
    -------
    (train_data, test_data)
    """
    shuffledIndex = np.random.permutation(df.index)
    shuffled = df.reindex(shuffledIndex)
    cut = int(holdBackFraction * len(shuffled))
    heldBack = shuffled[:cut].copy()
    kept = shuffled[cut:].copy()
    return kept, heldBack

def generateVecs(df, sents = False):
    """Add tokenized, normalized and TF-IDF vector columns to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'text' column of strings. Modified in place.
    sents : bool, optional
        If True, also add per-sentence columns 'tokenized_sents' and
        'normalized_sents' (the latter with NLTK English stopwords removed).

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'tokenized_text', 'normalized_text' and 'vect'
        (one dense 1-D TF-IDF array per row) added.
    """
    # `import sklearn` alone does not guarantee that the feature_extraction.text
    # submodule is loaded, so import the vectorizer explicitly.
    from sklearn.feature_extraction.text import TfidfVectorizer

    df['tokenized_text'] = df['text'].apply(lambda x: nltk.word_tokenize(x))
    df['normalized_text'] = df['tokenized_text'].apply(lambda x: normalizeTokens(x))

    if sents:
        df['tokenized_sents'] = df['text'].apply(lambda x: [nltk.word_tokenize(s) for s in nltk.sent_tokenize(x)])
        # Fixed the misspelled call (`normlizeTokens`), which raised NameError
        # whenever sents=True.
        df['normalized_sents'] = df['tokenized_sents'].apply(lambda x: [normalizeTokens(s, stopwordLst = stop_words_nltk, stemmer = None) for s in x])

    ngCountVectorizer = TfidfVectorizer(max_df=0.5, min_df=3, stop_words='english', norm='l2')
    newsgroupsVects = ngCountVectorizer.fit_transform([' '.join(l) for l in df['normalized_text']])
    # toarray() gives a plain ndarray, so each row is already a flat 1-D vector
    # (no detour through np.matrix).
    df['vect'] = list(newsgroupsVects.toarray())

    return df

In [None]:

def main():
    """Build (or reload) the entries table and attach text vectors to it.

    When `loadData` is true, the records under `dataDir` are read with
    metaknowledge, the tags listed in `targetTags` are collected into a
    DataFrame and written to `outputDir/outputCSV`. Otherwise that CSV is read
    back. In both cases the abstract is copied to a 'text' column, rows
    without an abstract are dropped (the tokenizer cannot handle missing
    values) and `generateVecs` is applied.

    Returns
    -------
    pandas.DataFrame
        The vectorized frame (previously computed and then discarded).
    """
    csvPath = '{}/{}'.format(outputDir, outputCSV)
    if loadData:
        RC = mk.RecordCollection(dataDir)
        dfDict = {t : [] for t in targetTags}
        for R in RC:
            for t in targetTags:
                # Missing tags are stored as None so all columns stay aligned.
                dfDict[t].append(R.get(t, None))
        df = pandas.DataFrame(dfDict)
        # Nothing else creates the output directory before to_csv needs it.
        os.makedirs(outputDir, exist_ok = True)
        df.to_csv(csvPath)
    else:
        # index_col=0 restores the index written by to_csv instead of
        # surfacing it as an extra 'Unnamed: 0' column.
        df = pandas.read_csv(csvPath, index_col=0)
    df['text'] = df['abstract']
    df = generateVecs(df.dropna(subset=['text']).copy())
    return df

if __name__ == '__main__':
    main()


In [None]:
df = pandas.read_csv('{}/{}'.format(outputDir, outputCSV))
df['text'] = df['abstract']
df = generateVecs(df.dropna().copy())

In [None]:
df['vect'][0].shape

In [None]:
df.dropna()