In [3]:
#!/usr/local/bin/python3

import metaknowledge as mk
import numpy as np
import pandas
import gensim
import nltk #For POS tagging
import sklearn #For generating some matrices
import pandas #For DataFrames
import numpy as np #For arrays
import matplotlib.pyplot as plt #For plotting
import seaborn #Makes the plots look nice
import IPython.display #For displaying images

import os #For looking through files
import os.path #For managing file paths
import re
import tarfile

mk.VERBOSE_MODE = False

#w2v = gensim.models.word2vec.Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary = True)

dataDir = 'data'
outputDir = 'outputs'

outputCSV = 'entries.csv'

targetTags = ['title', 'journal', 'keywords', 'abstract', 'id', 'year']

loadData = True

stop_words_nltk = nltk.corpus.stopwords.words('english')
snowball = nltk.stem.snowball.SnowballStemmer('english')

def normalizeTokens(tokenLst, stopwordLst = None, stemmer = None, lemmer = None, vocab = None):
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the semmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)

    #And the lemmer
    if lemmer is not None:
        workingIter = (lemmer.lemmatize(w) for w in workingIter)

    #And remove the stopwords
    if stopwordLst is not None:
        workingIter = (w for w in workingIter if w not in stopwordLst)

    #We will return a list with the stopwords removed
    if vocab is not None:
        vocab_str = '|'.join(vocab)
        workingIter = (w for w in workingIter if re.match(vocab_str, w))

    return list(workingIter)

def trainTestSplit(df, holdBackFraction = .2):
    df = df.reindex(np.random.permutation(df.index))
    holdBackIndex = int(holdBackFraction * len(df))
    train_data = df[holdBackIndex:].copy()
    test_data = df[:holdBackIndex].copy()

    return train_data, test_data

def generateVecs(df, sents = False):
    df['tokenized_text'] = df['text'].apply(lambda x: nltk.word_tokenize(x))
    df['normalized_text'] = df['tokenized_text'].apply(lambda x: normalizeTokens(x))

    if sents:
        df['tokenized_sents'] = df['text'].apply(lambda x: [nltk.word_tokenize(s) for s in nltk.sent_tokenize(x)])
        df['normalized_sents'] = df['tokenized_sents'].apply(lambda x: [normlizeTokens(s, stopwordLst = stop_words_nltk, stemmer = None) for s in x])

    ngCountVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(max_df=0.5, min_df=3, stop_words='english', norm='l2')
    newsgroupsVects = ngCountVectorizer.fit_transform([' '.join(l) for l in df['normalized_text']])
    df['vect'] = [np.array(v).flatten() for v in newsgroupsVects.todense()]

    return df

In [4]:

def main():
    if loadData:
        RC = mk.RecordCollection(dataDir)
        dfDict = {t : [] for t in targetTags}
        for R in RC:
            for t in targetTags:
                dfDict[t].append(R.get(t, None))
        df = pandas.DataFrame(dfDict)
        df.to_csv('{}/{}'.format(outputDir, outputCSV))
    else:
        df = pandas.read_csv('{}/{}'.format(outputDir, outputCSV))
    df['text'] = df['abstract']
    df = generateVecs(df)

if __name__ == '__main__':
    main()


TypeError: expected string or bytes-like object

In [10]:
df = pandas.read_csv('{}/{}'.format(outputDir, outputCSV))
df['text'] = df['abstract']
df = generateVecs(df.dropna().copy())

In [14]:
df['vect'][0].shape

(3831,)

In [7]:
df.dropna()

Unnamed: 0.1,Unnamed: 0,abstract,id,journal,keywords,title,year,text
0,0,We consider discrete-time observations of a co...,WOS:000291183300004,ANNALS OF STATISTICS,"['NONPARAMETRIC REGRESSION', 'MICROSTRUCTURE N...",ASYMPTOTIC EQUIVALENCE FOR INFERENCE ON THE VO...,2011,We consider discrete-time observations of a co...
1,1,"Many statistical analyses (e. g., in econometr...",WOS:000252431400001,JOURNAL OF STATISTICAL SOFTWARE,"['3-STAGE LEAST-SQUARES', 'REGRESSION EQUATION...",systemfit: A package for estimating systems of...,2007,"Many statistical analyses (e. g., in econometr..."
2,2,"It has been recently shown that, under the mar...",WOS:000248987600006,ANNALS OF STATISTICS,"['RISK BOUNDS', 'CONVERGENCE', 'CLASSIFICATION']",Fast learning rates for plug-in classifiers,2007,"It has been recently shown that, under the mar..."
3,3,Reference analysis produces objective Bayesian...,WOS:000265500500013,ANNALS OF STATISTICS,"['POSTERIOR DISTRIBUTIONS', 'MATHEMATICAL-THEO...",THE FORMAL DEFINITION OF REFERENCE PRIORS,2009,Reference analysis produces objective Bayesian...
4,4,We introduce the C++ application and R package...,WOS:000399022900001,JOURNAL OF STATISTICAL SOFTWARE,"['MACHINE LEARNING-METHODS', 'PROBABILITY ESTI...",ranger: A Fast Implementation of Random Forest...,2017,We introduce the C++ application and R package...
5,5,We introduce a pathwise algorithm for the Cox ...,WOS:000288204000001,JOURNAL OF STATISTICAL SOFTWARE,"['GENERALIZED LINEAR-MODELS', 'GENE-EXPRESSION...",Regularization Paths for Cox's Proportional Ha...,2011,We introduce a pathwise algorithm for the Cox ...
6,6,Semisupervised methods are techniques for usin...,WOS:000320488200013,ANNALS OF STATISTICS,['SAMPLES'],DENSITY-SENSITIVE SEMISUPERVISED INFERENCE,2013,Semisupervised methods are techniques for usin...
7,7,This paper describes the core features of the ...,WOS:000235180600001,JOURNAL OF STATISTICAL SOFTWARE,"['LINEAR-MODELS', 'REGRESSION-MODELS', 'INFERE...",The R Package geepack for Generalized Estimati...,2006,This paper describes the core features of the ...
8,8,Matrix completion and quantum tomography are t...,WOS:000327746100007,ANNALS OF STATISTICS,"['LOW-RANK MATRICES', 'HIGH-DIMENSIONAL MATRIC...",ASYMPTOTIC EQUIVALENCE OF QUANTUM STATE TOMOGR...,2013,Matrix completion and quantum tomography are t...
9,9,In a multiple testing problem where one is wil...,WOS:000253390000013,ANNALS OF STATISTICS,"['FAMILYWISE ERROR RATE', 'FALSE DISCOVERY RAT...",Generalizing Simes' test and Hochberg's stepup...,2008,In a multiple testing problem where one is wil...
