In [3]:
import pandas
import torch
import pickle
import training
import utilities
import numpy as np
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
import nltk
import sklearn.metrics.pairwise

%matplotlib inline


dataDir = 'data'
modelsDir = 'models'

rawFname = 'combined.csv'
manualFname = None

w2vFname = 'word2vec.bin'
pickleFname = 'dfPickles.p'
regenW2V = False

eta = 0.001
numEpochs = 50
epochSize = 500

In [2]:
def compareRows(rows, N, useTitle = True, w2v = None):
    fig, axes = plt.subplots(figsize = (20,25),
                             nrows = len(rows) + 1,
                             gridspec_kw = {'height_ratios': [5] * len(rows) + [1]})
    aLst = []
    for i, row in enumerate(rows):
        if 'title_vecs' in row:
            abVec, tiVec, yVec = utilities.varsFromRow(row)
        else:
            abVec, tiVec, yVec = utilities.varsFromRow(row, w2v)
        if useTitle:
            outLSTM, (h_n, c_n) = N.lstmTi(tiVec)
            s = row['title']
        else:
            outLSTM, (h_n, c_n) = N.lstmAb(abVec)
            s = row['abstract']
        out = N(abVec, tiVec)
        probNeg = np.exp(out.data[0][0])
        probPos = np.exp(out.data[0][1])            
        probNeg = probNeg / (probNeg + probPos)
        probPos = probPos / (probNeg + probPos)
        
        a = np.array(outLSTM.data.tolist())
        aLst.append(a[0, -1, :])
        #a = a[:,:100,:]
        df = pandas.DataFrame(a[0, :, :])
        df.index = nltk.word_tokenize(s)[:a.shape[1]]
        seaborn.heatmap(df, ax = axes[i], label='big')
        axes[i].set_title("Article Title: '{}'\n$P_{{negative}} = {:.4f}, P_{{positive}} = {:.4f}$".format(row['title'], probNeg, probPos), fontsize = 20)
        axes[i].set_xticklabels([])
        
    
    dfDiff = pandas.DataFrame(np.stack([aLst[0], np.fabs(aLst[0] - aLst[1]), aLst[1]]))
    dfDiff.index = ['Top', 'Diff', 'Bottom']
    seaborn.heatmap(dfDiff, ax = axes[-1], xticklabels = [i if i in np.linspace(0, len(dfDiff.columns) - 1, num = 10, dtype='int') else '' for i in range(len(dfDiff.columns))])
    axes[-1].set_title('Difference in Final Output Vectors', fontsize = 20)
    
    return fig, axes

In [3]:
def wordDiff(df, N, useTitle = True, w2v = None):
    wDiffs = {}
    for i, (n, row) in enumerate(df.iterrows()):
        print("{:.0f}% Done".format(i / len(df) * 100), end = '\r')
        if 'title_vecs' in row:
            abVec, tiVec, yVec = utilities.varsFromRow(row)
        else:
            abVec, tiVec, yVec = utilities.varsFromRow(row, w2v)
        if useTitle:
            outLSTM, (h_n, c_n) = N.lstmTi(tiVec)
            s = row['title']
        else:
            outLSTM, (h_n, c_n) = N.lstmAb(abVec)
            s = row['abstract']
        if not isinstance(s, str):
            continue
        out = N(abVec, tiVec)
        a = np.array(outLSTM.data.tolist())[0]
        diffs = []
        for i in range(a.shape[0]):
            try:
                dU = 1 - sklearn.metrics.pairwise.cosine_similarity(a[i - 1].reshape(1, -1), a[i].reshape(1, -1))
            except IndexError:
                dU = 0
            try:
                dD = 1 - sklearn.metrics.pairwise.cosine_similarity(a[i + 1].reshape(1, -1), a[i].reshape(1, -1))
                if dU == 0:
                    dU = dD
            except IndexError:
                dD = dU
            diffs.append(np.mean([dU,dD]))
            
        for d, w in zip(diffs, nltk.word_tokenize(s.lower())):
            if w in wDiffs:
                wDiffs[w].append(d)
            else:
                wDiffs[w]= [d]
    return {w : np.mean(d) for w, d in wDiffs.items()}

In [4]:
def wordDiffPlot(row, N, useTitle = True, w2v = None):
    fig, ax = plt.subplots(figsize = (10,5))
    if 'title_vecs' in row:
        abVec, tiVec, yVec = utilities.varsFromRow(row)
    else:
        abVec, tiVec, yVec = utilities.varsFromRow(row, w2v)
    if useTitle:
        outLSTM, (h_n, c_n) = N.lstmTi(tiVec)
        s = row['title']
    else:
        outLSTM, (h_n, c_n) = N.lstmAb(abVec)
        s = row['abstract']
    out = N(abVec, tiVec)
    a = np.array(outLSTM.data.tolist())[0]
    diffs = []
    for i in range(a.shape[0]):
        try:
            dU = 1 - sklearn.metrics.pairwise.cosine_similarity(a[i - 1].reshape(1, -1), a[i].reshape(1, -1))
        except IndexError:
            dU = 0
        try:
            dD = 1 - sklearn.metrics.pairwise.cosine_similarity(a[i + 1].reshape(1, -1), a[i].reshape(1, -1))
            if dU == 0:
                dU = dD
        except IndexError:
            dD = dU
        diffs.append(np.mean([dU,dD]))
    dfDiffs = pandas.DataFrame({'diff' : diffs})
    #dfDiffs['loc'] = dfDiffs.index
    dfDiffs.index = nltk.word_tokenize(s)
    dfDiffs = dfDiffs#[1:-1]
    dfDiffs.plot(ax = ax)
    print("Done      ")
    return dfDiffs

In [5]:
with open("models/BiRNN-2-256-30.pt", 'rb') as f:
    N = torch.load(f)
N.cuda()

BiRNN-2-256-30

In [6]:
df, w2v = utilities.preprocesing(dataDir, rawFname, modelsDir, w2vFname, pickleFname)
dfTrain, dfTest = utilities.getTrainTest(df, dataDir, None, w2v)
df.index = df['eid']

Loading W2V
Loading DF
Generating training and testing sets
Generating word vectors


In [7]:
print('There are {} total records'.format(len(df)))

There are 20307 total records


In [8]:
if len(df) > 2000:
    df = df[df['class'] == 0].sample(1000).append(df[df['class'] == 1])
print('There are {} total records'.format(len(df)))

There are 1491 total records


In [13]:
dfY.to_csv('outputs/withYears.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'outputs/withYears.csv'

In [10]:
regen = False
if regen:
    catsDict = {
        'title' : [],
        'eid' : [],
        'abstract' : [],
        'weightP' : [],
        'weightN' : [],
    }

    tDF = df
    for i, (r_index, row) in enumerate(tDF.iterrows()):
        print("{:.3f}".format(i / len(tDF)), end = '\r')
        try:
            abVec, tiVec, yVec = utilities.varsFromRow(row, w2v)
            out = N(abVec, tiVec)
        except Exception as e:
            print(e)
            print(row['eid'])
            continue
        catsDict['weightN'].append(out.data[0][0])
        catsDict['weightP'].append(out.data[0][1])
        catsDict['title'].append(row['title'])
        catsDict['eid'].append(row['eid'])
        catsDict['abstract'].append(row['abstract'])
        #catsDict['source'].append(row['source'])
    dfY = pandas.DataFrame(catsDict)
    df
else:
    dfY = pandas.read_csv('outputs/withYears.csv', index_col='eid')
#dfY = dfY.drop(float('nan'))
dfY['isCSS'] = dfY['weightP'] > dfY['weightN']
dfY['is CSS'] = dfY['isCSS']

0.999

ValueError: labels [ nan] not contained in axis

In [None]:
dfYearCounts = dfY.groupby(['New software', 'pubyear'])['pubyear'].count()
dfYearCounts = pandas.DataFrame({'True' : dfYearCounts.loc[True], 'False' : dfYearCounts.loc[False],'tot' : df.groupby(['pubyear'])['pubyear'].count()})
dfYearCounts['ratio'] = dfYearCounts['True'] / dfYearCounts['tot']
dfYearCounts[['tot', 'False', 'True', 'ratio']]

In [None]:
fig, ax = plt.subplots(figsize = (10, 5))
x_offset = -0.12

dfPlt = dfY.groupby(['New software', 'pubyear'])['pubyear'].count().unstack('New software')
dfPlt.plot(kind='bar', ax = ax)#, stacked=True)
ax.set_ylabel('Count', fontsize=16)
ax.set_xlabel('Year of Publication', fontsize=16)
for p in ax.patches:
    b = p.get_bbox()
    val = "{:.0f}".format(b.y1 + b.y0)        
    ax.annotate(val, ((b.x0 + b.x1)/2 + x_offset, b.y1 +100))
ax.set_title('Year vs Number of Publications from each Class', fontsize=20)
#plt.savefig('images/countvyear.pdf', format='pdf')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (10, 5))

df2 = dfY.groupby(['New software', 'source'])['source'].count().unstack('New software')
df2 = df2#[df2[True] > 0]
df2.index = ["{}\n{}".format(' '.join(s.split(' ')[:4]), ' '.join(s.split(' ')[4:])) for s in df2.index]
df2.sort_values(by=True,ascending=False).plot(kind='line', ax = ax, colormap= "RdBu")
ax.set_ylabel('Count', fontsize=16)
ax.set_xlabel('Publication', fontsize=16)
#ax.xticks = [s if i % 2 ==0 else '' for i, s in enumerate(df2.index)]
ax.set_title('Publication vs Number from Each Class', fontsize=20)
#plt.savefig('images/countvyear.pdf', format='pdf')
fig.autofmt_xdate()
ax.semilogy()
plt.savefig('images/countvpub.pdf', format='pdf', transparent = False)

plt.show()

In [None]:
tJ = [
    'STATISTICAL METHODS IN MEDICAL RESEARCH',
    'JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES B-STATISTICAL METHODOLOGY',
    'ECONOMETRICA',
    'BRITISH JOURNAL OF MATHEMATICAL & STATISTICAL PSYCHOLOGY',
    'ANNUAL REVIEW OF STATISTICS AND ITS APPLICATION',
    'ANNALS OF STATISTICS',
    'STOCHASTIC ENVIRONMENTAL RESEARCH AND RISK ASSESSMENT',
    'TECHNOMETRICS',
]
dfSourceCounts = dfY.groupby(['isSoftware', 'source']).size()
dfSourceCounts.loc[False].sort_values(ascending=False)[tJ]#.plot()

In [None]:
r1 = df.loc['WOS:000272110900028']
r2 = df.loc['WOS:000280216700014']

In [None]:
dfY[dfY['isSoftware'] == False]

In [None]:

r1 = df.loc['WOS:000365978900001']
r2 = df.loc['WOS:000207446800001']

In [None]:
#row = df.loc['WOS:000272110900028']
print("r1")
print(r1['title'])
print()
print(r1['abstract'])
print(r1['pubyear'])
print("\nr2")
print(r2['title'])
print()
print(r2['abstract'])
print(r2['pubyear'])

In [None]:
compareRows([r1, r2],N, w2v = w2v)
plt.savefig('images/comparisonTitle.pdf', format  = 'pdf', transparent=True)
plt.show()

In [None]:
compareRows([r1, r2],N, useTitle=False, w2v = w2v)
plt.savefig('images/comparisonAbstract.pdf', format  = 'pdf', transparent=True)
plt.show()

In [None]:
dfDiffs= wordDiffPlot(r1, N, useTitle = False, w2v = w2v)
plt.show()
dfDiffs= wordDiffPlot(r2, N, useTitle = False, w2v = w2v)
plt.show()

In [None]:
langs = ['c', 'c++', 'python', 'stata', 'matlab', 'r', 'java', 'mathematica', 'sas', 'spss', 'javascript', 'perl']

In [None]:
idsPython = [w for w, a, t in zip(dfY.index, dfY['abstract'], dfY['title']) if 'python' in (t + ' ' + a).lower()]

In [None]:
df

In [None]:
langCounts = {}
for j, (i, row) in enumerate(df.iterrows()):
    print(j, end = '\r')
    tokens = row['abstract_tokens'] + row['title_tokens']
    try:
        if dfY.loc[i]['isSoftware']:
            for l in langs:
                if l in tokens:
                    try:
                        langCounts[l].append(i)
                    except KeyError:
                        langCounts[l] = [i]
    except KeyError:
        pass

In [None]:
dfC = dfY.loc[langCounts['c']]

In [None]:
index = []
count = []
for k, v in langCounts.items():
    index.append(k)
    count.append(len(v))

In [None]:
dfL = pandas.DataFrame({'count' : count}, index = [i.title() for i in index]).sort_values('count', ascending=False)

In [None]:
print(dfL.to_latex())

In [None]:
texNames = [
    ('Unnamed: 0', 'ID'),
    ('source' , 'Source'),
    ('pubyear' , 'Year of Publications'),
    ('title' , 'Title'),
    ('abstract' , 'Abstract'),
    ]

def rowToTex(row, cutoff = 70):
    print(r"""\begin{figure}[H]
	\begin{tabular}{ll}
		\toprule
		Field & Value\\
		\midrule""")
    for rN, tN in texNames:
        if len(str(row[rN])) < cutoff:
            print('\t\t{} & {} \\\\'.format(tN, row[rN]))
        else:
            s = str(row[rN])
            ts = s.split(' ')
            sOut = ['']
            while len(ts) > 0:
                subT = ts.pop(0)
                if len(sOut[-1] + ' ' + subT) < cutoff:
                    sOut[-1] += ' ' + subT
                else:
                    sOut.append(subT)
            print('\t\t{} & {} \\\\'.format(tN, '\\\\\n\t\t&'.join(sOut)))
    print(r"""		\bottomrule
	\end{tabular}
\end{figure}""")

In [None]:
len(w2v.wv.vocab)


In [None]:
rExample = dfY.loc['WOS:000341806800001']
rowToTex(rExample)

In [None]:
reversed

In [None]:
{k : len(v) for k, v in langCounts.items()}

In [None]:
dfY[dfY['source'] == 'JOURNAL OF STATISTICAL SOFTWARE'][dfY['isSoftware'] == False]

In [None]:
dfY.loc['WOS:000292681800006']['title']

In [None]:
wD = wordDiff(df[:100], N, useTitle = False, w2v = w2v)
indices = []
vals = []
for k, v in wD.items():
    indices.append(k)
    vals.append(v)

dfDiffs = pandas.DataFrame({'diff' : vals})
dfDiffs.index = indices
dfDiffs.sort_values('diff',ascending=False)

In [None]:
dfY

In [None]:
fig, ax = plt.subplots()
dfY.plot.scatter('weightP', 'weightN', ax = ax)
ax.set_ylabel('$log(P_{Negative})$')
ax.set_xlabel('$log(P_{Positive})$')
ax.set_title('Output log Probability of Negative vs Positive')
plt.savefig('images/weight.pdf', format = 'pdf')
plt.show()