# Online Resource 3 & 4
Filtering, vectorization, further filtering, umap-embedding, clustering and generation of the cluster-reports in markdown-format. Markdown-files can be concatenated and converted to .pdf using pandoc. 

In [1]:
import metaknowledge as mk
import pandas as pd
import numpy as np
from random import randint
import datetime

from scipy.sparse import coo_matrix, vstack
from scipy.sparse import csr_matrix
import scipy as scipy

%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

#For Tables:
from IPython.display import display
from IPython.display import Latex
pd.set_option('display.max_columns', 500)
import json

#For R (ggplot2)
%load_ext rpy2.ipython

# from sklearn.externals.joblib import Memory
# memory = Memory(cachedir='/tmp', verbose=0)
# @memory.cache

import gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
from joblib import Parallel, delayed
import multiprocessing
#Embedding:
import umap
#Clustering:
import hdbscan

from itertools import count

#set up dictionary for survey:
import json

## Load the WOS-data

In [39]:
date_string = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M")

RC = mk.RecordCollection("wos_query.txt")

In [25]:
RC2 = mk.RecordCollection()

for R in RC:
#     print(R)

    try:
        R['year']
        if R['Z9']>=4:
            RC2.add(R) #Here we keep only records cited more then three times.
    except KeyError:
        pass


print(RC2.glimpse())
RC = RC2

RecordCollection glimpse made at: 2019-06-15 22:46:51
89467 Records from Empty

Top Authors
1 ANNAS, GJ
2 Savulescu, Julian
3 SHELAH, S
4 HINTIKKA, J
5 SOBER, E
5 LEWIS, D
6 Shelah, S
7 JACKSON, F
8 HACKING, I
8 Miller, Franklin G.
9 KITCHER, P
9 Harris, J
9 PETTIT, P
10 LOWE, EJ
11 NATSOULAS, T
11 Savulescu, J
12 Resnik, David B.
12 Walton, Douglas
12 Machery, Edouard
13 Nanay, Bence
13 VANFRAASSEN, BC

Top Journals
1 JOURNAL OF BUSINESS ETHICS
2 JOURNAL OF MEDICAL ETHICS
3 SYNTHESE
4 JOURNAL OF SYMBOLIC LOGIC
5 PHILOSOPHICAL STUDIES
6 PHILOSOPHY OF SCIENCE
7 CONSCIOUSNESS AND COGNITION
8 HASTINGS CENTER REPORT
9 JOURNAL OF PHILOSOPHY
10 PHILOSOPHY AND PHENOMENOLOGICAL RESEARCH
11 JOURNAL OF THE HISTORY OF IDEAS
12 AMERICAN JOURNAL OF BIOETHICS
13 ETHICS
14 BRITISH JOURNAL FOR THE PHILOSOPHY OF SCIENCE
15 MIND
16 ANALYSIS
17 CRITICAL INQUIRY
18 NOUS
19 SOCIAL RESEARCH
20 JOURNAL OF MEDICINE AND PHILOSOPHY
21 NURSING ETHICS

Top Cited
1 Rawls J, 1971, THEORY JUSTICE
2 Parfit D., 1984, 

## Extracting the Features

In [8]:
########### Cited Works - Features ############


drc = pd.DataFrame.from_dict(RC.forNLP(extraColumns=['journal','AU','FU','PD']))

def processInput(R):
    d = list(set(R.getCitations().get("citeString")))
    citedAU = list(set(R.getCitations().get("author")))
    return d, citedAU
 
num_cores = multiprocessing.cpu_count()


results = Parallel(n_jobs=num_cores)(delayed(processInput)(R) for R in RC)


d, citedAU = map(list, zip(*results))


drc["citedAU"] = citedAU
drc["citestring"] = d
authorslist = ['§'.join(filter(None,x)) for x in list(d)] 
vec = CountVectorizer(token_pattern=r'(?<=[§])[\s\w,\.:;\/\[\]-]+(?=[§])',binary=True, min_df = 3)


Xrc = vec.fit_transform(authorslist)

In [9]:
k = [Xrc]#XrcAu,
XrcFull = scipy.sparse.hstack(k).tocsr()

## Filtering papers to ensure connectedness

In [10]:
row_names = np.array(drc["id"])
num_cores = multiprocessing.cpu_count()

def filtersparse(x):
    row_idx, = np.where(row_names == drc["id"][x])
    if np.diff(XrcFull[row_idx].tocsr().indptr) >= 3:
        k = [XrcFull[row_idx]]
        newdf = drc.loc[x]
        return k, newdf
    


results = Parallel(n_jobs=1)(delayed(filtersparse)(x) for x in range(0,XrcFull.shape[0]))



In [11]:
results = [x for x in results if x is not None]
k, newdf = [e[0][0] for e in results], [e[1] for e in results]

In [1]:
filtered_out = drc[~drc['id'].isin(pd.DataFrame(newdf)['id'])]
filtered_out.to_csv('filtered_out.csv') #saving the dropped records for analysis

In [2]:
drc = pd.DataFrame(newdf).reset_index()
M = scipy.sparse.vstack(k)

## Conduct SVG

In [27]:
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components=400, n_iter=7, random_state=42)

XSVD = SVD.fit_transform(M)
print(SVD.explained_variance_ratio_.sum())
dSVD = pd.DataFrame(XSVD)

sSVD = dSVD[[0,1]]
sSVD.columns = ['x','y']

0.11232364652506363


# [UMAP](https://github.com/lmcinnes/umap)-Embedding

In [33]:
embedding = umap.UMAP(n_neighbors = 50,#small => local, large => global: 5-50
                      min_dist = 0.17, #small => local, large => global: 0.001-0.5
                      spread = 1.7,
                      random_state = 42,
                      metric='cosine').fit_transform(XSVD)
embedding = pd.DataFrame(embedding)
embedding.columns = ['x','y']


In [38]:

embeddingL = umap.UMAP(n_components=30,
                        n_neighbors=50,
                      min_dist=0,
                        random_state = 42,
                      metric='cosine').fit(M).embedding_




## Clustering with HDBSCAN

In [3]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=300, min_samples=45, gen_min_span_tree=True)
clusterer.fit(embeddingL)
XCLUST = clusterer.labels_
clusternum = len(set( clusterer.labels_))-1

dfclust = pd.DataFrame(XCLUST)
dfclust.columns = ['cluster']

print(clusternum)

In [17]:
import pickle

with open('hdb.pickle', 'wb') as f:
    pickle.dump(clusterer, f) # save for plotting

## Generate the clustering-report

In [19]:

drc = pd.concat([drc, dfclust],axis=1)
drc = drc.dropna(subset=['cluster'])
drc = pd.concat([drc, embedding],axis=1)


In [20]:
drc.to_csv("drcend.csv")

drc = pd.read_csv("drcend.csv")

In [21]:
drc['text'] = drc['title'].fillna('') + drc['abstract'].fillna('') + ' ' + drc['keywords'].fillna('').str.replace('|', ' ')

In [23]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

# This uses code from here: https://stackoverflow.com/questions/49564176/python-nltk-more-efficient-way-to-extract-noun-phrases
# Defining a grammar & Parser
NP = "NP: {(<JJ\w+>|<NN\w?>)+.*<NN\w?>}" # Extract Noun-Phrases
chunker = RegexpParser(NP)
stopWords = ['article','paper','essay','i','elsevier','inc']
def get_continuous_chunks(text, chunk_func=ne_chunk):
    words=word_tokenize(text)
    wordsFiltered = []
    for w in words:
        if w not in stopWords:
            wordsFiltered.append(w)
        
    chunked = chunk_func(pos_tag(wordsFiltered))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([lemmatizer.lemmatize(token) for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(lemmatizer.lemmatize(named_entity))
                current_chunk = []
        else:
            continue
    return continuous_chunk


drc['nplemma'] = drc['text'].replace(np.nan, '', regex=True).apply(lambda sent: get_continuous_chunks(sent, chunker.parse))

In [24]:
drc['npstring'] = [';'.join(map(str, l)) for l in drc['nplemma']]


In [25]:
vec = CountVectorizer(token_pattern=r'[^;]+')
X = vec.fit_transform(drc['npstring'])
display(pd.DataFrame(X.sum(axis=0),columns=vec.get_feature_names()).sum().sort_values(ascending=False))


case study                             707
business ethic                         634
health care                            442
decision making                        440
responsibility csr                     293
quantum mechanic                       183
climate change                         162
virtue ethic                           162
research ethic                         161
higher level                           156
better understanding                   142
research participant                   142
stakeholder theory                     138
higher education                       131
research project                       126
health professional                    126
decision maker                         117
business practice                      117
corporate social responsibility        114
research program                       111
research study                         103
best interest                          102
research question                      101
ethic commi

In [26]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
stopWords = ['article','paper','essay','i','elsevier','inc','verlag','gmbh','springer']

def get_continuous_chunks(text, chunk_func=ne_chunk):
    chunked = word_tokenize(text)
    tokens = []
    for token in chunked:
        tokens.append(lemmatizer.lemmatize(token))
    words = [word for word,tag in pos_tag(chunked) if tag == 'NN']
    wordsFiltered = []
    for w in words:
        if w not in stopWords:
            wordsFiltered.append(w)
    return wordsFiltered

drc['wordlemma'] = drc['text'].replace(np.nan, '', regex=True).apply(lambda sent: get_continuous_chunks(sent, chunker.parse))
drc['wordlemmastring'] = [';'.join(map(str, l)) for l in drc['wordlemma']]

In [27]:
vec = CountVectorizer(token_pattern=r'[^;]+')
X = vec.fit_transform(drc['wordlemmastring'])

In [28]:
display(pd.DataFrame(X.sum(axis=0),columns=vec.get_feature_names()).sum().sort_values(ascending=False))

research                 15312
theory                   14070
study                     9364
analysis                  7628
approach                  6351
role                      6345
model                     6342
view                      6180
health                    6060
account                   6004
work                      5788
knowledge                 5711
science                   5616
care                      5524
information               5479
business                  5371
way                       5360
problem                   5234
argument                  5185
case                      5160
practice                  5146
nature                    5019
concept                   4856
time                      4777
development               4755
philosophy                4611
evidence                  4576
use                       4380
process                   4374
context                   4295
                         ...  
mountcastle                  1
movement

In [29]:
fullstrsl = []
for x in range(0,clusternum):
    abstracts = list(drc.loc[drc['cluster'] == x]['wordlemma'])
    abstracts = ";".join(';'.join(x) for x in abstracts)
    fullstrsl.append(abstracts)
    
vec = TfidfVectorizer(token_pattern=r'[^;]+')
X = vec.fit_transform(fullstrsl)
# display(pd.DataFrame(X.toarray(), columns=vec.get_feature_names()))

clusterfeatures = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
fullscore = []

surveypages = []
for x in range(0,clusternum):
    scores = zip(vec.get_feature_names(), np.asarray(X[x,:].sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    myscores = sorted_scores[0:20]

    with open("report/mds/Cluster_Nr.%03i.md" %x, "w") as text_file:

        text_file.write('# Cluster ' + str(x) +'\n')

        text_file.write('![](img/Cluster_Nr_'+str(x)+'.png){width=100%}\n\n')
        text_file.write('![](img_time/Cluster_Nr_'+str(x)+'.png){width=100%}\n\n')

        text_file.write('## Nouns \n')
        text_file.write('\\colorlet{mytextcolor}{black}\n\n')
        grams = [i[0] for i in myscores]
        occs = [i[1] for i in myscores]
        k = []
        for it in range(0,len(grams)):
            k.append('\\textcolor{mytextcolor!'+str((occs[it]/(max(occs)/100))*0.5+50)+'}{'+str(grams[it])+'}')
        text_file.write(' \\textbullet{} '.join(k))
        text_file.write('\n')
        
        
        
    page = {}
    page["name"] = "page"+str(x)
    elements = []
    elements.append({"type": "html","name": "nouns","html": "<h2> Nouns </h2>"})

    spans = []
    for it in range(0,len(grams)):
        spans.append('<span style=\"opacity:'+str(((occs[it]/(max(occs)/100))*0.01)*0.6+0.4)+';\">'+str(grams[it])+'</span>')
        
    elements.append({"type": "html","name": "nouns","html": str(' &#9679; '.join(spans))})

    page["elements"] = elements
    #Build json for survey;
    surveypages.append(page)
  
    
    scorelist = []
    for s in myscores:
        scorelist.append(s[0])
    fullscore.append(scorelist)

In [30]:
fullstrsl = []
for x in range(0,clusternum):
    abstracts = list(drc.loc[drc['cluster'] == x]['nplemma'])
    abstracts = ";".join(';'.join(x) for x in abstracts)
    fullstrsl.append(abstracts)
    
vec = TfidfVectorizer(token_pattern=r'[^;]+')
X = vec.fit_transform(fullstrsl)
# display(pd.DataFrame(X.toarray(), columns=vec.get_feature_names()))

clusterfeatures = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
fullscore = []

surveypages = []
for x in range(0,clusternum):
    scores = zip(vec.get_feature_names(), np.asarray(X[x,:].sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    myscores = sorted_scores[0:20]
    
    with open("report/mds/Cluster_Nr.%03i.md" %x, "a") as text_file:
        text_file.write('\n## Noun phrases \n')
        text_file.write('\\colorlet{mytextcolor}{black}\n\n')
        grams = [i[0] for i in myscores]
        occs = [i[1] for i in myscores]
        k = []
        for it in range(0,len(grams)):
            k.append('\\textcolor{mytextcolor!'+str((occs[it]/(max(occs)/100))*0.5+50)+'}{'+str(grams[it])+'}')
#             k.append('\\textcolor{mytextcolor!'+str(1)+'}{'+str(grams[it])+'}')

        text_file.write(' \\textbullet{} '.join(k))
        text_file.write('\n')
        
        
        
    page = {}
    page["name"] = "page"+str(x)
    elements = []
    elements.append({"type": "html","name": "Noun phrases","html": "<h2> Unigrams </h2>"})

    spans = []
    for it in range(0,len(grams)):
        spans.append('<span style=\"opacity:'+str(((occs[it]/(max(occs)/100))*0.01)*0.6+0.4)+';\">'+str(grams[it])+'</span>')
        
    elements.append({"type": "html","name": "unigrams","html": str(' &#9679; '.join(spans))})

    page["elements"] = elements
    surveypages.append(page)
  
    
    scorelist = []
    for s in myscores:
        scorelist.append(s[0])
    fullscore.append(scorelist)




In [33]:
dicts = {}
keys = range(clusternum)
for i in keys:
        dicts[i] = mk.RecordCollection()

for R in RC:
    try: 
        dicts[drc.loc[drc['id'] == R['id']]['cluster'].values[0]].add(R)
    except Exception:
        pass

In [38]:
import slugify
for x in keys:
    dicts[x].writeFile("clusters_filtered/"+ (str(x))[0:100] +".txt")

In [34]:
for x in keys:

    #JOURNALS
    ranked_journals = dicts.get(x).rankedSeries('journal', giveCounts = True, giveRanks = False, pandasMode = False)
    with open("report/mds/Cluster_Nr.%03i.md" %x, "a") as text_file:
        text_file.write('\n## Journals \n')

        grams = [i[0] for i in ranked_journals[0:5]]
        occs = [i[1] for i in ranked_journals[0:5]]
        k = []
        for it in range(0,len(grams)):
            k.append('\\textcolor{mytextcolor!'+str((occs[it]/(max(occs)/100))*0.5+50)+'}{'+str(grams[it]).lower().title()+'}')
        text_file.write(' \\textbullet{} '.join(k))
        text_file.write('\n')
        
    # Write survey:
    
    surveypages[x]['elements'].append({"type": "html","name": "Journals","html": "<h2> Journals </h2>"})
    spans = []
    for it in range(0,len(grams)):
        spans.append('<span style=\"opacity:'+str(((occs[it]/(max(occs)/100))*0.01)*0.6+0.4)+';\">'+str(grams[it]).lower().title()+'</span>')
        
    surveypages[x]['elements'].append({"type": "html","name": "journals","html": str(' &#9679; '.join(spans))})

    # CITED RECORDS:
    ranked_citations = dicts.get(x).rankedSeries('CR', giveCounts = True, giveRanks = False, pandasMode = False)
    with open("report/mds/Cluster_Nr.%03i.md" %x, "a") as text_file:
        text_file.write('\n## Most Cited Works \n')

        grams = [i[0] for i in ranked_citations[0:15]]
        occs = [i[1] for i in ranked_citations[0:15]]
        k = []
        for it in range(0,len(grams)):
            k.append('\\textcolor{mytextcolor!'+str((occs[it]/(max(occs)/100))*0.5+50)+'}{'+str(grams[it]).lower().title()
                     .replace('_','\_')+'}')
        text_file.write(' \\textbullet{} '.join(k))
        text_file.write('\n')
    
        # Write survey:
    
    surveypages[x]['elements'].append({"type": "html","name": "works","html": "<h2> Most cited works </h2> <p> You can click on underlined titles to find out what they are!</p>"})
    spans = []
    for it in range(0,len(grams)):
        if getattr(grams[it], 'DOI'):
#             print(getattr(grams[it], 'DOI'))
            spans.append('<a href=\"https://doi.org/'+str(getattr(grams[it], 'DOI')).replace('DOI:','').replace('DOI','').replace(' ','').replace(':','\:')+
                         '\" target=\"_blank\"><span style=\"opacity:'+str(((occs[it]/(max(occs)/100))*0.01)*0.6+0.4)+';\">'+
                         str(getattr(grams[it],'author')).lower().title()+', '+str(getattr(grams[it],'year'))+'</span></a>')
        else:
            spans.append('<span style=\"opacity:'+str(((occs[it]/(max(occs)/100))*0.01)*0.6+0.4)+';\">'+str(getattr(grams[it],'author')).lower().title()+', '+str(getattr(grams[it],'year'))+'</span>')

    surveypages[x]['elements'].append({"type": "html","name": "journals","html": str(' &#9679; '.join(spans))})
    
    surveypages[x]['elements'].append({"type": "text","name": str("cluster"+str(x)),"title": "Please propose one or several names for this cluster:","placeHolder": ""})
    surveypages[x]['elements'].append({"type": "rating","name": str("certitude"+str(x)),"title": "How certain are you in your choice?",
                                       "minRateDescription": "Very uncertain", "maxRateDescription": "Completely certain"})

    surveypages[x]['elements'].append({"type": "radiogroup","name": "exit1","title": "Do you need to end the survey?",
                    "choices": ["Yes"],"colCount": 0})   
    
# Set up survey:
survey = {}
survey["triggers"] = [{"type": "complete","name": "exit1","operator": "equal","value": "Yes"}, {"type": "complete","name": "exit2","operator": "equal","value": "Yes"}]
    
survey["firstPageIsStarted"] = True
survey["startSurveyText"] = "Start Survey"
#add landing-page:
surveypages = [{"name": "landing","elements": [{"type": "html","name": "landing","html": "<h2>A survey on the econ literature </h2> In this survey we will ask you to identify some clusters of the economic literature."}]}] + surveypages
survey["pages"] = surveypages
# print(json.dumps(survey))
survey = json.dumps(survey)
parsed = json.loads(survey)
with open('survey/survey.json', 'w') as outfile:
    json.dump(parsed, outfile, indent=4)