In [None]:
import scholarly as gs
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm
import os.path
from wordcloud import WordCloud
%matplotlib inline
import matplotlib.pyplot as plt

## Clouds

In [None]:
# reading labs: load the lab table from 'labs_prof_filtered.csv'. The cloud
# cell below unpacks every row into three fields (short, name, profs) and the
# later cells read its 'Professors' column.
inp_filtered = pd.read_csv('labs_prof_filtered.csv')

In [None]:
def show_cloud(text, title = ''):
    """Render a word cloud of `text`, save it as a PNG and display it.

    The image is written to 'cloud_<short>.png', where <short> is the part of
    `title` before the first '(' with surrounding whitespace stripped. If that
    file already exists the function returns immediately without rendering,
    so clouds saved by an earlier run are not regenerated.

    Parameters
    ----------
    text : str
        Text handed to ``WordCloud.generate``.
    title : str, optional
        Figure title; ' len=<number of characters in text>' is appended to it.

    Returns
    -------
    None
    """
    short_title = title.split('(')[0].strip()
    path = 'cloud_' + short_title + '.png'
    if os.path.isfile(path): return
    wordcloud = WordCloud(max_words=1000, width=1600, height=800).generate(text)
    fig = plt.figure(figsize = (20, 10))
    try:
        plt.title(title + ' len=' + str(len(text)))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.savefig(path, bbox_inches = 'tight')
        plt.show()
    finally:
        # This function is called once per lab in a loop; release the large
        # figure explicitly instead of relying on the backend to do it.
        plt.close(fig)

In [None]:
# Build one word cloud per lab from the paper titles of its professors and
# collect every title in `all_titles` for the combined cloud.
#
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'author-*.pkl.filled' files that this notebook wrote itself.
all_title_parts = []
# rows of the lab table as plain tuples, one per lab
labs = list(zip(*[inp_filtered[x] for x in list(inp_filtered)]))
# running professor index across all labs; selects 'author-<i>.pkl.filled'
i = 0
# out.txt is still created/truncated as before, although the per-paper dump
# that would fill it is disabled (see the commented-out write below). The
# `with` block closes it even if the loop raises.
with open('out.txt', 'w') as f:
    for lab in labs:
        lab_title_parts = []
        lab_pubs = 0
        lab_cits = 0
        short, name, profs = lab
        profs = [x.strip() for x in profs.split(',')]
        for prof in profs:
            filename = 'author-' + str(i) + '.pkl.filled'
            i += 1
            if not os.path.isfile(filename): continue
            # close each pickle right after reading instead of leaking the handle
            with open(filename, 'rb') as author_file:
                author = pickle.load(author_file)
            for paper in author.publications:
                try:
                    title = paper.bib['title']
                    #year = paper.bib['year']
                    citedby = paper.citedby
                    # the title is repeated once more for every full 1000
                    # citations (presumably to weight highly cited papers)
                    titles = (title.lower() + ' ') * (1 + (citedby // 1000))
                except Exception:
                    # Best effort: skip papers whose title or citation count
                    # cannot be read. Unlike a bare `except`, this still lets
                    # KeyboardInterrupt stop the cell.
                    continue
                lab_title_parts.append(titles)
                all_title_parts.append(titles)
                lab_pubs += 1
                lab_cits += citedby
                #f.write('%s\n' % ', '.join([str(x) for x in [short, prof, title.lower(), year, citedby]]))
        lab_titles = ''.join(lab_title_parts)
        if len(lab_titles) > 0:
            show_cloud(lab_titles, title = short + ' (' + name + ')' + ' profs=' + str(profs) + ' cits=' + str(lab_cits) + ' mean=' + str(round(lab_cits / lab_pubs)) + ' papers=' + str(lab_pubs))
# joined once at the end instead of growing a string paper by paper
all_titles = ''.join(all_title_parts)

In [None]:
# One cloud over the titles of all labs combined. With this title show_cloud
# writes 'cloud_All.png' and does nothing if that file already exists.
show_cloud(all_titles, title = 'All (All labs)')

In [None]:
# list of all profs: every 'Professors' cell is a comma-separated list of
# names; flatten them into one list in table order. The position of a name in
# this list is the <i> of its 'author-<i>.pkl.filled' file in the next cell.
# The column is read directly because DataFrame.get_value was removed in
# pandas 1.0.
profs = [
    name.strip()
    for cell in inp_filtered['Professors']
    for name in cell.split(',')
]

In [None]:
# Print every filled author record that exists on disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that this notebook wrote itself.
for i, prof in enumerate(tqdm(profs)):
    filename = 'author-' + str(i) + '.pkl.filled'
    if not os.path.isfile(filename): continue
    # close the file right after reading instead of leaking one handle per author
    with open(filename, 'rb') as author_file:
        author = pickle.load(author_file)
    print(author)

## Filling authors

In [None]:
# reading labs: load the lab table from 'labs_prof_filtered.csv'; the cells of
# this section read its 'Professors' column
inp_filtered = pd.read_csv('labs_prof_filtered.csv')

In [None]:
# preview the first rows of the filtered lab table
inp_filtered.head()

In [None]:
# list of all profs: split every comma-separated 'Professors' cell and flatten
# the names into one list in table order. The loops below number the
# 'author-<i>.pkl' files by position in this list.
# The column is read directly because DataFrame.get_value was removed in
# pandas 1.0.
profs = [
    name.strip()
    for cell in inp_filtered['Professors']
    for name in cell.split(',')
]

In [None]:
### saving one author to a file
# For every professor that has no 'author-<i>.pkl' yet, run
# gs.search_author on '<name> epfl' and pickle the result when the search
# returns exactly one author.
for i, prof in enumerate(tqdm(profs)):
    filename = 'author-' + str(i) + '.pkl'
    if os.path.isfile(filename):
        # already saved by an earlier run
        continue
    author = list(gs.search_author(prof + ' epfl'))
    if len(author) == 1:
        # no ambiguity in author search: keep the single hit
        author = author[0]
        with open(filename, 'wb') as f:
            f.write(pickle.dumps(author))
    else:
        print('ERROR PROCESSING ' + prof + ' len=' + str(len(author)))

In [None]:
# remove second names: name as listed in the lab table -> name used as the
# search query in the next cell
mapping1 = {
    'Bryan Alexander Ford': 'Bryan Ford',
    'Pearl Pu Faltings': 'Pearl Pu',
    'Michael C. Gastpar': 'Michael Gastpar',
    'Ola Nils Anders Svensson': 'Ola Svensson',
    'François Fleuret': 'F Fleuret',
    'Martinus Gijs': 'Martin Gijs',
    'Andreas Peter Burg': 'Andreas Burg',
}

In [None]:
### saving one author to a file
# Same search as before for the professors that still have no
# 'author-<i>.pkl', but names listed in mapping1 are replaced by their
# shortened form before querying.
for i, prof in enumerate(tqdm(profs)):
    filename = 'author-' + str(i) + '.pkl'
    if os.path.isfile(filename):
        # already saved by an earlier run
        continue
    # fall back to the original name when there is no shortened form
    prof = mapping1.get(prof, prof)
    author = list(gs.search_author(prof + ' epfl'))
    if len(author) == 1:
        # no ambiguity in author search: keep the single hit
        author = author[0]
        with open(filename, 'wb') as f:
            f.write(pickle.dumps(author))
    else:
        print('ERROR PROCESSING ' + prof + ' len=' + str(len(author)))

In [None]:
# For every saved 'author-<i>.pkl' that has no '.filled' counterpart yet, call
# author.fill() and pickle the result as 'author-<i>.pkl.filled'. The cloud
# cell reads author.publications from those files, so fill() presumably
# fetches the publication list (TODO confirm against the scholarly docs).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that this notebook wrote itself.
for i, prof in enumerate(tqdm(profs)):
    filename = 'author-' + str(i) + '.pkl'
    if os.path.isfile(filename + '.filled'): continue
    if not os.path.isfile(filename): continue
    # close the file right after reading instead of leaking one handle per author
    with open(filename, 'rb') as author_file:
        author = pickle.load(author_file)
    author.fill()
    with open(filename + '.filled', 'wb') as f: f.write(pickle.dumps(author))

## Obtaining original csv

In [None]:
# original lab table; the cells below rewrite its 'Professors' column and save
# the result as 'labs_prof_filtered.csv'
inp = pd.read_csv('labs.csv')

In [None]:
# preview the first rows of the original lab table
inp.head()

In [None]:
# Load the professor-name mapping stored as JSON in 'prof_mapping.txt' (written
# by the last cell of this notebook). The `with` block closes the file, which
# the previous open(...).read() one-liner never did.
with open('prof_mapping.txt', 'r') as mapping_file:
    mapping = json.load(mapping_file)

In [None]:
def _canonical_professors(cell):
    """Map each comma-separated name in `cell` through `mapping`, re-joined with ', '.

    Raises KeyError for a name that is not a key of `mapping`.
    """
    return ', '.join(mapping[x.strip()] for x in cell.split(','))

# Whole columns are assigned instead of writing inp[col][i] = ...: that
# chained indexing can write to a temporary copy (SettingWithCopyWarning) and
# leave `inp` unchanged.
# NOTE: this cell is not idempotent - after one run the 'Professors' column
# holds mapped names, which are generally not keys of `mapping`.
inp['Professors'] = inp['Professors'].map(_canonical_professors)
for col in list(inp.columns):
    # every cell is expected to be a string, as the per-cell .strip() assumed
    inp[col] = inp[col].map(str.strip)

In [None]:
# save the rewritten table (without the index column); the sections above read
# this file back as `inp_filtered`
inp.to_csv('labs_prof_filtered.csv', index = False)

## Obtaining mapping

In [None]:
# Raw 'Professors' cells, one comma-separated string per row of `inp`. The
# column is read directly because DataFrame.get_value was removed in pandas 1.0.
profs = list(inp['Professors'])
# every individual name, stripped, in table order
all_profs = [name.strip() for cell in profs for name in cell.split(',')]

In [None]:
from google import google

In [None]:
# For every distinct name, run google.search on 'people.epfl.ch <name>' and
# take the first result whose title has the form
# '<something> : Contacts - People@EPFL'; <something> becomes the mapped name.
mapping = {}
for prof in all_profs:
    if prof in mapping:
        # this name was already resolved
        continue
    for res in google.search("people.epfl.ch " + prof, 1):
        title = res.name.split(' : ')
        if len(title) != 2 or title[1] != 'Contacts - People@EPFL':
            continue
        mapping[prof] = title[0]
        print(prof + " === " + title[0])
        break
    else:
        # no result matched the expected title pattern
        print('NOT FOUND ' + prof)

In [None]:
# list the names that have no entry in `mapping` yet
for prof in all_profs:
    if prof in mapping:
        continue
    print(prof)

In [None]:
# hand-written entries (presumably the names the search left unresolved)
mapping.update({
    'MER B. Dutoit': 'Bertrand Dutoit',
    'MER J.-M. Odobez': 'Jean-Marc Odobez',
    'Dr. M. Cornaglia': 'Matteo Cornaglia',
})

In [None]:
# Save the mapping as JSON in 'prof_mapping.txt'.
# Serialise before opening the file: opening with 'w' truncates it, so a
# failing json.dumps must not wipe a previously saved mapping. The `with`
# block also closes the file if the write itself fails.
serialized_mapping = json.dumps(mapping)
with open('prof_mapping.txt', 'w') as f:
    f.write(serialized_mapping)