# Annotating dataset with Scopus/WOS..

This document allow annotate the current dataset extracted from ISTEX database by using SCOPUS categories as annotation.

In [1]:
ALL_LABELS = True

In [2]:
import pymongo
from pymongo import MongoClient
import tqdm
from collections import defaultdict, Counter
import json
import re 

client = MongoClient('localhost', 27017)
db = client.istex
cambridge = db.cambridge
springer = db.springer
total = springer.find({'categories': {'$exists': True}, 'abstract': {'$exists': True}, 'title': {'$exists': True}}).count() + cambridge.find({'categories': {'$exists': True}, 'abstract': {'$exists': True}, 'title': {'$exists': True}}).count()
categories = defaultdict(set)

cnt = Counter()

with open('springer_cambridge.txt', 'w') as outfile:
    for col in [springer, cambridge]:
        for i in tqdm.tqdm(col.find({'categories': {'$exists': True}, 'abstract': {'$exists': True}, 'title': {'$exists': True}, 'language': 'eng'})):
            for ar, v in i['categories'].iteritems():
                if ar == 'scopus':
                    cat = set()
                    for el in v:
                        if ALL_LABELS:
                                cat.add(el)
                        else:
                            if '1 -' in el:
                                cat.add(el)
                    cat = list(cat)
                    cat = map(lambda x: ' '.join(re.findall(r'\b[^\d\W]+\b', x)), cat)
                    json.dump({'abstract': i['abstract'], 'label': ','.join(cat), 'id': i['id'], 'title': i['title']}, outfile)
                    outfile.write('\n')


51805it [00:36, 1435.97it/s]
96642it [00:55, 1729.12it/s]


In [3]:
import pandas as pd

d = pd.read_json("springer_cambridge.txt", orient='records', lines=True)
d = d[d.label != '']

In [4]:
d.label.head()

0    Life Sciences,Environmental Science,Aquatic Sc...
1    Life Sciences,Pharmacology medical,Pharmacolog...
2    Biochemistry Genetics and Molecular Biology,Li...
3    Life Sciences,Environmental Science,Aquatic Sc...
4    Life Sciences,Microbiology,Genetics,Biochemist...
Name: label, dtype: object

In [5]:
d.label = d.label.apply(lambda x: x.lower().strip().replace(' ', '_'))
d.label[0]

u'life_sciences,environmental_science,aquatic_science,ecology,agricultural_and_biological_sciences,physical_sciences,ecology_evolution_behavior_and_systematics'

In [6]:
categories = set()
for l in tqdm.tqdm(d.label):
    l = l.split(',')
    for c in l:
        categories.add(c)
categories

100%|██████████| 83933/83933 [00:00<00:00, 286187.37it/s]


{u'accounting',
 u'agricultural_and_biological_sciences',
 u'agricultural_and_biological_sciences_miscellaneous',
 u'agronomy_and_crop_science',
 u'algebra_and_number_theory',
 u'analysis',
 u'analytical_chemistry',
 u'anatomy',
 u'anesthesiology_and_pain_medicine',
 u'animal_science_and_zoology',
 u'anthropology',
 u'applied_mathematics',
 u'applied_microbiology_and_biotechnology',
 u'applied_psychology',
 u'aquatic_science',
 u'archaeology',
 u'architecture',
 u'artificial_intelligence',
 u'arts_and_humanities',
 u'arts_and_humanities_miscellaneous',
 u'astronomy_and_astrophysics',
 u'atmospheric_science',
 u'atomic_and_molecular_physics_and_optics',
 u'behavioral_neuroscience',
 u'biochemistry',
 u'biochemistry_genetics_and_molecular_biology',
 u'biochemistry_medical',
 u'bioengineering',
 u'biological_psychiatry',
 u'biomaterials',
 u'biomedical_engineering',
 u'biophysics',
 u'biotechnology',
 u'business_and_international_management',
 u'business_management_and_accounting',
 u'bus

In [7]:
import numpy as np

for c in categories:
    d[c] = np.full((d.shape[0],), 0)
    d[c][d.label.str.contains(c)] = 1
d.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,abstract,id,label,title,computational_mathematics,oncology,chemical_engineering,mathematical_physics,health_toxicology_and_mutagenesis,cultural_studies,...,environmental_science_miscellaneous,public_administration,general_engineering,general_computer_science,physics_and_astronomy_miscellaneous,biotechnology,chemistry,inorganic_chemistry,information_systems,anesthesiology_and_pain_medicine
0,Abstract: Uptake and metabolic turnover of a v...,91543AB49EF6104156B755A81D4D3E11B1E3CF19,"life_sciences,environmental_science,aquatic_sc...",Activity of marine psychrophilic bacteria at e...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Summary: The calcium antagonist falipamil, a c...",EFB951031543D6D1B8479250A5B6BBAB6A7B24C8,"life_sciences,pharmacology_medical,pharmacolog...",Atropine and verapamil interactions in healthy...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Summary: Experiments were performed to compare...,9DEA4ED385B3B941293CBADB71A3004AA4BFA390,"biochemistry_genetics_and_molecular_biology,li...",Mechanism of asymmetric block of K channels by...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,Abstract: Cells of the coccolithophoridCricosp...,152CF0B3C7000B08F84612084EE4CC844F692482,"life_sciences,environmental_science,aquatic_sc...",Ultrastructural study and calcium and cadmium ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Abstract: A calcium requirement was shown for ...,0866A00EDFAA77712E45BCCAFEFC042E296D3772,"life_sciences,microbiology,genetics,biochemist...",The calcium requirement for functional vesicle...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
d = d.drop(columns='label')

In [9]:
d.to_csv('scopus.csv', encoding='utf-8', index=False)

In [10]:
from sklearn.model_selection import train_test_split


train, test = train_test_split(d, test_size=0.33)

In [11]:
train.to_csv('scopus_train.csv', encoding='utf-8', index=False)
test.to_csv('scopus_test.csv', encoding='utf-8', index=False)

print(train.shape)
print(test.shape)

(56235, 240)
(27698, 240)


In [None]:
test.to_json('scopus_test.json', orient='records', lines=True)

In [None]:
train[train.abstract.str.contains('fiscal policy')]

## FastText formatting

In [None]:
def labeling(row):
    main_part = row['title'] + ' ' + row['abstract']
    to_add = []
    if row['health_sciences']:
        to_add.append('__label__health_sciences ')
    if row['life_sciences']:
        to_add.append('__label__life_sciences')
    if row['physical_sciences']:
        to_add.append('__label__physical_sciences') 
    if row['social_sciences']:
        to_add.append('__label__social_sciences')
    row['abstract'] = ' '.join(to_add) + ' ' + main_part
    return row
        
ft = d.apply(labeling, axis=1)

In [None]:
import codecs

train, test = train_test_split(ft, test_size=0.33)
with codecs.open('fasttext_abstract_train.txt', 'w', encoding='utf8') as f:
    for i in train.abstract:
        f.write(i)
        f.write('\n')

In [None]:
with codecs.open('fasttext_abstract_test.txt', 'w', encoding='utf8') as f:
    for i in test.abstract:
        f.write(i)
        f.write('\n')

In [None]:
ft.abstract[0]