In [1]:
import json
from collections.abc import KeysView

import mysql.connector as mysql

# Load the local settings file; later cells read the MySQL credentials
# ('MySQL User' / 'MySQL Pass') from the resulting `client` dictionary.
with open('config.json', mode='r') as read_file:
    client = json.loads(read_file.read())

In [220]:
# Open a connection to the local `scopus` database using the credentials
# loaded from config.json into `client`.
db = mysql.connect(
    host = 'localhost',
    buffered = True,
    user = client['MySQL User'],
    passwd = client['MySQL Pass'],
    database = 'scopus'
)

# column names, one list per table
subject_col = ['asjc_code', 'top', 'middle', 'low']
source_col = ['source_id_scp', 'title', 'url', 'type', 'issn', 'e_issn', 'isbn', 'publisher', 'country']
source_subject_col = ['source_id', 'subject_id']
paper_col = ['paper_id_scp', 'eid', 'title', 'type', 'type_description', 'abstract', 'total_author', 'open_access', 'cited_cnt', 'url', 'article_no', 'fund_no', 'retrieval_time', 'source_id', 'doi', 'volume', 'issue', 'date', 'page_range']
author_col = ['author_id_scp', 'first', 'last', 'initials', 'sex', 'type', 'rank', 'email']
paper_author_col = ['paper_id', 'author_id', 'author_no']
author_profile_col = ['author_id', 'url', 'type']
department_col = ['department_id', 'name', 'abbreviation', 'type', 'lat', 'lng']
author_department_col = ['author_id', 'department_id']
institution_col = ['institution_id_scp', 'name', 'abbreviation', 'city', 'country', 'url', 'type', 'lat', 'lng']
department_institution_col = ['department_id', 'institution_id']

# hand-written parameterized INSERT statements, one per table
subject_q = '''INSERT INTO subject (asjc_code, top, middle, low) VALUES (%s, %s, %s, %s)'''
source_q = '''INSERT INTO source (source_id_scp, title, url, type, issn, e_issn, isbn, publisher, country) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)'''
source_subject_q = '''INSERT INTO source_subject (source_id, subject_id) VALUES (%s, %s)'''
paper_q = '''INSERT INTO paper (paper_id_scp, eid, title, type, type_description, abstract, total_author, open_access, cited_cnt, url, article_no, fund_no, retrieval_time, source_id, doi, volume, issue, date, page_range) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
author_q = '''INSERT INTO author (author_id_scp, first, last, initials, sex, type, rank, email) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'''
paper_author_q = '''INSERT INTO paper_author (paper_id, author_id, author_no) VALUES (%s, %s, %s)'''
author_profile_q = '''INSERT INTO author_profile (author_id, url, type) VALUES (%s, %s, %s)'''
department_q = '''INSERT INTO department (department_id, name, abbreviation, type, lat, lng) VALUES (%s, %s, %s, %s, %s, %s)'''
author_department_q = '''INSERT INTO author_department (author_id, department_id) VALUES (%s, %s)'''
institution_q = '''INSERT INTO institution (institution_id_scp, name, abbreviation, city, country, url, type, lat, lng) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)'''
department_institution_q = '''INSERT INTO department_institution (department_id, institution_id) VALUES (%s, %s)'''

cols = [
    subject_col, source_col, source_subject_col, paper_col, author_col,
    paper_author_col, author_profile_col, department_col, author_department_col,
    institution_col, department_institution_col
]

col_names = [
    'subject', 'source', 'source_subject', 'paper', 'author',
    'paper_author', 'author_profile', 'department', 'author_department',
    'institution', 'department_institution'
]

# `ques` was referenced by the loop below but never defined, which made the
# cell fail with a NameError. It lists the hand-written statements in the
# same order as `cols` and `col_names`.
ques = [
    subject_q, source_q, source_subject_q, paper_q, author_q,
    paper_author_q, author_profile_q, department_q, author_department_q,
    institution_q, department_institution_q
]

# Build the INSERT statement for each table from its column list.
# The generated string `q` is only computed here; nothing consumes it yet.
for name, col, que in zip(col_names, cols, ques):
    table_name = name
    table_col = col
    q = f'INSERT INTO {table_name} ({", ".join(table_col)}) VALUES ({"%s, " * (len(table_col) - 1)}%s)'

print(db)
cursor = db.cursor()
# NOTE(review): the connection is closed right after the cursor is created,
# so `cursor` and `db` look unusable for any later cell - confirm this is intended.
db.close()

<mysql.connector.connection_cext.CMySQLConnection object at 0x000002358A00D3C8>


In [2]:
# modules

import os
import io
import csv
import json
from collections import OrderedDict
from datetime import datetime
import random

# Directory holding the raw Scopus search-result dumps for one institution.
path = 'data\\Sharif University of Technology'
# os.walk yields (dirpath, dirnames, filenames); keep the file names of the top level only.
files = list(os.walk(path))[0][2]

# Peek at the first dump so its structure can be explored interactively.
first_dump = os.path.join(path, files[0])
with io.open(first_dump, 'r', encoding='utf8') as raw:
    data = json.load(raw)

# Faculty roster; the 'Scopus' column is a comma-separated list of integer ids.
faculties = []
with io.open('data\\faculties.csv', 'r', encoding='utf-8-sig') as csvFile:
    for row in csv.DictReader(csvFile):
        scopus_ids = row['Scopus']
        if scopus_ids:
            row['Scopus'] = [int(scopus_id) for scopus_id in scopus_ids.split(',')]
        faculties.append(row)

# ASJC subject classification rows, kept exactly as read.
with io.open('data\\ASJC Codes.csv', 'r', encoding='utf-8-sig') as csvFile:
    asjc = list(csv.DictReader(csvFile))

# Columns of the Scopus source list that are discarded on load.
unused_source_columns = (
    'Active',
    'Discontinued',
    'Coverage',
    '2016 CiteScore',
    '2017 CiteScore',
    '2018 CiteScore',
    'Medline-sourced',
    'Open Access',
    'Articles in Press Included',
    'Added to list April 2019',
    'Title history indication',
    'Related title to title history indication',
    'Other related title 1',
    'Other related title 2',
    'Other related title 3',
    'Publisher imprints grouped to main Publisher',
)

sources = []
with io.open('data\\Scopus Sources.csv', 'r', encoding='utf-8-sig') as csvFile:
    for row in csv.DictReader(csvFile):
        for column in unused_source_columns:
            row.pop(column, None)

        # 'ASJC' is a ';'-separated list of codes; empty fragments are skipped.
        asjc_codes = []
        for code in row['ASJC'].split(';'):
            if code != '':
                asjc_codes.append(int(code))
        row['ASJC'] = asjc_codes
        sources.append(row)

In [None]:
# Walk every result dump, insert the source (journal/proceedings) of each
# paper and assemble the per-paper and per-author rows.
# Relies on `files`, `path`, `cursor`, `db` and `source_q` from earlier cells.
for file in files:
#     print(file)
    # File names look like "<institution>_y2018_015_S9J79E_1558880343.txt":
    # the 4th field from the end is "y<year>", the last one a Unix timestamp.
    year = file.split('.')[0].split('_')[-4][1:]
    with io.open(os.path.join(path, file), 'r', encoding='utf8') as raw:
        data = json.load(raw)
    data = data['search-results']['entry']
    for paper in data:
        print(int(paper['dc:identifier'].split(':')[1]))
        # Placeholder id for papers that carry no 'source-id'.
        rnd_source = random.randint(100000,200000)
        # Values must follow the column order of `source_q`:
        # source_id_scp, title, url, type, issn, e_issn, isbn, publisher, country.
        # The previous list had 8 values in a different order, which cannot
        # satisfy the 9 placeholders of the statement.
        source_info = [
            (int(paper['source-id']) if 'source-id' in paper else rnd_source),
            paper.get('prism:publicationName', 'No Name!'),
            'https://www.scopus.com/sourceid/', # url
            paper.get('prism:aggregationType'), # type
            paper.get('prism:issn'),
            paper.get('prism:eIssn'),
            (paper['prism:isbn'][0]['$'] if 'prism:isbn' in paper else None),
            None, # **publisher
            None, # **country
        ]
        cursor.execute(source_q, source_info)

        # NOTE(review): this list has 20 entries while `paper_col` names 19
        # columns (two timestamps follow 'fund-no'). It is not executed yet;
        # reconcile it with `paper_q` before enabling the INSERT below.
        paper_info = [
            int(paper['dc:identifier'].split(':')[1]),
            paper['eid'],
            paper['dc:title'],
            paper.get('subtype'),
            paper.get('subtypeDescription'),
            paper.get('dc:description'),
            paper['author-count']['$'],
            paper.get('openaccess'),
            paper['citedby-count'],
            paper['link'][-2]['@href'],
            paper.get('article-number'),
            paper.get('fund-no'),
            datetime.utcfromtimestamp(int(file.split('.')[0].split('_')[-1])).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            (int(paper['source-id']) if 'source-id' in paper else rnd_source),
            paper.get('prism:doi'),
            paper.get('prism:volume'),
            paper.get('prism:issueIdentifier'),
            (datetime.strptime(paper['prism:coverDate'], '%Y-%m-%d').strftime('%Y-%m-%d') if 'prism:coverDate' in paper else year),
            paper.get('prism:pageRange'),
        ]
#         cursor.execute(paper_q, paper_info)

        # NOTE(review): each author row has 9 values ('author-url' included)
        # while `author_col` names 8 columns; neither list is inserted yet.
        author_info = []
        paper_author_info = []
        for auth in paper['author']:
            author_info.append(
                [
                    int(auth['authid']),
                    auth['given-name'],
                    auth['surname'],
                    auth['initials'],
                    auth['author-url'],
                    None, # sex
                    None, # type
                    None, # rank
                    None, # email
                ]
            )

            paper_author_info.append(
                [
                    paper_info[0],
                    int(auth['authid']),
                    auth['@seq'],
                ]
            )
        db.commit()
db.close()
print('Finished!')

In [141]:
class Database:
    """Small convenience wrapper around a ``mysql.connector`` connection.

    The connection and the cursor are created lazily the first time a query
    is executed. Every method prints a trace line when it starts and, in most
    cases, when it finishes.
    """

    def __init__(self, config: dict, db_name: str, host: str = 'localhost', port: int = 3306, buffered: bool = True):
        """Store the connection settings; no connection is opened here.

        Args:
            config: mapping providing the keys 'MySQL User' and 'MySQL Pass'.
            db_name: name of the database to connect to.
            host: server host name.
            port: server TCP port.
            buffered: forwarded to ``mysql.connect`` as ``buffered``.
        """
        print('@ __init__')
        self._params = {
            'host': host,
            # previously accepted but dropped, so a non-default port was ignored
            'port': port,
            'buffered': buffered,
            'user': config['MySQL User'],
            'pass': config['MySQL Pass'],
        }
        self.db_name = db_name
        self.db = None
        self.cursor = None
        self.tables = []
        print('__init__ done!')

    def _connect(self):
        """Open the connection on first use and return it."""
        print('@ _connect')
        if not self.db:
            self.db = mysql.connect(
                host = self._params['host'],
                port = self._params['port'],
                buffered = self._params['buffered'],
                user = self._params['user'],
                password = self._params['pass'],
                database = self.db_name
            )
        print('_connect done!')
        return self.db

    def _cursor(self):
        """Return a cursor, connecting or reconnecting first when needed."""
        print('@ _cursor')
        if not self.db:
            self._connect()
        if not self.db.is_connected():
            self.db.reconnect()
            # a cursor created before the reconnect must not be reused
            self.cursor = None
        if not self.cursor:
            self.cursor = self.db.cursor()
        print('_cursor done!')
        return self.cursor

    def _execute(self, query, values = None, fetch: bool = False, many: bool = False, close_cursor: bool = False):
        """Run ``query`` with ``values`` bound to its placeholders.

        Args:
            query: SQL text, optionally containing ``%s`` placeholders.
            values: parameters for the placeholders; a sequence of parameter
                tuples when ``many`` is true. ``None`` means no parameters.
            fetch: when true, return ``fetchall()`` instead of the cursor.
            many: when true, use ``executemany`` instead of ``execute``.
            close_cursor: when true, close and forget the cursor afterwards.

        Returns:
            The fetched rows when ``fetch`` is true, otherwise the cursor.
        """
        print('@ _execute')
        if values is None:
            # avoid a shared mutable default argument
            values = []
        if many:
            self._cursor().executemany(query, values)
        else:
            self._cursor().execute(query, values)
        if fetch:
            server_response = self.cursor.fetchall()
        else:
            server_response = self.cursor
        # print(server_response)
        print('_execute done!')
        if close_cursor:
            self.cursor.close()
            self.cursor = None
        return server_response

    def _close(self):
        """Close the connection if one is open; safe to call when never connected."""
        print('@ _close')
        if self.db is not None and self.db.is_connected():
            self.db.close()
        # the cursor belongs to the closed connection, so drop it as well
        self.cursor = None
        print('Closed!')

    def _show_tables(self):
        """Return the names of all tables in the database."""
        print('@ _show_tables')
        return [table[0] for table in self._execute(query = 'SHOW TABLES', fetch = True)]

    def _has_table(self, table_name):
        """Return True when ``table_name`` is among the database's tables."""
        print('@ _has_table')
        found = table_name in self._show_tables()
        print('_has_table done!')
        return found

    def _column_names(self, table_name):
        """Return the column names of ``table_name`` (first field of DESCRIBE rows)."""
        print('@ _column_names')
        return [col[0] for col in self.describe(table_name)]

    def _insert(self, table_name, data: list):
        """Insert ``data`` into ``table_name`` and commit.

        Args:
            table_name: target table.
            data: list of dictionaries keyed by column name. All rows are
                assumed to share the keys of the first row.

        Returns:
            An 'Error! ...' string when the table or one of the columns does
            not exist; otherwise the result of ``commit()``. ``None`` is also
            returned when ``data`` is empty or when the insert raised (the
            error is printed and the connection closed in that case).
        """
        print('@ _insert')
        if not self._has_table(table_name):
            return f'Error! "{table_name}" table not found'
        if not data:
            # nothing to insert; also avoids indexing data[0] below
            print('_insert done!')
            return None
        # The method has to be called: testing membership against the bound
        # method object itself raises a TypeError.
        known_columns = self._column_names(table_name)

        # assuming all data rows have the same columns
        # data is a list of dictionaries, of which the keys are column names
        for col in data[0].keys():
            if col not in known_columns:
                return f'Error! "{col}" column not found'

        column_names = list(data[0].keys())
        query = f'INSERT INTO {table_name} ({", ".join(column_names)}) VALUES ({"%s, " * (len(column_names) - 1)}%s)'
        values = [tuple(row[col] for col in column_names) for row in data]
        try:
            self._execute(query, values, many = True)
            print('_insert done!')
            return self.db.commit()
        except Exception as e:
            print(f'error here: {e}')
            self._close()

    def describe(self, table_name: str = ''):
        """Describe one table, or every table when ``table_name`` is empty.

        Returns:
            The DESCRIBE rows of ``table_name``; without a table name, a list
            of ``{table: rows}`` dictionaries that is also stored in
            ``self.tables``.
        """
        print('@ describe')
        if table_name:
            query = f'DESCRIBE {table_name}'
            print('describe done!')
            return self._execute(query = query, fetch = True)
        # rebuild the cache instead of appending, so repeated calls do not
        # accumulate duplicate entries
        self.tables = [{table: self.describe(table)} for table in self._show_tables()]
        print('describe done!')
        return self.tables

    def _read(self, table_name: str, search: dict, select = '*', result_columns: bool = False):
        """Select rows of ``table_name`` matching every condition in ``search``.

        Args:
            table_name: table to query.
            search: ``{column: {'operator': ..., 'value': ...}}``; conditions
                are joined with AND. An empty mapping selects every row.
            select: the select list.
            result_columns: when true, return dictionaries keyed by the
                table's column names instead of raw row tuples.

        Returns:
            An 'Error! ...' string when the table does not exist, otherwise
            the matching rows.
        """
        print('@ _read')
        if not self._has_table(table_name):
            return f'Error! "{table_name}" table not found'

        # NOTE(review): operators and values are interpolated into the SQL
        # text as-is; do not pass untrusted input through `search`.
        query = f'SELECT {select} FROM {table_name}'
        if search:
            query += ' WHERE ' + ' AND '.join(
                [f'{k} {v["operator"]} {v["value"]}' for k, v in search.items()])

        server_response = self._execute(query = query, fetch = True, close_cursor = True)
        print('got the response from _execute')
        if result_columns:
            column_names = self._column_names(table_name)
            result = [
                {name: value for name, value in zip(column_names, row)}
                for row in server_response
            ]
            print('_read done!')
            return result
        print('_read done!')
        return server_response

    def _has_row(self, table_name, search: dict):
        """Return the number of rows of ``table_name`` matching ``search``."""
        print('@ _has_row')
        server_response = self._read(table_name, search, select = 'COUNT(*)')
        print('_has_row done!')
        return server_response[0][0]

    def _table_order(self):
        """Return the table names as an ordered list of sets."""
        # NOTE(review): presumably the order in which tables must be filled
        # to satisfy foreign keys - confirm against the schema.
        print('@ _table_order')
        return [
            {'source', 'subject', 'country', 'paper_funding'},
            {'source_subject', 'paper'},
            {'author', 'keyword'},
            {'paper_author', 'paper_keyword', 'department', 'author_profile'},
            {'institution', 'author_department'},
            {'department_institution'},
        ]

    def db_insert(self, data: list):
        """Placeholder for the bulk insert; not implemented yet."""
        print('@ db_insert')
        # data is a list of 2-layered dictionaries:
        # 1st layer for the table names and 2nd for the column names
        pass

In [29]:
def data_inspector(data: dict):
    """Report which required fields are absent from one paper record.

    Returns a list of warnings: the name of every missing top-level key,
    followed by 'author:<key>' for each key missing from each author entry
    and 'affiliation:<key>' for each key missing from each affiliation
    entry. The nested checks are skipped for a section that is itself
    missing. An empty list means nothing is missing.
    """
    required_top = (
        'source-id', 'prism:publicationName', 'prism:coverDate',
        'dc:identifier', 'eid', 'dc:title', 'subtype', 'author-count', 'openaccess', 'citedby-count', 'link',
        'author', 'affiliation',
    )
    # (section, keys every entry of that section must have), checked in this order
    required_nested = (
        ('author', ('authid', '@seq', 'afid')),
        ('affiliation', ('afid', 'affilname')),
    )

    present_top = data.keys()
    warnings = [field for field in required_top if field not in present_top]

    for section, required in required_nested:
        if section in warnings:
            continue
        for entry in data[section]:
            present = entry.keys()
            warnings.extend(
                f'{section}:{field}' for field in required if field not in present
            )
    return warnings

# Scratch copy of the row-building code from the import loop.
# NOTE(review): relies on `paper`, `file`, `year` and `rnd_source` being left
# over in the kernel from another cell; it fails on a fresh kernel.
# This list starts at the abstract, so it is only a partial paper row.
paper_info = [
    (paper['dc:description'] if 'dc:description' in paper.keys() else None),
    paper['author-count']['$'],
    (paper['openaccess'] if 'openaccess' in paper.keys() else None),
    paper['citedby-count'],
    paper['link'][-2]['@href'],
    (paper['article-number'] if 'article-number' in paper.keys() else None),
    (paper['fund-no'] if 'fund-no' in paper.keys() else None),
    datetime.utcfromtimestamp(int(file.split('.')[0].split('_')[-1])).strftime('%Y-%m-%d %H:%M:%S'),
    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    (int(paper['source-id']) if 'source-id' in paper.keys() else rnd_source),
    (paper['prism:doi'] if 'prism:doi' in paper.keys() else None),
    (paper['prism:volume'] if 'prism:volume' in paper.keys() else None),
    (paper['prism:issueIdentifier'] if 'prism:issueIdentifier' in paper.keys() else None),
    (datetime.strptime(paper['prism:coverDate'], '%Y-%m-%d').strftime('%Y-%m-%d') if 'prism:coverDate' in paper.keys() else year),
    (paper['prism:pageRange'] if 'prism:pageRange' in paper.keys() else None),
]

# The first element of the truncated list above is the abstract, so the paper
# id for the paper/author link rows is derived from 'dc:identifier' directly
# (formatted like "SCOPUS_ID:85048800068").
paper_id = int(paper['dc:identifier'].split(':')[1])

author_info = []
paper_author_info = []
for auth in paper['author']:
    author_info.append(
        [
            int(auth['authid']),
            auth['given-name'],
            auth['surname'],
            auth['initials'],
            auth['author-url'],
            None, # sex
            None, # type
            None, # rank
            None, # email
        ]
    )

    # paper_id, author_id, author_no
    paper_author_info.append(
        [
            paper_id,
            int(auth['authid']),
            auth['@seq'],
        ]
    )

def key_get(data: dict, keys: KeysView, key: str):
    """Return a flattened value for ``key``, or None when it is unavailable.

    Args:
        data: the record to read from.
        keys: the keys considered present (normally ``data.keys()``).
        key: the field to look up.

    Returns:
        None when ``key`` is not in ``keys`` or its value is an empty list;
        the '$' item of the first element when the value is a list; the '$'
        item when the value is a dict; otherwise the value itself.
    """
    # `dict_keys` is not a name that exists at runtime, so using it as an
    # annotation made the `def` itself fail; KeysView is the importable ABC.
    if key not in keys:
        return None
    result = data[key]
    if isinstance(result, list):
        # an empty list carries no value; indexing it would raise IndexError
        return result[0]['$'] if result else None
    if isinstance(result, dict):
        return result['$']
    return result

def data_cleaner(data: dict):
    """Turn one raw paper record into per-table rows.

    Args:
        data: dictionary containing the info about 1 paper. It is modified
            in place: a missing 'openaccess' field is set to '0'.

    Returns:
        ``{'warnings': [...]}`` when a required field is missing (see
        ``data_inspector``) or when the record has no link whose '@ref' is
        'scopus'; otherwise a dictionary keyed by table name holding the
        values extracted so far (several tables are still empty).
    """
    warnings = data_inspector(data)
    if 'openaccess' in warnings:
        data['openaccess'] = '0'
    # A missing 'openaccess' (defaulted above) and missing author 'afid's are
    # tolerated. list.pop() expects an index, so the entries are filtered out
    # instead; 'author:afid' can occur once per author.
    tolerated = ('openaccess', 'author:afid')
    warnings = [warning for warning in warnings if warning not in tolerated]
    if warnings:
        return {'warnings': warnings}

    keys = data.keys()
    url = ''
    for link in data['link']:
        if link.get('@ref') == 'scopus':
            url = link['@href']
            break
    if not url:
        return {'warnings': ['paper url']}

    result = {
        'source': {
            'source_id_scp': int(data['source-id']),
            'title': data['prism:publicationName'],
            'url': 'https://www.scopus.com/sourceid/' + data['source-id'],
            'type': key_get(data, keys, 'prism:aggregationType'),
            'issn': key_get(data, keys, 'prism:issn'),
            'e_issn': key_get(data, keys, 'prism:eIssn'),
            'isbn': key_get(data, keys, 'prism:isbn'),
            'publisher': None,
            'country_id': None
        },
        'paper_funding': {
            'agency_id_scp': key_get(data, keys, 'fund-no'),
            'agency':  key_get(data, keys, 'fund-sponsor'),
            'agency_acronym': key_get(data, keys, 'fund-acr'),
        },
        'paper': {
            'url': url,
            'paper_id_scp': int(data['dc:identifier'].split(':')[1]),
            'eid': data['eid'],
            'title': data['dc:title'],
            'type': data['subtype'],
            'type_description': key_get(data, keys, 'subtypeDescription'),
            'cited_cnt': data['citedby-count'],
            # each field below reads its own source key; they previously all
            # read 'prism:volume' because of a copy-paste slip
            'volume': key_get(data, keys, 'prism:volume'),
            'issue': key_get(data, keys, 'prism:issueIdentifier'),
            'page_range': key_get(data, keys, 'prism:pageRange'),
            'date': key_get(data, keys, 'prism:coverDate'),
            'doi': key_get(data, keys, 'prism:doi'),
            'open_access': data['openaccess'],
            'abstract': key_get(data, keys, 'dc:description'),
            'article_no': key_get(data, keys, 'article-number'),
            'agency_id_scp': None,
            'total_authors': key_get(data, keys, 'author-count')
        },
        # not populated yet
        'author': {},
        'keyword': {},
        'paper_author': {},
        'paper_keyword': {},
        'department': {},
        'author_department': {},
        'institution': {},
        'department_institution': {},
    }
    # the assembled rows were built but never handed back to the caller
    return result

In [144]:
# Smoke test of the Database wrapper: ask how many `subject` rows have an
# asjc_code greater than 12. The bare last expression is shown as cell output.
d = Database(db_name = 'scopus', config = client)
asjc_above_12 = {'asjc_code': {'value': '12', 'operator': '>'}}
d._has_row('subject', asjc_above_12)

@ _has_row
@ _read
@ _has_table
@ _show_tables
@ _execute
@ _cursor
_cursor done!
_execute done!
_has_table done!
@ _execute
@ _cursor
_cursor done!
_execute done!
got the response from _execute
_read done!
_has_row done!


3

In [30]:
# Run data_inspector over every paper of every dump and report the records
# that miss a required field; `cnt` is the number of such records.
cnt = 0
for file in files:
    year = file.split('.')[0].split('_')[-4][1:]
    dump_path = os.path.join(path, file)
    with io.open(dump_path, 'r', encoding='utf8') as raw:
        data = json.load(raw)
    data = data['search-results']['entry']
    for paper in data:
        warnings = data_inspector(paper)
        if not warnings:
            continue
        cnt += 1
        # file name, paper identifier, missing fields, then a blank separator line
        print(file, paper['dc:identifier'], warnings, '', sep='\n')
print(cnt)

Sharif University of Technology_y2018_015_S9J79E_1558880343.txt
SCOPUS_ID:85048800068
['openaccess']

Sharif University of Technology_y2018_016_S9J79E_1558880345.txt
SCOPUS_ID:85056004836
['author:afid']

Sharif University of Technology_y2018_024_S9J79E_1558880364.txt
SCOPUS_ID:85041011942
['openaccess']

Sharif University of Technology_y2018_024_S9J79E_1558880364.txt
SCOPUS_ID:85052450133
['dc:title']

Sharif University of Technology_y2018_025_S9J79E_1558880366.txt
SCOPUS_ID:85052434975
['author:afid']

Sharif University of Technology_y2018_033_S9J79E_1558880385.txt
SCOPUS_ID:85044017205
['author:afid']

Sharif University of Technology_y2018_060_S9J79E_1558880448.txt
SCOPUS_ID:85029503975
['source-id']

Sharif University of Technology_y2018_063_S9J79E_1558880455.txt
SCOPUS_ID:85042594707
['author:afid']

Sharif University of Technology_y2018_078_S9J79E_1558880491.txt
SCOPUS_ID:85062311071
['author:afid']

9
