In [1]:
import mysql.connector as mysql
import json

with open('config.json', 'r') as read_file:
    client = json.load(read_file)

In [120]:
db = mysql.connect(
    host = 'localhost',
    buffered = True,
    user = client['MySQL User'],
    passwd = client['MySQL Pass'],
    database = 'scopus'
)

# column names
source_col = [
    'source_id', 'title', 'url', 'issn', 'isbn', 'subject', 'publisher', 'type'
]
paper_col = [
    'paper_id', 'eid', 'title', 'type', 'type_description', 'abstract', 
    'total_author', 'open_access', 'cited_cnt', 'url', 'article_no', 
    'fund_no', 'retrieval_time',  'create_time',  'source_id',  'doi', 
    'volume',  'issue',  'date',  'page_range'
]
paper_author_col = ['paper_id', 'author_id', 'author_no']
author_col = [
    'author_id', 'first', 'last', 'initials', 'url', 'sex', 'type', 'rank', 'email',
]
author_department_col = ['author_id', 'department_id']
department_col = [
    'department_id', 'name', 'abbreviation', 'type', 'lat', 'lng'
]
department_institution_col = ['department_id', 'institution_id']
institution_col = [
    'institution_id', 'name', 'abbreviation', 'city', 'country', 'url', 'type', 'lat', 'lng'
]

print(db)
cursor = db.cursor()

<mysql.connector.connection_cext.CMySQLConnection object at 0x000001920ADC0518>


In [115]:
cursor.execute('DESCRIBE author')
databases = cursor.fetchall()
for database in databases:
    print(database)

('author_id', 'int(11)', 'NO', 'PRI', None, '')
('first', 'varchar(45)', 'YES', '', None, '')
('last', 'varchar(45)', 'YES', '', None, '')
('initials', 'varchar(45)', 'YES', '', None, '')
('url', 'varchar(256)', 'YES', '', None, '')
('sex', 'tinyint(1)', 'YES', '', None, '')
('type', 'varchar(45)', 'YES', '', None, '')
('rank', 'varchar(45)', 'YES', '', None, '')
('email', 'varchar(128)', 'YES', '', None, '')


In [105]:
source_q = '''
    INSERT INTO scopus.source (
        source_id, title, url, issn, isbn, subject, publisher, type
    ) VALUES (
        %s, %s, %s, %s, %s, %s, %s, %s
    )
'''
paper_q = '''
    INSERT INTO paper (
        paper_id, eid, title, type, type_description, abstract, 
        total_author, open_access, cited_cnt, url, article_no, 
        fund_no, retrieval_time,  create_time, source_id, doi, 
        volume,  issue,  date,  page_range
    ) VALUES (
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
    )
'''
author_q = '''
    INSERT INTO paper_author (
        author_id, first, last, initials, url, sex, type, rank, email
    ) VALUES (
        %s, %s, %s, %s, %s, %s, %s, %s
    )
'''
paper_author_q = '''
    INSERT INTO paper_author (
        paper_id, author_id, author_no
    ) VALUES (
        %s, %s, %s
    )
'''
department_q = '''
    INSERT INTO paper_author (
        department_id, name, abbreviation, type, lat, lng
    ) VALUES (
        %s, %s, %s, %s, %s, %s
    )
'''
author_department_q = '''
    INSERT INTO paper_author (
        author_id, department_id
    ) VALUES (
        %s, %s
    )
'''
institution_q = '''
    INSERT INTO paper_author (
        institution_id, name, abbreviation, city, country, url, type, lat, lng
    ) VALUES (
        %s, %s, %s, %s, %s, %s, %s, %s, %s
    )
'''
department_institution_q = '''
    INSERT INTO paper_author (
        department_id, institution_id
    ) VALUES (
        %s, %s
    )
'''

In [26]:
values = (1358,'Chem Proceedings','https://www.google.com',12345678,123,'ChE','Sharif','Journal')
cursor.execute(source_q, values)
db.commit()

print(cursor.rowcount, "record inserted")
db.close()
# cursor.execute(paper_q, (123,4345,'awef','ar','','',1,1,1,'wef','','','2001-12-12 12:13:24','2038-01-19 03:14:07',1356,'','','','2008-05-06',''))

In [91]:
# modules

import os
import io
import csv
import json
from collections import OrderedDict
from datetime import datetime
import random

path = 'data\\Sharif University of Technology'
files = list(os.walk(path))[0][2]

with io.open(os.path.join(path, files[0]), 'r', encoding='utf8') as raw:
    data = json.load(raw)

faculties = []
with io.open('data\\faculties.csv', 'r', encoding='utf-8-sig') as csvFile:
    reader = csv.DictReader(csvFile)
    for row in reader:
        if row['Scopus']:
            row['Scopus'] = list(map(int, row['Scopus'].split(',')))
        faculties.append(row)

In [121]:
for file in files:
#     print(file)
    year = file.split('.')[0].split('_')[-4][1:]
    with io.open(os.path.join(path, file), 'r', encoding='utf8') as raw:
        data = json.load(raw)
    data = data['search-results']['entry']
    for paper in data:
        print(int(paper['dc:identifier'].split(':')[1]))
        rnd_source = random.randint(100000,200000)
        source_info = [
            (int(paper['source-id']) if 'source-id' in paper.keys() else rnd_source),
            (paper['prism:publicationName'] if 'prism:publicationName' in paper.keys() else 'No Name!'),
            'https://www.scopus.com/sourceid/', # url
            (paper['prism:issn'] if 'prism:issn' in paper.keys() else None),
            (paper['prism:isbn'][0]['$'] if 'prism:isbn' in paper.keys() else None),
            None, # **subject
            None, # **publisher
            (paper['prism:aggregationType'] if 'prism:aggregationType' in paper.keys() else None),
        ]
        cursor.execute(source_q, source_info)
        
        paper_info = [
            int(paper['dc:identifier'].split(':')[1]),
            paper['eid'],
            paper['dc:title'],
            (paper['subtype'] if 'subtype' in paper.keys() else None),
            (paper['subtypeDescription'] if 'subtypeDescription' in paper.keys() else None),
            (paper['dc:description'] if 'dc:description' in paper.keys() else None),
            paper['author-count']['$'],
            (paper['openaccess'] if 'openaccess' in paper.keys() else None),
            paper['citedby-count'],
            paper['link'][-2]['@href'],
            (paper['article-number'] if 'article-number' in paper.keys() else None),
            (paper['fund-no'] if 'fund-no' in paper.keys() else None),
            datetime.utcfromtimestamp(int(file.split('.')[0].split('_')[-1])).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            (int(paper['source-id']) if 'source-id' in paper.keys() else rnd_source),
            (paper['prism:doi'] if 'prism:doi' in paper.keys() else None),
            (paper['prism:volume'] if 'prism:volume' in paper.keys() else None),
            (paper['prism:issueIdentifier'] if 'prism:issueIdentifier' in paper.keys() else None),
            (datetime.strptime(paper['prism:coverDate'], '%Y-%m-%d').strftime('%Y-%m-%d') if 'prism:coverDate' in paper.keys() else year),
            (paper['prism:pageRange'] if 'prism:pageRange' in paper.keys() else None),
        ]
#         cursor.execute(paper_q, paper_info)
        
        author_info = []
        paper_author_info = []
        for auth in paper['author']:
            author_info.append(
                [
                    int(auth['authid']),
                    auth['given-name'],
                    auth['surname'],
                    auth['initials'],
                    auth['author-url'],
                    None, # sex
                    None, # type
                    None, # rank
                    None, # email
                ]
            )
            
            paper_author_info.append(
                [
                    paper_info[0],
                    int(auth['authid']),
                    auth['@seq'],
                ]
            )
        db.commit()
db.close()
print('Finished!')

84855594214
84855584394


IntegrityError: 1062 (23000): Duplicate entry '28075' for key 'PRIMARY'

In [63]:
data = data['search-results']['entry']

In [118]:
db.close()