In [1]:
import mysql.connector as mysql
import json

# Read the settings file; later cells look up the 'MySQL User' and
# 'MySQL Pass' entries of `client` when connecting to the database.
with open('config.json', mode='r') as config_file:
    client = json.load(config_file)

In [220]:
# Open the connection to the local `scopus` database with the credentials
# loaded from config.json into `client`.
db = mysql.connect(
    host = 'localhost',
    buffered = True,
    user = client['MySQL User'],
    passwd = client['MySQL Pass'],
    database = 'scopus'
)


def build_insert_query(table_name, columns):
    """Return a parameterized INSERT statement for `table_name`.

    Args:
        table_name: name of the target table.
        columns: ordered column names; one `%s` placeholder is emitted per column.

    Returns:
        The SQL text, e.g. "INSERT INTO `t` (`a`, `b`) VALUES (%s, %s)".

    Identifiers are wrapped in backticks because some column names used below
    (e.g. `rank`) are reserved words in recent MySQL versions.
    """
    column_list = ', '.join(f'`{column}`' for column in columns)
    placeholders = ', '.join(['%s'] * len(columns))
    return f'INSERT INTO `{table_name}` ({column_list}) VALUES ({placeholders})'


# column names
subject_col = ['asjc_code', 'top', 'middle', 'low']
source_col = ['source_id_scp', 'title', 'url', 'type', 'issn', 'e_issn', 'isbn', 'publisher', 'country']
source_subject_col = ['source_id', 'subject_id']
paper_col = ['paper_id_scp', 'eid', 'title', 'type', 'type_description', 'abstract', 'total_author', 'open_access', 'cited_cnt', 'url', 'article_no', 'fund_no', 'retrieval_time', 'source_id', 'doi', 'volume', 'issue', 'date', 'page_range']
author_col = ['author_id_scp', 'first', 'last', 'initials', 'sex', 'type', 'rank', 'email',]
paper_author_col = ['paper_id', 'author_id', 'author_no']
author_profile_col = ['author_id', 'url', 'type']
department_col = ['department_id', 'name', 'abbreviation', 'type', 'lat', 'lng']
author_department_col = ['author_id', 'department_id']
institution_col = ['institution_id_scp', 'name', 'abbreviation', 'city', 'country', 'url', 'type', 'lat', 'lng']
department_institution_col = ['department_id', 'institution_id']

# `cols` and `col_names` are parallel lists: col_names[i] is the table whose
# columns are listed in cols[i].
cols = [
    subject_col, source_col, source_subject_col, paper_col, author_col, 
    paper_author_col, author_profile_col, department_col, author_department_col, 
    institution_col, department_institution_col
]

col_names = [
    'subject', 'source', 'source_subject', 'paper', 'author', 
    'paper_author', 'author_profile', 'department', 'author_department', 
    'institution', 'department_institution'
]

# Derive every INSERT statement from its column list so the placeholder count
# can never drift from the columns. (The loop previously iterated over an
# undefined `ques` and threw its result away.)
ques = []
for name, col in zip(col_names, cols):
    table_name = name
    table_col = col
    q = build_insert_query(table_name, table_col)
    ques.append(q)

# Expose the statements under the names the other cells use; the order
# matches `col_names`.
(
    subject_q, source_q, source_subject_q, paper_q, author_q,
    paper_author_q, author_profile_q, department_q, author_department_q,
    institution_q, department_institution_q,
) = ques

print(db)
cursor = db.cursor()

<mysql.connector.connection_cext.CMySQLConnection object at 0x000002358A00D3C8>


In [None]:
# Show the column definitions of the `author` table.
# A table name is an identifier, so it cannot be bound as a `%s` parameter
# (it would be quoted as a string literal), and the parameter argument must be
# a tuple/list/dict rather than a bare string - hence the literal statement.
cursor.execute('DESCRIBE author')
databases = cursor.fetchall()  # one row per column of `author`
for database in databases:
    print(database)

In [26]:
# Manual smoke test: insert one hand-written row into `source`.
# `source_q` expects nine values in the order of `source_col`; the tuple used
# to carry eight values in a different order (id, title, url, issn, isbn,
# subject, publisher, type). The former subject value 'ChE' has no column in
# `source` and is therefore dropped here.
values = (
    1358,                      # source_id_scp
    'Chem Proceedings',        # title
    'https://www.google.com',  # url
    'Journal',                 # type
    12345678,                  # issn
    None,                      # e_issn
    123,                       # isbn
    'Sharif',                  # publisher
    None,                      # country
)
cursor.execute(source_q, values)
db.commit()

print(cursor.rowcount, "record inserted")
# NOTE: closing here means `db`/`cursor` must be re-created (re-run the
# connection cell) before any later cell can use them.
db.close()
# cursor.execute(paper_q, (123,4345,'awef','ar','','',1,1,1,'wef','','','2001-12-12 12:13:24','2038-01-19 03:14:07',1356,'','','','2008-05-06',''))

In [12]:
# modules

import os
import io
import csv
import json
from collections import OrderedDict
from datetime import datetime
import random

# Directory holding the raw JSON search results. Built with os.path.join so
# the notebook also runs where '\\' is not the path separator.
path = os.path.join('data', 'Sharif University of Technology')
# Only the files directly inside `path` are needed: take the first os.walk
# entry instead of walking (and materialising) the whole tree.
files = next(os.walk(path), (path, [], []))[2]

# Load the first result file for interactive inspection.
with io.open(os.path.join(path, files[0]), 'r', encoding='utf8') as raw:
    data = json.load(raw)

# Faculty list; a non-empty 'Scopus' cell is a comma-separated list of
# integer ids and is converted to a list of ints.
faculties = []
with io.open(os.path.join('data', 'faculties.csv'), 'r', encoding='utf-8-sig') as csvFile:
    reader = csv.DictReader(csvFile)
    for row in reader:
        if row['Scopus']:
            row['Scopus'] = list(map(int, row['Scopus'].split(',')))
        faculties.append(row)

# ASJC subject codes, one dict per CSV row.
with io.open(os.path.join('data', 'ASJC Codes.csv'), 'r', encoding='utf-8-sig') as csvFile:
    reader = csv.DictReader(csvFile)
    asjc = list(reader)

# Columns of the Scopus source list that are not used by this notebook.
UNUSED_SOURCE_COLUMNS = (
    'Active',
    'Discontinued',
    'Coverage',
    '2016 CiteScore',
    '2017 CiteScore',
    '2018 CiteScore',
    'Medline-sourced',
    'Open Access',
    'Articles in Press Included',
    'Added to list April 2019',
    'Title history indication',
    'Related title to title history indication',
    'Other related title 1',
    'Other related title 2',
    'Other related title 3',
    'Publisher imprints grouped to main Publisher',
)

# Scopus sources with the unused columns removed and 'ASJC' parsed from a
# ';'-separated string into a list of ints.
sources = []
with io.open(os.path.join('data', 'Scopus Sources.csv'), 'r', encoding='utf-8-sig') as csvFile:
    reader = csv.DictReader(csvFile)
    for row in reader:
        for column in UNUSED_SOURCE_COLUMNS:
            row.pop(column, None)

        # Skip empty / whitespace-only fragments (e.g. from a trailing ';').
        row['ASJC'] = [int(code) for code in row['ASJC'].split(';') if code.strip()]
        sources.append(row)

In [None]:
# Walk over every downloaded result file and insert the source of each paper.
# The paper / author rows are assembled as well, but their inserts are not
# enabled yet.
for file in files:
#     print(file)
    # NOTE(review): assumes the file name (without extension) is '_'-separated,
    # with the 4th field from the end holding a one-character prefix followed
    # by the year and the last field a Unix timestamp - confirm against the
    # downloader that writes these files.
    name_parts = file.split('.')[0].split('_')
    year = name_parts[-4][1:]
    retrieval_time = datetime.utcfromtimestamp(int(name_parts[-1])).strftime('%Y-%m-%d %H:%M:%S')
    with io.open(os.path.join(path, file), 'r', encoding='utf8') as raw:
        data = json.load(raw)
    data = data['search-results']['entry']
    for paper in data:
        paper_id = int(paper['dc:identifier'].split(':')[1])
        print(paper_id)
        # Papers without a 'source-id' get a random placeholder id.
        rnd_source = random.randint(100000,200000)
        source_id = int(paper['source-id']) if 'source-id' in paper else rnd_source

        # Nine values, in the order of `source_col` / the placeholders of
        # `source_q`. The list used to hold eight values in a different order,
        # which cannot be bound to the nine-placeholder statement.
        source_info = [
            source_id,  # source_id_scp
            paper.get('prism:publicationName', 'No Name!'),  # title
            'https://www.scopus.com/sourceid/',  # url
            paper.get('prism:aggregationType'),  # type
            paper.get('prism:issn'),  # issn
            paper.get('prism:eIssn'),  # e_issn - presumably present only for some entries; None otherwise
            (paper['prism:isbn'][0]['$'] if 'prism:isbn' in paper else None),  # isbn
            None, # **publisher
            None, # country
        ]
        cursor.execute(source_q, source_info)
        
        # Nineteen values, in the order of `paper_col`. A second timestamp
        # (the current time) used to sit after `retrieval_time`; `paper` has
        # no column for it, so it is dropped.
        # NOTE(review): the `date` fallback is the bare year string - confirm
        # the column accepts that before enabling the insert below.
        paper_info = [
            paper_id,
            paper['eid'],
            paper['dc:title'],
            paper.get('subtype'),
            paper.get('subtypeDescription'),
            paper.get('dc:description'),
            paper['author-count']['$'],
            paper.get('openaccess'),
            paper['citedby-count'],
            paper['link'][-2]['@href'],
            paper.get('article-number'),
            paper.get('fund-no'),
            retrieval_time,
            source_id,
            paper.get('prism:doi'),
            paper.get('prism:volume'),
            paper.get('prism:issueIdentifier'),
            (datetime.strptime(paper['prism:coverDate'], '%Y-%m-%d').strftime('%Y-%m-%d') if 'prism:coverDate' in paper else year),
            paper.get('prism:pageRange'),
        ]
#         cursor.execute(paper_q, paper_info)
        
        # NOTE(review): each author row carries nine values (it includes the
        # author URL) while `author_col` lists eight columns; reconcile this
        # before inserting these rows.
        author_info = []
        paper_author_info = []
        # An entry without an 'author' list simply contributes no author rows.
        for auth in paper.get('author', []):
            author_info.append(
                [
                    int(auth['authid']),
                    auth['given-name'],
                    auth['surname'],
                    auth['initials'],
                    auth['author-url'],
                    None, # sex
                    None, # type
                    None, # rank
                    None, # email
                ]
            )
            
            paper_author_info.append(
                [
                    paper_info[0],
                    int(auth['authid']),
                    auth['@seq'],
                ]
            )
    # Commit once per file so a failure loses at most the current file.
    db.commit()
db.close()
print('Finished!')

In [63]:
# Unwrap the list of result entries from the loaded JSON document. Guarded so
# that re-running the cell after `data` is already the entry list is a no-op
# instead of a TypeError.
if isinstance(data, dict):
    data = data['search-results']['entry']

In [None]:
# Close the module-level MySQL connection `db` created by mysql.connect above.
db.close()

In [24]:
# Insert every ASJC row loaded into `asjc` into the `subject` table.
# Values follow the column order of `subject_q`: asjc_code, top, middle, low.
subject_rows = [
    (code['Code'], code['Top'], code['Middle'], code['Low'])
    for code in asjc
]
cursor.executemany(subject_q, subject_rows)
# Report success only after the transaction has actually been committed.
db.commit()
print('Done!')
db.close()

Done!


In [73]:
class Database:
    """Small convenience wrapper around a lazily opened MySQL connection.

    The connection and cursor are created on first use and cached on the
    instance. Validation problems (unknown table / column / operator) are
    reported by returning an 'Error! ...' string rather than raising.

    Args:
        config: mapping with the keys 'MySQL User' and 'MySQL Pass'.
        db_name: name of the database to connect to.
        host: server host name.
        buffered: forwarded to the connector's `buffered` option.
    """

    # Comparison operators accepted in a `_read` search condition.
    _OPERATORS = (
        '=', '!=', '<>', '<', '<=', '>', '>=', '<=>',
        'LIKE', 'NOT LIKE', 'IS', 'IS NOT',
    )

    def __init__(self, config: dict, db_name: str, host: str = 'localhost', buffered: bool = True):
        self._params = {
            'host': host,
            'buffered': buffered,
            'user': config['MySQL User'],
            'pass': config['MySQL Pass'],
        }
        self.db_name = db_name
        self.db = None       # connection, created by _connect()
        self.cursor = None   # cursor, created by _cursor()
        self.tables = []     # result of the last describe() without arguments

    @staticmethod
    def _quote(identifier):
        """Return `identifier` wrapped in backticks (embedded backticks doubled).

        Quoting lets reserved words (e.g. a column called `rank`) be used as
        table or column names.
        """
        return '`' + str(identifier).replace('`', '``') + '`'

    def _connect(self):
        """Return the cached connection, opening it on first use."""
        if self.db is None:
            self.db = mysql.connect(
                host = self._params['host'],
                buffered = self._params['buffered'],
                user = self._params['user'],
                passwd = self._params['pass'],
                database = self.db_name
            )
        return self.db

    def _cursor(self):
        """Return the cached cursor, creating it (and the connection) on first use."""
        if self.cursor is None:
            self.cursor = self._connect().cursor()
        return self.cursor

    def _execute(self, query, values = None, fetch: bool = False, many: bool = False):
        """Run `query` and return either the fetched rows or the cursor.

        Args:
            query: SQL text, using `%s` placeholders for `values`.
            values: parameters for the statement; a sequence of parameter
                sequences when `many` is true. Defaults to no parameters.
            fetch: when true, return `cursor.fetchall()`; otherwise the cursor.
            many: when true, use `executemany` instead of `execute`.
        """
        cursor = self._cursor()
        # None instead of a shared mutable default list.
        params = values if values is not None else ()
        if many:
            cursor.executemany(query, params)
        else:
            cursor.execute(query, params)
        if fetch:
            return cursor.fetchall()
        return cursor

    def _close(self):
        """Close the connection (if any) and forget the cached handles.

        Clearing `db` and `cursor` makes the next `_execute` reconnect instead
        of reusing a closed connection.
        """
        if self.db:
            self.db.close()
            self.db = None
            self.cursor = None
            print('Closed!')

    def _show_tables(self):
        """Return the table names reported by SHOW TABLES."""
        return [table[0] for table in self._execute(query = 'SHOW TABLES', fetch = True)]

    def _has_table(self, table_name):
        """Return True when `table_name` is one of the database's tables."""
        return table_name in self._show_tables()

    def _column_names(self, table_name):
        """Return the column names of `table_name` (first field of each DESCRIBE row)."""
        return [col[0] for col in self.describe(table_name)]

    def _insert(self, table_name, data: list):
        """Insert `data` (a list of dicts keyed by column name) into `table_name`.

        All rows are assumed to have the same keys as the first row.

        Returns:
            An 'Error! ...' string when the table or one of the columns is
            unknown; otherwise None (also for an empty `data`, which is a
            no-op). If the statement itself fails, the error is printed, the
            connection is closed and None is returned.
        """
        if not self._has_table(table_name):
            return f'Error! "{table_name}" table not found'
        if not data:
            # Nothing to insert; avoids indexing into an empty list below.
            return None

        # The method has to be *called* here; comparing against the bound
        # method object itself raises TypeError.
        known_columns = self._column_names(table_name)

        # assuming all data rows have the same columns
        column_names = list(data[0].keys())
        for col in column_names:
            if col not in known_columns:
                return f'Error! "{col}" column not found'

        column_list = ', '.join(self._quote(col) for col in column_names)
        placeholders = ', '.join(['%s'] * len(column_names))
        query = f'INSERT INTO {self._quote(table_name)} ({column_list}) VALUES ({placeholders})'
        values = [tuple(row[col] for col in column_names) for row in data]
        try:
            self._execute(query, values, many = True)
            return self.db.commit()
        except Exception as e:
            print(f'error here: {e}')
            self._close()

    def describe(self, table_name: str = ''):
        """Describe one table, or every table when `table_name` is empty.

        Returns:
            The DESCRIBE rows of `table_name`; without a table name, a list of
            `{table: rows}` dicts, which is also stored in `self.tables`.
        """
        if table_name:
            query = f'DESCRIBE {table_name}'
            return self._execute(query = query, fetch = True)
        # Rebuild the list on every call so repeated calls do not accumulate
        # duplicate entries.
        self.tables = [{table: self.describe(table)} for table in self._show_tables()]
        return self.tables

    def _read(self, table_name: str, search: dict, result_columns: bool = False):
        """Select the rows of `table_name` that match every condition in `search`.

        Args:
            table_name: table to read from.
            search: `{column: {'operator': op, 'value': value}}`; conditions
                are combined with AND. An empty mapping selects all rows.
            result_columns: when true, return each row as a
                `{column name: value}` dict instead of a tuple.

        Returns:
            The matching rows, or an 'Error! ...' string for an unknown table,
            unknown column or unsupported operator.
        """
        if not self._has_table(table_name):
            return f'Error! "{table_name}" table not found'

        column_names = self._column_names(table_name)
        conditions = []
        params = []
        for col, condition in (search or {}).items():
            if col not in column_names:
                return f'Error! "{col}" column not found'
            operator = ' '.join(str(condition['operator']).split()).upper()
            if operator not in self._OPERATORS:
                return f'Error! "{condition["operator"]}" operator not supported'
            # Values are bound as parameters, so strings are quoted/escaped by
            # the driver instead of being pasted into the SQL text.
            conditions.append(f'{self._quote(col)} {operator} %s')
            params.append(condition['value'])

        query = f'SELECT * FROM {self._quote(table_name)}'
        if conditions:
            query += ' WHERE ' + ' AND '.join(conditions)

        server_response = self._execute(query = query, values = tuple(params), fetch = True)
        if result_columns:
            return [dict(zip(column_names, row)) for row in server_response]
        return server_response

In [74]:
# Wrapper instance for the `scopus` database, using the credentials in `client`.
d = Database(
    config=client,
    db_name='scopus',
)

In [76]:
# Example query: every `subject` row whose asjc_code is greater than 12,
# returned as plain tuples.
# NOTE: the commented-out call below uses `insert_first`, which the Database
# class does not define (its insert method is `_insert`).
# d.insert_first('subject', [{'asjc_code': 12, 'top': 'hi', 'middle': 'hii', 'low': 'hiii'},{'asjc_code': 15, 'top': 'bye', 'middle': 'byee', 'low': 'byeee'}])
d._read(
    'subject',
    {'asjc_code': {'value': '12', 'operator': '>'}},
    result_columns=False,
)
# print(d.describe('subject'))
# d._close()

[(339,
  13,
  'bye',
  'byee',
  'byeee',
  datetime.datetime(2019, 6, 3, 18, 52, 47),
  datetime.datetime(2019, 6, 3, 18, 52, 47)),
 (342,
  14,
  'bye',
  'byee',
  'byeee',
  datetime.datetime(2019, 6, 3, 19, 5, 57),
  datetime.datetime(2019, 6, 3, 19, 5, 57)),
 (344,
  15,
  'bye',
  'byee',
  'byeee',
  datetime.datetime(2019, 6, 3, 19, 12),
  datetime.datetime(2019, 6, 3, 19, 12))]

In [39]:
# Scratch check of how a search dict is rendered into the text of a WHERE
# clause: one "<column> <operator> <value>" fragment per key, joined with AND.
a = {
    'asjc_code': {'value': 14, 'operator': '='},
    'top': {'value': 'bye', 'operator': '<='},
}
clauses = []
for column, condition in a.items():
    clauses.append(f'{column} {condition["operator"]} {condition["value"]}')
b = ' AND '.join(clauses)
b

'asjc_code = 14'