# DAE Thesis Topic Database crawler

In [95]:
import numpy
import matplotlib.pyplot as plt
import json
import sys
import os
from os.path import exists
from pathlib import Path


def read_db(path='.'):
    """Collect every JSON file below ``path`` into one list of entries.

    The folder is searched recursively. Each entry consists of the content
    of one JSON file, extended by a ``'file'`` key holding the path of that
    JSON file.

    :param path: folder in which to look for JSON files, defaults to '.'
    :type path: str
    :return: database of all thesis topics, one dict per JSON file
    :rtype: list
    """
    # the JSON content is unpacked after the 'file' key, so a 'file' key
    # stored inside the JSON itself would take precedence over the path
    return [
        {'file': topic_path, **json.loads(topic_path.read_bytes())}
        for topic_path in Path(path).rglob('*.json')
    ]


def check_db(db):
    """Check that every entry has a title, a complete date and a PDF file.

    An entry is valid if it contains the fields 'title', 'year', 'month' and
    'day' and if a PDF file exists next to its JSON file, i.e. a file with
    the same name but the suffix '.pdf'. The name of every PDF file that is
    looked up is printed.

    :param db: list extracted from the JSON files; the 'file' field of each
        entry must be a path object pointing to the JSON file
    :type db: list
    :return: False as soon as one entry lacks a required field or its PDF
        file is nonexisting (or has the wrong name); True otherwise
    :rtype: bool
    """
    required_fields = ('title', 'year', 'month', 'day')
    for entry in db:
        if not all(field in entry for field in required_fields):
            return False
        # the PDF shares the name of the JSON file, only the suffix differs
        fname = entry['file'].with_suffix('.pdf')
        print(fname)
        if not exists(fname):
            return False
    return True


def filter_db(db, field, value, exact_match=False):
    """Filter the database by the value of a certain field.

    Entries that do not contain ``field`` cannot match and are skipped.

    :param db: database to filter, a list of dict entries
    :type db: list
    :param field: name of the field to apply the filter to
    :type field: str
    :param value: value to check
    :type value: misc
    :param exact_match: check for 100% matching value (if True);
        else: check if ``value`` is contained in the field, which is only
        done if both are strings (all other types are compared for
        equality), defaults to False
    :type exact_match: bool, optional
    :return: indices of all entries of ``db`` matching the filter
    :rtype: list
    """
    idx = []
    for i, db_element in enumerate(db):
        if field not in db_element:
            continue
        entry_value = db_element[field]
        # a substring search is only meaningful between two strings; any
        # other combination is compared for equality instead of raising
        substring_search = (
            not exact_match
            and isinstance(entry_value, str)
            and isinstance(value, str)
        )
        if substring_search:
            sc = value in entry_value
        else:
            sc = entry_value == value
        if sc:
            idx.append(i)

    return idx


def _date_key(date):
    """Map a mapping with 'year', 'month' and 'day' to a sortable integer."""
    # 31 days per month and 372 (= 12 * 31) days per year leave gaps between
    # the keys but keep the chronological order of valid dates
    return date['year'] * 372 + date['month'] * 31 + date['day']


def filter_db_date(db, start_date=None, end_date=None):
    """Filter the database by a date range (both bounds inclusive).

    :param db: database to filter; every entry must contain the fields
        'year', 'month' and 'day'
    :type db: list
    :param start_date: first date of the range given as dict with the keys
        'year', 'month' and 'day', defaults to 2020-01-01
    :type start_date: dict, optional
    :param end_date: last date of the range given as dict with the keys
        'year', 'month' and 'day', defaults to 2999-12-31
    :type end_date: dict, optional
    :return: indices of all entries of ``db`` dated within the range
    :rtype: list
    """
    # the defaults are created per call instead of sharing one mutable dict
    # between all calls
    if start_date is None:
        start_date = {'year': 2020, 'month': 1, 'day': 1}
    if end_date is None:
        end_date = {'year': 2999, 'month': 12, 'day': 31}

    start = _date_key(start_date)
    end = _date_key(end_date)

    return [
        i for i, db_element in enumerate(db)
        if start <= _date_key(db_element) <= end
    ]

In [98]:
# Demo 1: load every topic found below the folder "./current" into a
# database, validate it and list the entries whose e-mail contains "alameddin"
db = read_db(path='./current')
check_db(db)
filter_db(db, field='email', value='alameddin', exact_match=False)
# Filtering by date works alike, e.g.:
# filter_db_date(db, start_date={'year': 2020, 'month': 1, 'day': 1}, end_date={'year': 2999, 'month': 12, 'day': 31})

current/SA_AIF_Indentation_LS-Dyna/SA_MSc_indentation_test_ff.pdf


[0]