In [1]:
import configparser
import datetime as dt
import sys
import collections
from pathlib import Path
from typing import Union, Optional, Iterable, List, Tuple

import codebook.EDA as EDA
import codebook.clean as clean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
from sqlalchemy import func, distinct

In [2]:
# Notebook-wide configuration: module reloading, plotting, display options, RNG seed.
%load_ext autoreload
%autoreload 2

%matplotlib inline
# NOTE(review): 'raph-base' is not one of matplotlib's bundled styles —
# presumably a user-installed style sheet; confirm it exists on the target machine.
plt.style.use('raph-base')

from IPython.core.interactiveshell import InteractiveShell
# Display every top-level expression of a cell, not only the last one
# (cells below rely on this, e.g. the `engine` / `session` check).
InteractiveShell.ast_node_interactivity = 'all'

# Pandas display: floats with thousands separator and two decimals,
# up to 30 columns, no line-wrapping of wide frames, long cell contents.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

# Fixed seed for NumPy's global random number generator.
np.random.seed(666)

In [3]:
sys.path.append(str(Path.cwd().parent))

from src.db_declaration2 import (
    Base, Artist, CreditTrx, Genre, Label, Record, RecordFormat, 
    ArtistRecordLink, ArtistGenreLink, GenreLabelLink, LabelRecordLink
)
from src import db_functions
from src import db_connect

CONFIG_PATH = Path.cwd().parent / "config.yaml"

In [4]:
print(sys.executable)
print(sys.version)
print(f'sqlalchemy {sqlalchemy.__version__}')

C:\Users\r2d4\miniconda3\envs\py3\python.exe
3.8.3 (default, May 19 2020, 06:50:17) [MSC v.1916 64 bit (AMD64)]
sqlalchemy 1.3.17


## Connect To And (Re-)Create DB

Because we connect from the `dev` subfolder, I have to work with the sqlalchemy `create_engine` function directly.

In [5]:
# Connection parameters: only the database file name, given as a relative path.
db_params = {"REL_PATH": "DeafDiscoBase.db"}
# Build the engine and a session bound to it via the project's helper module.
engine = db_connect.create_engine(db_params)
session = db_connect.create_session(engine)

# Check: display both objects to confirm which database file is in use.
engine
session

Engine(sqlite:///C:\Users\r2d4\OneDrive\code\projects\20-02_disco\dev\DeafDiscoBase.db)

<sqlalchemy.orm.session.Session at 0x22b5c8e5610>

In [6]:
db_functions.create_DB_anew(engine, Base)

In [7]:
# COPIED FROM PROD_EXPORT

def load_backup_data_from_parquet(
    config_path: Union[Path, str],
    record_data_name: Union[Path, str],
    trx_data_name: Union[Path, str]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the record back-up and the credit_trx back-up into DataFrames.

    The folder holding both files is taken from the "REL_PATH" entry of
    the "BACK_UP" section of the YAML config and is resolved relative to
    the parent of the current working directory.

    Returns the tuple ``(record_data, trx_data)``.
    """
    backup_cfg = db_connect.read_yaml(config_path, "BACK_UP")
    backup_dir = Path.cwd().parent / backup_cfg["REL_PATH"]  # TODO CHANGE FOR PROD

    # Same read order as the file names are passed in: records first, then trx.
    record_frame = pd.read_parquet(backup_dir / record_data_name)
    trx_frame = pd.read_parquet(backup_dir / trx_data_name)

    return record_frame, trx_frame

In [8]:
record_data, trx_data = load_backup_data_from_parquet(
    CONFIG_PATH, 
    "record_data_2021-02-14-15-58-43.parquet",
    "trx_data_2021-02-14-15-58-44.parquet"
)


# TEMPORARY: Bring Artists To List

record_data["artist_country"] = record_data["artist_country"].fillna("NA")

def split_strings(x):
    """Turn a " / "-separated string into a list of its parts.

    Anything that is not a ``str`` is handed back unchanged.
    """
    return x.split(" / ") if isinstance(x, str) else x

record_data["artist"] = record_data["artist"].apply(split_strings)
record_data["artist_country"] = record_data["artist_country"].apply(split_strings)


# SIMULATE A TRX

record_data["credit_value"] = 0
record_data["trx_type"] = "Initial Load"

In [9]:
record_df = record_data[record_data.index.isin([1, 301])]

In [10]:
record_df.info()
record_df.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 1 to 301
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   artist          2 non-null      object        
 1   artist_country  2 non-null      object        
 2   title           2 non-null      object        
 3   genre           2 non-null      object        
 4   label           2 non-null      object        
 5   year            2 non-null      int64         
 6   record_format   2 non-null      object        
 7   vinyl_color     1 non-null      object        
 8   lim_edition     1 non-null      object        
 9   number          0 non-null      object        
 10  remarks         0 non-null      object        
 11  price           2 non-null      float64       
 12  purchase_date   2 non-null      datetime64[ns]
 13  rating          0 non-null      float64       
 14  is_digitized    2 non-null      bool          
 15  is_activ

Unnamed: 0_level_0,artist,artist_country,title,genre,label,year,record_format,vinyl_color,lim_edition,number,remarks,price,purchase_date,rating,is_digitized,is_active,credit_value,trx_type
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,[Dismember],[NA],Pieces,Death Metal,[NA],1992,"12""",,,,,35.0,1992-01-01,,False,True,0,Initial Load
301,"[Coffins, Depression]","[Japan, Germany]",Split,Death Metal,[Hells Headbangers],2021,LP,white with black splatter,100.0,,,20.0,2021-01-20,,True,True,0,Initial Load


In [11]:
def insert_df_with_sqlalchemy_orm(session, df):
    """Add every row of `df` to the database, one record at a time.

    Each row is converted to a plain dict, echoed to stdout and then
    passed to `db_functions.add_new_record` together with `session`.
    """
    row_dicts = df.to_dict("records")
    for row in row_dicts:
        print(row)
        db_functions.add_new_record(session, row)

In [12]:
insert_df_with_sqlalchemy_orm(session, record_df)

{'artist': ['Dismember'], 'artist_country': ['NA'], 'title': 'Pieces', 'genre': 'Death Metal', 'label': array(['NA'], dtype=object), 'year': 1992, 'record_format': '12"', 'vinyl_color': None, 'lim_edition': None, 'number': None, 'remarks': None, 'price': 35.0, 'purchase_date': Timestamp('1992-01-01 00:00:00'), 'rating': nan, 'is_digitized': False, 'is_active': True, 'credit_value': 0, 'trx_type': 'Initial Load'}
{'artist': ['Coffins', 'Depression'], 'artist_country': ['Japan', 'Germany'], 'title': 'Split', 'genre': 'Death Metal', 'label': array(['Hells Headbangers'], dtype=object), 'year': 2021, 'record_format': 'LP', 'vinyl_color': 'white with black splatter', 'lim_edition': '100', 'number': None, 'remarks': None, 'price': 20.0, 'purchase_date': Timestamp('2021-01-20 00:00:00'), 'rating': nan, 'is_digitized': True, 'is_active': True, 'credit_value': 0, 'trx_type': 'Initial Load'}


  util.warn(


### Insertion of 2 Credit Addition Trx

In [14]:
# to_delete = session.query(CreditTrx).filter(CreditTrx.credit_trx_id == 8).one()
# session.delete(to_delete)
# session.commit()

In [15]:
# Initial trx, 11 days ago

addition_trx = CreditTrx(
    credit_trx_date=dt.datetime.today().date() - dt.timedelta(days=11),
    credit_trx_type="Addition",
    credit_value=1,
    credit_saldo=1,
    # An "Addition" is not tied to a record. Use None for the missing id:
    # np.nan is a float value, not a missing-value marker for a key column,
    # and only shows up as None in the query below as a side effect of how
    # the database driver stores NaN.
    record_id=None,
)
session.add(addition_trx)

# And a regular interval addition
db_functions.add_regular_credits(session)

session.commit()

Creating 'Addition' Trx for: 2021-02-15


In [16]:
session.query(CreditTrx).all()

[<CreditTrx(credit_trx_id=1, credit_trx_date=1992-01-01, credit_trx_type=Initial Load, credit_value=0.0, credit_saldo=0.0, record_id=1)>,
 <CreditTrx(credit_trx_id=2, credit_trx_date=2021-01-20, credit_trx_type=Initial Load, credit_value=0.0, credit_saldo=0.0, record_id=2)>,
 <CreditTrx(credit_trx_id=3, credit_trx_date=2021-02-05, credit_trx_type=Addition, credit_value=1.0, credit_saldo=1.0, record_id=None)>,
 <CreditTrx(credit_trx_id=4, credit_trx_date=2021-02-15, credit_trx_type=Addition, credit_value=1.0, credit_saldo=2.0, record_id=None)>]

### Insertion of new Record

In [17]:
# db_functions.add_new_record(session, test_record)

# assert session.query(Record).count() == 4

In [18]:
# session.query(CreditTrx).all()[-2:]

### Removal of Existing Record

Necessary cols: trx_type, credit_value, title, artist, date

In [19]:
# test_removal = {
#     "trx_type": "Remove",
#     "credit_value": 1,
#     "artist": "Emperor",
#     "title": "s/t",
# #     "year": 1993,
#     "removal_date": dt.datetime.today().date()
# }

In [20]:
# db_functions.set_record_to_inactive(session, test_removal)

In [21]:
# session.query(func.count(distinct(Record.active))).all()
# session.query(func.count(Record.record_id)).group_by(Record.active).all()

In [22]:
# session.query(CreditTrx).all()[-3:]

### Reactivation of inactive Record [OPEN]

In [23]:
# ATTENTION: It has to be possible to re-add inactive records (and to pay for it in credits)!

## Query DB

### Check Tables

In [24]:
pd.read_sql("records", engine)

Unnamed: 0,record_id,title,year,genre_id,format_id,vinyl_color,lim_edition,number,remarks,purchase_date,price,rating,is_digitized,is_active,created_at,updated_at
0,1,Pieces,1992,1,1,,,,,1992-01-01,35,,0,1,2021-02-16 16:28:17,NaT
1,2,Split,2021,1,2,white with black splatter,100.0,,,2021-01-20,20,,1,1,2021-02-16 16:28:17,NaT


In [25]:
pd.read_sql("artists", engine)

Unnamed: 0,artist_id,artist_name,artist_country,created_at,updated_at
0,1,Dismember,,2021-02-16 16:28:17,NaT
1,2,Coffins,Japan,2021-02-16 16:28:17,NaT
2,3,Depression,Germany,2021-02-16 16:28:17,NaT


In [26]:
pd.read_sql("artist_record_link", engine)

Unnamed: 0,artist_id,record_id
0,1,1
1,2,2
2,3,2


In [27]:
pd.read_sql("genres", engine)

Unnamed: 0,genre_id,genre_name,created_at,updated_at
0,1,Death Metal,2021-02-16 16:28:17,NaT


In [28]:
pd.read_sql("artist_genre_link", engine)

Unnamed: 0,artist_id,genre_id
0,1,1
1,2,1
2,3,1


In [29]:
pd.read_sql("formats", engine)

Unnamed: 0,format_id,format_name,created_at,updated_at
0,1,"12""",2021-02-16 16:28:17,NaT
1,2,LP,2021-02-16 16:28:17,NaT


In [30]:
pd.read_sql("labels", engine)

Unnamed: 0,label_id,label_name,created_at,updated_at
0,1,,2021-02-16 16:28:17,NaT
1,2,Hells Headbangers,2021-02-16 16:28:17,NaT


In [31]:
pd.read_sql("label_record_link", engine)

Unnamed: 0,label_id,record_id
0,1,1
1,2,2


In [32]:
pd.read_sql("artist_label_link", engine)

Unnamed: 0,artist_id,label_id
0,1,1
1,2,2
2,3,2


### Miscellaneous Queries

In [33]:
artist_test = ["Dismember"]
title_test = "Pieces"

def fetch_a_record_from_the_shelf(
    session: sqlalchemy.orm.session.Session, artist: Iterable[str], title: str
) -> Optional[Record]:
    """Look up a single record by title and artist name(s).

    The title is compared with a case-insensitive LIKE (``ilike``), so
    SQL wildcards in `title` are honoured. Every name in `artist` has to
    belong to one of the record's artists.

    Args:
        session: Open SQLAlchemy session.
        artist: One or more artist names. A bare string is treated as a
            single name, not as a sequence of characters.
        title: Record title to match.

    Returns:
        The matching `Record`, or None if no record is found.

    Raises:
        ValueError: If `artist` contains no names.
        sqlalchemy.orm.exc.MultipleResultsFound: If more than one record
            is matched (raised by ``Query.one_or_none``).
    """
    # Materialise the iterable: it may be a generator or a set (no indexing),
    # and a plain string must not be split into characters.
    artist_names = [artist] if isinstance(artist, str) else list(artist)
    if not artist_names:
        raise ValueError("At least one artist name is required.")

    # One EXISTS criterion per name; `filter` ANDs all criteria together.
    artist_filters = [
        Record.artists.any(artist_name=name) for name in artist_names
    ]
    return (
        session.query(Record)
        .filter(Record.title.ilike(title), *artist_filters)
        .one_or_none()
    )


In [34]:
r = fetch_a_record_from_the_shelf(session, artist_test, title_test)
print(r)

<Record(record_id=1, title=Pieces, artist=[<Artist(artist_id=1, artist_name=Dismember)>])>


In [None]:
# TODO: On updates, the old values in many-to-many relationships (e.g. labels) are not overwritten; new ones are added instead.
# I could write a correction function that enables updates on many-to-many entities (labels, genres, artists).

In [35]:
session.close()

In [36]:
# NOTE(review): `brk` is not defined anywhere in view; evaluating it raises the
# NameError shown below — presumably a deliberate stop so that "Run All" does
# not continue into the Export section. Confirm before removing.
brk

NameError: name 'brk' is not defined

# Export

In [None]:
for result in session.query(Record).all():
    print(result)

In [None]:
result = session.query(Record).filter(Record.record_id == 4).one_or_none()

# Check
result

In [None]:
# Note for the first export this is still many-to-one, will change to many-to-many in the future

# artist_tuple = (
#     session
#     .query(Artist.artist_name, Artist.artist_country)
#     .join(Record, Record.artist_id == Artist.artist_id)
#     .filter(Artist.artist_id == result.artist_id)
#     .first()
# )            

# # Check
# print(artist_tuple[0])
# print(artist_tuple[1])

result.artist.artist_name
result.artist.artist_country

In [None]:
# One-to-many

# record_format = (
#     session
#     .query(RecordFormat.format_name)
#     .join(Record, Record.format_id == RecordFormat.format_id)
#     .filter(RecordFormat.format_id == result.format_id)
#     .first()[0]
# )            


result.record_format.format_name

In [None]:
# many-to-many

# Check
[label.label_name for label in result.labels]
result.labels[0].label_name

In [None]:
# Flatten the single Record ORM object `result` into a plain dict with one
# key per export column.
# NOTE(review): the query section of this notebook filters on `Record.artists`
# (plural); `result.artist` may no longer exist on the model — verify.
record_data_dict = {
    "record_id": result.record_id,
    "artist": result.artist.artist_name,  # has to be adapted to many-to-many
    "artist_country": result.artist.artist_country,  # has to be adapted to many-to-many
    "title": result.title,
    "genre": result.genre.genre_name,
    "label": [label.label_name for label in result.labels],  # many-to-many: list of names
    "year": result.year,
    "record_format": result.record_format.format_name,
    "vinyl_color": result.vinyl_color,
    "lim_edition": result.lim_edition,
    "number": result.number,
    "remarks": result.remarks,
    "price": result.price,
    "digitized": result.digitized,
    "rating": result.rating,  # has to be adapted to one-to-many
    "is_active": result.active,
    "purchase_date": result.purchase_date,
}

In [None]:
record_data_dict

In [None]:
# Build one dict per record (ordered by record_id) and collect them in a
# DataFrame indexed by `record_id`.
# NOTE(review): `dict_list[0]` below raises IndexError when the records table
# is empty; and `result.artist` may not exist any more now that the query
# section uses `Record.artists` (plural) — verify against the model.
result_list = session.query(Record).order_by(Record.record_id).all()
dict_list = []
for result in result_list:
    record_data_dict = {
    "record_id": result.record_id,
    "artist": result.artist.artist_name,  # has to be adapted to many-to-many
    "artist_country": result.artist.artist_country,  # has to be adapted to many-to-many
    "title": result.title,
    "genre": result.genre.genre_name,
    "label": [label.label_name for label in result.labels],  # many-to-many: list of names
    "year": result.year,
    "record_format": result.record_format.format_name,
    "vinyl_color": result.vinyl_color,
    "lim_edition": result.lim_edition,
    "number": result.number,
    "remarks": result.remarks,
    "price": result.price,
    "digitized": result.digitized,
    "rating": result.rating,  # has to be adapted to one-to-many
    "is_active": result.active,
    "purchase_date": result.purchase_date,
    }
    dict_list.append(record_data_dict)

# Column order follows the key order of the first dict.
records_df = pd.DataFrame(dict_list, columns=dict_list[0].keys())
records_df.set_index('record_id', drop=True, inplace=True) 

# Sanity check: ids must be in ascending order AND free of duplicates.
assert records_df.index.is_monotonic_increasing & records_df.index.is_unique

In [None]:
records_df.head()

In [None]:
# Copy the whole `credit_trx` table and key it by its transaction id.
credit_trx_df = pd.read_sql("credit_trx", engine).set_index('credit_trx_id', drop=True)

# Sanity check: ids are in ascending order and occur only once.
assert credit_trx_df.index.is_monotonic_increasing and credit_trx_df.index.is_unique

In [None]:
credit_trx_df.head()

In [None]:
def export_db_data_to_2_parquet_files(session, engine, config_path):
    """Back up the database content as two parquet files.

    One file holds the record-related data (incl. information on artists,
    genres, labels), the other one is a copy of the `credit_trx` table.
    Together they allow the database to be repopulated after a complete
    reset.
    """
    # Both frames are built before anything is written to disk.
    named_frames = (
        _save_record_related_data_to_df(session),
        _save_credit_trx_table_to_df(engine),
    )
    for named_frame in named_frames:
        _save_df_to_parquet(named_frame, config_path)
    
    
# Export columns in output order; used so that an empty table still yields a
# correctly shaped (empty) DataFrame. Must match the keys built in
# `_record_to_export_dict`.
_RECORD_EXPORT_COLUMNS = [
    "record_id",
    "artist",
    "artist_country",
    "title",
    "genre",
    "label",
    "year",
    "record_format",
    "vinyl_color",
    "lim_edition",
    "number",
    "remarks",
    "price",
    "digitized",
    "rating",
    "is_active",
    "purchase_date",
]


def _record_to_export_dict(result) -> dict:
    """Flatten one `Record` ORM object into a dict with one key per export column."""
    # NOTE(review): other code in this notebook filters on `Record.artists`
    # (plural); `result.artist` may no longer exist on the model — verify.
    return {
        "record_id": result.record_id,
        "artist": result.artist.artist_name,  # TODO: has to be adapted to many-to-many
        "artist_country": result.artist.artist_country,  # TODO: has to be adapted to many-to-many
        "title": result.title,
        "genre": result.genre.genre_name,
        "label": [label.label_name for label in result.labels],
        "year": result.year,
        "record_format": result.record_format.format_name,
        "vinyl_color": result.vinyl_color,
        "lim_edition": result.lim_edition,
        "number": result.number,
        "remarks": result.remarks,
        "price": result.price,
        "digitized": result.digitized,
        "rating": result.rating,  # TODO: has to be adapted to one-to-many
        "is_active": result.active,
        "purchase_date": result.purchase_date,
    }


def _save_record_related_data_to_df(session) -> Tuple[str, pd.DataFrame]:
    """Save all record-related data to a Pandas DataFrame.

    Called within `export_db_data_to_2_parquet_files`.

    Args:
        session: Open SQLAlchemy session used to query all `Record` rows,
            ordered by `record_id`.

    Returns:
        A tuple of the dataframe name string ("record_data") and the
        dataframe, indexed by `record_id`. An empty table gives an empty
        dataframe with the export columns.

    Raises:
        AssertionError: If the `record_id` index is not monotonically
            increasing or contains duplicates.
    """
    result_list = session.query(Record).order_by(Record.record_id).all()
    dict_list = [_record_to_export_dict(result) for result in result_list]

    # Explicit column list instead of `dict_list[0].keys()`, which raised
    # IndexError when the table was empty.
    records_df = pd.DataFrame(dict_list, columns=_RECORD_EXPORT_COLUMNS)
    records_df = records_df.set_index('record_id', drop=True)
    df_name = "record_data"

    # Either violation alone is an error (the former `not a and not b` only
    # fired when both properties were violated at once).
    record_ids = records_df.index
    if not (record_ids.is_monotonic_increasing and record_ids.is_unique):
        raise AssertionError("record_ids are messed up, please check data.")

    return df_name, records_df

    
def _save_credit_trx_table_to_df(engine) -> Tuple[str, pd.DataFrame]:
    """Copy the `credit_trx` table to a Pandas DataFrame.

    Called within `export_db_data_to_2_parquet_files`.

    Args:
        engine: Database connectable handed to `pd.read_sql`.

    Returns:
        A tuple of the dataframe name string ("trx_data") and the
        dataframe, indexed by `credit_trx_id`.

    Raises:
        AssertionError: If the `credit_trx_id` index is not monotonically
            increasing or contains duplicates.
    """
    credit_trx_df = pd.read_sql("credit_trx", engine).set_index('credit_trx_id', drop=True)
    df_name = "trx_data"

    # Either violation alone is an error (the former `not a and not b` only
    # fired when both properties were violated at once).
    trx_ids = credit_trx_df.index
    if not (trx_ids.is_monotonic_increasing and trx_ids.is_unique):
        raise AssertionError("credit_trx_ids are messed up, please check data.")

    return df_name, credit_trx_df


def _save_df_to_parquet(df_tuple: Tuple[str, pd.DataFrame], config_path: Path):
    """Save a dataframe as a back-up parquet file.

    The file is written to ``<cwd>/<REL_PATH>/<YYYY-MM-DD>/`` where
    REL_PATH comes from the "BACK_UP" section of the YAML config; the
    directory is created if necessary. The file name is
    ``<df_name>_<YYYY-MM-DD-HH-MM-SS>.parquet``.
    Called within `export_db_data_to_2_parquet_files`.

    Args:
        df_tuple: Tuple of the dataframe name string and the dataframe.
        config_path: Path to the YAML config file to read "BACK_UP" from.
    """
    df_name, df = df_tuple

    # Read the clock once so the folder's date and the file's timestamp
    # cannot disagree when the call happens around midnight.
    now = dt.datetime.now()
    date_stamp = now.strftime("%Y-%m-%d")
    datetime_stamp = now.strftime("%Y-%m-%d-%H-%M-%S")

    # Use the config handed in by the caller; previously the parameter was
    # ignored in favour of the module-level CONFIG_PATH.
    back_up_params = db_connect.read_yaml(config_path, "BACK_UP")
    rel_path = back_up_params["REL_PATH"]
    # NOTE(review): `load_backup_data_from_parquet` resolves REL_PATH against
    # `Path.cwd().parent`, this function against `Path.cwd()` — confirm which
    # base directory is intended.
    target = Path.cwd() / rel_path / date_stamp
    target.mkdir(parents=True, exist_ok=True)

    full_path = target / f"{df_name}_{datetime_stamp}.parquet"
    df.to_parquet(full_path)