<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Inititial-Record-Ingestion" data-toc-modified-id="Inititial-Record-Ingestion-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Inititial Record Ingestion</a></span><ul class="toc-item"><li><span><a href="#Connect-To-And-Re-Create-DB" data-toc-modified-id="Connect-To-And-Re-Create-DB-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Connect To And Re-Create DB</a></span></li><li><span><a href="#Read-Back-Up-Files" data-toc-modified-id="Read-Back-Up-Files-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Read Back-Up Files</a></span></li><li><span><a href="#Write-Records-To-DB" data-toc-modified-id="Write-Records-To-DB-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Write Records To DB</a></span></li><li><span><a href="#Check-The-Results" data-toc-modified-id="Check-The-Results-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Check The Results</a></span></li></ul></li><li><span><a href="#Initial-Credit-Addition" data-toc-modified-id="Initial-Credit-Addition-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Initial Credit Addition</a></span></li><li><span><a href="#Bring-Import-Functions-together" data-toc-modified-id="Bring-Import-Functions-together-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Bring Import Functions together</a></span></li></ul></div>

In [1]:
import configparser
import datetime as dt
import sys
import collections
from pathlib import Path
from typing import Union, Optional, List, Tuple

import codebook.EDA as EDA
import codebook.clean as clean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
from sqlalchemy import func

In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's own source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render figures inline and apply the 'raph-base' style sheet.
# NOTE(review): 'raph-base' looks like a locally installed custom style —
# confirm it is available on the machine running this notebook.
%matplotlib inline
plt.style.use('raph-base')

# Display the value of every top-level expression in a cell, not only the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Pandas display: thousands separator with two decimals, up to 30 columns,
# no wrapping of wide frames, and long cell contents shown up to 800 chars.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

# Seed NumPy's global random number generator.
np.random.seed(666)

In [3]:
sys.path.append(str(Path.cwd().parent))

from src.db_declaration2 import Base, Artist, CreditTrx, Genre, Label, Rating, Record
from src import db_functions
from src import db_connect

In [4]:
print(sys.executable)
print(sys.version)
print(f'sqlalchemy {sqlalchemy.__version__}')

C:\Users\r2d4\miniconda3\envs\py3\python.exe
3.8.3 (default, May 19 2020, 06:50:17) [MSC v.1916 64 bit (AMD64)]
sqlalchemy 1.3.17


## Initial Record Ingestion

In [5]:
CONFIG_PATH = Path.cwd().parent / "config.yaml"

### Connect To And Re-Create DB

In [50]:
# NOTE(review): absolute, machine-specific path to the production database.
# The "Re-Create" cell that follows runs against whatever this engine points
# to, so double-check the target before executing the notebook.
prod_path = r"sqlite:///C:\Users\r2d4\OneDrive\code\projects\20-02_disco\db_aka_discobase\DiscoBase2.db"
# dev_path = r"sqlite:///C:\Users\r2d4\OneDrive\code\projects\20-02_disco\dev\DeafDiscoBase.db"


# Build the engine for the chosen database and open a session on it via the
# project's db_connect helper.
engine = sqlalchemy.create_engine(prod_path)
session = db_connect.create_session(engine)

# Check
engine
session

Engine(sqlite:///C:\Users\r2d4\OneDrive\code\projects\20-02_disco\db_aka_discobase\DiscoBase2.db)

<sqlalchemy.orm.session.Session at 0x21141686b50>

<div class="alert alert-block alert-danger">
Attention: This will delete all entries in the DB!
</div>

In [7]:
# Re-Create
db_functions.create_DB_anew(engine, Base)

### Read Back-Up Files

In [8]:
def load_backup_data_from_parquet(
    config_path: Union[Path, str],
    record_data_name: Union[Path, str],
    trx_data_name: Union[Path, str]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the record and credit-trx back-up parquet files into DataFrames.

    The folder holding the files is ``Path.cwd().parent / REL_PATH``, where
    ``REL_PATH`` is taken from what ``db_connect.read_yaml(config_path,
    "BACK_UP")`` returns.

    Args:
        config_path: Location of the config file handed to ``read_yaml``.
        record_data_name: File name of the record-related back-up.
        trx_data_name: File name of the credit_trx back-up.

    Returns:
        A ``(record_data, trx_data)`` tuple of DataFrames, in that order.
    """
    rel_path = db_connect.read_yaml(config_path, "BACK_UP")["REL_PATH"]
    backup_dir = Path.cwd().parent / rel_path  # TODO CHANGE FOR PROD

    # The generator is consumed left to right, so the record file is read
    # first and the trx file second.
    record_df, trx_df = (
        pd.read_parquet(backup_dir / file_name)
        for file_name in (record_data_name, trx_data_name)
    )
    return record_df, trx_df

In [42]:
record_data, trx_data = load_backup_data_from_parquet(
    CONFIG_PATH, 
    "record_data_2021-02-14-15-58-43.parquet",
    "trx_data_2021-02-14-15-58-44.parquet"
)

In [43]:
record_data.info()
record_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308 entries, 1 to 308
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   artist          308 non-null    object        
 1   artist_country  13 non-null     object        
 2   title           308 non-null    object        
 3   genre           308 non-null    object        
 4   label           308 non-null    object        
 5   year            308 non-null    int64         
 6   record_format   308 non-null    object        
 7   vinyl_color     212 non-null    object        
 8   lim_edition     222 non-null    object        
 9   number          24 non-null     object        
 10  remarks         78 non-null     object        
 11  price           308 non-null    float64       
 12  purchase_date   308 non-null    datetime64[ns]
 13  rating          103 non-null    float64       
 14  is_digitized    308 non-null    bool          
 15  is_act

Unnamed: 0_level_0,artist,artist_country,title,genre,label,year,record_format,vinyl_color,lim_edition,number,remarks,price,purchase_date,rating,is_digitized,is_active
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Dismember,,Pieces,Death Metal,[NA],1992,"12""",,,,,35.0,1992-01-01,,False,True
2,Dismember,,Skin Her Alive,Death Metal,[NA],1991,"7"" Pic",,,,,15.0,1992-01-01,,False,True
3,Dismember,,Like An Everflowing Stream,Death Metal,[NA],1991,LP,,,,,155.0,1992-01-01,,False,True
4,Dismember,,Where Ironcrosses Flow,Death Metal,[NA],2004,LP,,,,,15.0,1992-01-01,,True,True
5,Merciless,,The Awakening,Death Metal,[NA],1989,LP,,,,"New, Wert geschätzt / 1. der beiden Originalpressungen auf DSP",250.0,1992-01-01,,True,True


In [11]:
EDA.display_nan(record_data)

Unnamed: 0,total,prop,dtype
artist_country,295,95.8%,object
number,284,92.2%,object
remarks,230,74.7%,object
rating,205,66.6%,float64
vinyl_color,96,31.2%,object
lim_edition,86,27.9%,object


In [12]:
# splits_plus_one = record_data[record_data["artist"].str.contains("/")].index
# splits_plus_one

In [13]:
# TEMPORARY: Bring Artists To List

record_data["artist_country"] = record_data["artist_country"].fillna("NA")

def split_strings(x):
    """Split a string on " / " into a list of parts.

    Anything that is not a ``str`` (e.g. ``None`` or an already split list)
    is handed back unchanged.
    """
    return x.split(" / ") if isinstance(x, str) else x

record_data.loc[288, "artist_country"] = "USA / Australia"
record_data["artist"] = record_data["artist"].apply(split_strings)
record_data["artist_country"] = record_data["artist_country"].apply(split_strings)

# record_data[record_data.index.isin(splits_plus_one)]

In [14]:
record_data.to_dict("records")[0]
type(record_data.iloc[0,0][0])

{'artist': ['Dismember'],
 'artist_country': ['NA'],
 'title': 'Pieces',
 'genre': 'Death Metal',
 'label': array(['NA'], dtype=object),
 'year': 1992,
 'record_format': '12"',
 'vinyl_color': None,
 'lim_edition': None,
 'number': None,
 'remarks': None,
 'price': 35.0,
 'purchase_date': Timestamp('1992-01-01 00:00:00'),
 'rating': nan,
 'is_digitized': False,
 'is_active': True}

str

**TEMPORARY TO DOS:**

- Fill the NaNs in the `rating` column, because it will become a foreign key in the future

In [15]:
# todo

In [16]:
# SIMULATE A TRX

record_data["credit_value"] = 0
record_data["trx_type"] = "Initial Load"

### Write Records To DB

**TODO:** Can the row-by-row insert loop below be replaced with `DataFrame.apply()`?

In [17]:
def insert_df_with_sqlalchemy_orm(session, df):
    """Insert every row of ``df`` as a new record via the project's ORM helper.

    Each row is converted to a dict and passed to
    ``db_functions.add_new_record`` together with ``session``. Two placeholder
    transaction fields are attached to every row first: ``credit_value`` = 0
    and ``trx_type`` = "Initial Load".

    Args:
        session: Session object forwarded to ``db_functions.add_new_record``.
        df: DataFrame whose rows are to be inserted. It is not modified.
    """
    # Add bogus trx values to the frame that was actually passed in. The
    # previous version wrote them to the global ``record_data`` instead, so
    # any other frame would have been inserted without these columns.
    # ``assign`` returns a new frame, leaving the caller's ``df`` untouched.
    records_with_trx = df.assign(credit_value=0, trx_type="Initial Load")

    for record in records_with_trx.to_dict("records"):
        db_functions.add_new_record(session, record)


In [18]:
insert_df_with_sqlalchemy_orm(session, record_data)

  util.warn(


### Check The Results

In [19]:
# Total Number Of Records In The DB
session.query(Record).count()

308

In [20]:
session.query(Artist).count()
session.query(Artist).all()[-5:]

199

[<Artist(artist_id=195, artist_name=Celestial Sanctuary)>,
 <Artist(artist_id=196, artist_name=200 STAB WOUNDS)>,
 <Artist(artist_id=197, artist_name=Unurnment)>,
 <Artist(artist_id=198, artist_name=Infestment)>,
 <Artist(artist_id=199, artist_name=Heads For The Dead)>]

In [21]:
session.query(Genre).count()
session.query(Genre).all()

9

[<Genre(genre_id=1, genre_name=Death Metal)>,
 <Genre(genre_id=2, genre_name=Thrash Metal)>,
 <Genre(genre_id=3, genre_name=Black Metal)>,
 <Genre(genre_id=4, genre_name=Speed Metal)>,
 <Genre(genre_id=5, genre_name=Hardcore)>,
 <Genre(genre_id=6, genre_name=Punk)>,
 <Genre(genre_id=7, genre_name=Crust)>,
 <Genre(genre_id=8, genre_name=Crossover)>,
 <Genre(genre_id=9, genre_name=Grindcore)>]

In [22]:
session.query(CreditTrx).count()
session.query(func.sum(CreditTrx.credit_value)).all()[0][0]
session.query(func.min(CreditTrx.credit_value)).all()[0][0]
session.query(func.avg(CreditTrx.credit_value)).all()[0][0]

308

0.0

0.0

0.0

## Initial Credit Addition

In [23]:
# Add the initial "Addition" trx; check the date and credit_saldo

fake_trx = CreditTrx(
    credit_trx_date=dt.datetime(year=2020, month=12, day=30),
    credit_trx_type="Addition",
    credit_value=1,
    credit_saldo=-1,
    # NOTE(review): NaN is used for "no record"; the committed row is shown
    # with record_id=None in the next cell's output — consider passing None.
    record_id=np.nan
)
session.add(fake_trx)

In [24]:
# Check
session.commit()
session.query(CreditTrx).all()[-3:]

[<CreditTrx(credit_trx_id=307, credit_trx_date=2021-02-05, credit_trx_type=Initial Load, credit_value=0.0, credit_saldo=0.0, record_id=307)>,
 <CreditTrx(credit_trx_id=308, credit_trx_date=2021-02-08, credit_trx_type=Initial Load, credit_value=0.0, credit_saldo=0.0, record_id=308)>,
 <CreditTrx(credit_trx_id=309, credit_trx_date=2020-12-30, credit_trx_type=Addition, credit_value=1.0, credit_saldo=-1.0, record_id=None)>]

In [25]:
db_functions.add_regular_credits(session)

Creating 'Addition' Trx for: 2021-01-09
Creating 'Addition' Trx for: 2021-01-19
Creating 'Addition' Trx for: 2021-01-29
Creating 'Addition' Trx for: 2021-02-08


In [26]:
session.query(CreditTrx).all()[-3:]

[<CreditTrx(credit_trx_id=311, credit_trx_date=2021-01-19, credit_trx_type=Addition, credit_value=1.0, credit_saldo=1.0, record_id=None)>,
 <CreditTrx(credit_trx_id=312, credit_trx_date=2021-01-29, credit_trx_type=Addition, credit_value=1.0, credit_saldo=2.0, record_id=None)>,
 <CreditTrx(credit_trx_id=313, credit_trx_date=2021-02-08, credit_trx_type=Addition, credit_value=1.0, credit_saldo=3.0, record_id=None)>]

In [27]:
# session.rollback()

In [31]:
session.query(CreditTrx).count()
session.query(CreditTrx).delete()
session.query(CreditTrx).count()

313

313

0

In [57]:
    # Drop only the CreditTrx table and create it again empty; every row
    # currently stored in it is lost.
    Base.metadata.drop_all(engine, tables=[CreditTrx.__table__])
    Base.metadata.create_all(engine, tables=[CreditTrx.__table__])

In [58]:
trx_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 1 to 317
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   credit_trx_date  317 non-null    datetime64[ns]
 1   credit_trx_type  317 non-null    object        
 2   credit_value     317 non-null    float64       
 3   credit_saldo     317 non-null    float64       
 4   record_id        312 non-null    float64       
 5   created_at       317 non-null    datetime64[ns]
 6   updated_at       4 non-null      datetime64[ns]
dtypes: datetime64[ns](3), float64(3), object(1)
memory usage: 19.8+ KB


In [59]:
def insert_df_with_sqlalchemy_core(df, table_class, engine):
    """Bulk-insert the rows of ``df`` into the table behind ``table_class``.

    All rows are sent in a single ``insert()`` execution inside one
    transaction, and the elapsed time is printed afterwards.

    Args:
        df: DataFrame whose columns correspond to columns of the target table.
        table_class: Mapped class exposing the target table as ``__table__``.
        engine: SQLAlchemy engine to run the insert on.
    """
    # Imported locally so the function does not rely on another cell having
    # executed ``import time`` before it is called.
    import time

    start = time.time()
    records = df.to_dict("records")
    # Skip the statement for an empty frame.
    # NOTE(review): an empty parameter list is presumably executed as a single
    # parameterless INSERT rather than as "no rows" — confirm.
    if records:
        # ``engine.begin()`` commits on success and rolls back on error; it
        # replaces the legacy connectionless ``engine.execute()``.
        with engine.begin() as connection:
            connection.execute(table_class.__table__.insert(), records)
    end = time.time()
    print(
        f"SQLAlchemy Core: Insert {len(df)} records in {end - start:.2f} seconds"
    )

In [60]:
import time

# Drop the last two columns before inserting — per the trx_data.info() output
# above these are created_at and updated_at.
insert_df_with_sqlalchemy_core(trx_data.iloc[:, :-2], CreditTrx, engine)

SQLAlchemy Core: Insert 317 records in 0.02 seconds


In [61]:
session.close()

In [64]:
pd.read_sql("credit_trx", engine).tail(15)

Unnamed: 0,credit_trx_id,credit_trx_date,credit_trx_type,credit_value,credit_saldo,record_id,created_at,updated_at
302,303,2021-01-15,Purchase,-1.0,-4.0,300.0,2021-02-16 20:10:35,NaT
303,304,2021-01-19,Addition,1.0,-3.0,,2021-02-16 20:10:35,NaT
304,305,2021-01-20,Purchase,-1.0,-4.0,301.0,2021-02-16 20:10:35,NaT
305,306,2021-01-20,Purchase,-1.0,-5.0,302.0,2021-02-16 20:10:35,NaT
306,307,2021-01-22,Remove,1.0,-4.0,127.0,2021-02-16 20:10:35,NaT
307,308,2021-01-22,Remove,0.0,-4.0,123.0,2021-02-16 20:10:35,NaT
308,309,2021-01-22,Remove,1.0,-3.0,121.0,2021-02-16 20:10:35,NaT
309,310,2021-01-25,Purchase,-1.0,-4.0,303.0,2021-02-16 20:10:35,NaT
310,311,2021-01-25,Purchase,0.0,-4.0,304.0,2021-02-16 20:10:35,NaT
311,312,2021-01-29,Addition,1.0,-3.0,,2021-02-16 20:10:35,NaT


## Bring Import Functions together

In [66]:
def _insert_record_data_with_sqlalchemy_orm(
    session: sqlalchemy.orm.session.Session,
    df: pd.DataFrame = record_data
):
    """Insert every row of ``df`` as a new record via the project's ORM helper.

    Each row is converted to a dict and passed to
    ``db_functions.add_new_record`` together with ``session``. Two placeholder
    transaction fields are attached to every row first: ``credit_value`` = 0
    and ``trx_type`` = "Initial Load".

    Args:
        session: Session forwarded to ``db_functions.add_new_record``.
        df: DataFrame whose rows are to be inserted; it is not modified.
            The default is the ``record_data`` object that existed when this
            function was defined, so re-run the defining cell after
            re-loading the data.
    """
    # Add bogus trx values to the frame that was actually passed in. The
    # previous version wrote them to the global ``record_data`` regardless of
    # ``df``, so any other frame would have been inserted without them.
    records_with_trx = df.assign(credit_value=0, trx_type="Initial Load")

    for record in records_with_trx.to_dict("records"):
        db_functions.add_new_record(session, record)
        

def _truncate_credit_trx_table(
    engine: sqlalchemy.engine.Engine,
    table_class: sqlalchemy.ext.declarative.api.DeclarativeMeta = CreditTrx
):
    """Empty a table by dropping it and creating it again.

    Only the table behind ``table_class`` is touched; all rows stored in it
    are lost.

    Args:
        engine: Engine connected to the database holding the table.
        table_class: Mapped class exposing the table as ``__table__``
            (defaults to ``CreditTrx``).
    """
    metadata = Base.metadata
    # Drop first, then re-create the same single table from its definition.
    metadata.drop_all(engine, tables=[table_class.__table__])
    metadata.create_all(engine, tables=[table_class.__table__])
    

def _insert_trx_data_with_sqlalchemy_core(
    engine: sqlalchemy.engine.Engine,
    df: pd.DataFrame = trx_data,
    table_class: sqlalchemy.ext.declarative.api.DeclarativeMeta = CreditTrx
):
    """Bulk-insert the rows of ``df`` into the table behind ``table_class``.

    All rows are sent in a single ``insert()`` execution inside one
    transaction.

    Args:
        engine: Engine to run the insert on.
        df: DataFrame whose columns correspond to columns of the target
            table. The default is the ``trx_data`` object that existed when
            this function was defined.
            NOTE(review): the earlier ad-hoc call passed
            ``trx_data.iloc[:, :-2]``; the default passes every column —
            confirm that is intended.
        table_class: Mapped class exposing the target table as ``__table__``
            (defaults to ``CreditTrx``).
    """
    records = df.to_dict("records")
    # Nothing to insert for an empty frame.
    # NOTE(review): an empty parameter list is presumably executed as a single
    # parameterless INSERT rather than as "no rows" — confirm.
    if not records:
        return

    # ``engine.begin()`` commits on success and rolls back on error; it
    # replaces the legacy connectionless ``engine.execute()``.
    with engine.begin() as connection:
        connection.execute(table_class.__table__.insert(), records)


In [65]:
type(CreditTrx)

sqlalchemy.ext.declarative.api.DeclarativeMeta

In [67]:
session.close()