In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to local project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
import pandas as pd
#add parent dir to module search path to find readQueryCursor module
sys.path.append(os.path.dirname(os.getcwd()))
import module.readQueryCursor as rqc
import module.readStatistics as rs

# Database connection information.
# Each value can be overridden through an environment variable so that real
# credentials never have to be saved inside the notebook; the literals below
# are only the defaults used when the variable is not set.
dbname = os.environ.get('READ_DB_NAME', 'testdb')
user = os.environ.get('READ_DB_USER', 'postgres')
password = os.environ.get('READ_DB_PASSWORD', 'password')
# txt_ckn values the text query is restricted to; an empty list adds no restriction.
textInvInclusion = []
# txt_ckn values that are left out of the text query.
textInvExclusion = ['IliadRef','4test', '8test', 'Need CKN', 'to delete', 'test']

# Connection settings applied both to the query cursor configuration and to
# the ReadStatsHelper defaults.
connOverrides = {'database': dbname, 'user': user, 'password': password}

# Start from a copy of the module's connection configuration so the shared
# template itself is left untouched, then apply the settings above.
conf = rqc.readConnConfig.copy()
for confKey, confValue in connOverrides.items():
  conf[confKey] = confValue
# Build the Read Query Cursor from this configuration.
myRQC = rqc.ReadQueryCursor(conf)

# Set the same values as default connection parameters for ReadStatsHelper,
# then build the statistics helper on top of the cursor.
for paramKey, paramValue in connOverrides.items():
  rs.setReadStatsConnParameter(key=paramKey, value=paramValue)
myRSH = rs.ReadStatisticsHelper(myRQC)

def getNextOutputFilename(path = "log", ext="txt", limit = 10):
  """Return an output file path that does not collide with an existing file.

  The first candidate is "<path>.<ext>". While the candidate already exists,
  a counter suffix is tried instead: "<path>(1).<ext>", "<path>(2).<ext>", ...

  Parameters:
    path  -- file path without extension.
    ext   -- file extension; a leading "." is accepted and ignored so that
             both "csv" and ".csv" produce "<path>.csv".
    limit -- highest counter suffix that is tried.

  Returns:
    The first candidate path that does not exist. If every candidate up to
    "<path>(<limit>).<ext>" exists, that last candidate is returned anyway,
    so the caller may end up overwriting it.
  """
  # Normalise the extension so callers passing ".csv" do not get "name..csv".
  ext = ext.lstrip(".")
  i = 0
  filepath = f"{path}.{ext}"
  while os.path.exists(filepath) and i < limit:
    i += 1
    # Use the running counter (the original always produced "(1)").
    filepath = f"{path}({i}).{ext}"
  return filepath

# Build the query for all texts not owned by owner id 1, optionally narrowed
# by the inclusion / exclusion lists of txt_ckn values.
# NOTE(review): the list entries are interpolated directly into the SQL text.
# That is only acceptable while both lists are literals defined in this
# notebook; do not feed them from untrusted input without escaping.
query = "SELECT * FROM text WHERE not txt_owner_id = 1"
# The comparison belongs outside len(); `len(list > 0)` raised a TypeError.
if len(textInvInclusion) > 0:
  strInclusionList = "', '".join(textInvInclusion)
  query += f" AND txt_ckn in ('{strInclusionList}')"
if len(textInvExclusion) > 0:
  strExclusionList = "', '".join(textInvExclusion)
  query += f" AND not txt_ckn in ('{strExclusionList}')"
query += " ORDER BY txt_ckn;"
# Get the texts from the database, indexed by txt_ckn.
# The results default to empty so that later cells still run after a query error.
textsByInvNum = {}
txtInvNumList = []
dfTexts = pd.DataFrame()
myRQC.query(query)
if myRQC.hasError():
  print(f"Error encountered while running query: '{myRQC.getQuery()}'")
else:
  textsByInvNum = myRQC.getRowsAsIndexDict(myRQC.getColumnNames().index('txt_ckn'))
  dfTexts = pd.DataFrame(textsByInvNum)
  # Pass the extension without a dot; getNextOutputFilename inserts the separator itself.
  outFilename = getNextOutputFilename(path = f"output/texts_info_{dbname}", ext = "csv")
  dfTexts.to_csv(outFilename)
  txtInvNumList = list(textsByInvNum.keys())
  print(f"Found {len(txtInvNumList)} text inventory numbers")
# Preview as the last expression of the cell so the notebook actually renders it
# (a bare .head() inside the branch above was silently discarded).
dfTexts.head()


## TODO:
- Create a library of functions that select the output type (HTML, JSON, XLSX, CSV, ...) and return the result either as a stream to the browser or as a file download.
- Add a total column for multi-column counts.

In [None]:
# Collect the grapheme counts for every txt_ckn value in txtInvNumList,
# keyed by that value, and load them into a DataFrame.
graCountsByTxtInv = {}
for txtInv in txtInvNumList:
  graCountsByTxtInv[txtInv] = myRSH.getGraphemeCountsByText(txtInv)
dfGraphemeCounts = pd.DataFrame(graCountsByTxtInv)
# Write the counts to a CSV file under output/ without clobbering an existing file.
outFilename = getNextOutputFilename(path = f"output/grapheme_counts_by_text_{dbname}", ext = "csv")
dfGraphemeCounts.to_csv(outFilename)
# Preview as the last expression of the cell; placed mid-cell it was never displayed.
dfGraphemeCounts.head()


In [None]:
# Fetch the grapheme counts by image (no tag filter passed) and load them into a DataFrame.
graCountsByImageName = myRSH.getGraphemeCountsByImage()
dfGraphemeCountsByImage = pd.DataFrame(graCountsByImageName)
# Write the counts to a CSV file under output/ without clobbering an existing file.
outFilename = getNextOutputFilename(path = f"output/grapheme_counts_by_image_name_{dbname}", ext = "csv")
dfGraphemeCountsByImage.to_csv(outFilename)
# Preview as the last expression of the cell; placed mid-cell it was never displayed.
dfGraphemeCountsByImage.head()


In [None]:
# Feature (term type / value) the grapheme counts are restricted to.
featureTypeName = 'basetype'
featureValue = '1'
# Look up the term id for this feature value and the term code used in the file name.
featureTypeTagID = myRQC.getTermIDStrict(featureValue,featureTypeName)
featureValueShortName = myRQC.getTermCode(featureTypeTagID)
# Grapheme counts by image, this time passing the feature's term id.
graCountsByImageByTagType = myRSH.getGraphemeCountsByImage(featureTypeTagID)
dfGraphemeCountsByTagType = pd.DataFrame(graCountsByImageByTagType)
# Use getNextOutputFilename like the other export cells so an earlier export
# is not silently overwritten; the name is unchanged when no file exists yet.
outFilename = getNextOutputFilename(path = f"output/{featureValueShortName}_grapheme_counts_by_image_name_{dbname}", ext = "csv")
dfGraphemeCountsByTagType.to_csv(outFilename)
# Preview as the last expression of the cell; placed mid-cell it was never displayed.
dfGraphemeCountsByTagType.head()