### Set up the app

In [None]:
from receipts.app import init_app
from receipts.models import Document, Label, DataModel, DataClass, User, PendingLabel, PendingLabelResponse, db

# Build the application object using the project's `init_app` factory.
# NOTE(review): presumably this also wires `db` to the app so the queries in
# the later cells work — confirm against receipts.app.
app = init_app()

### Set up models

In [None]:
# Fetch (or create) each data model by its short name and attach the
# human-readable long name used later when exporting.
for model_name, long_name in (
    ("receipts", "Document Type"),
    ("prices", "Price Type"),
):
    model = DataModel.get_or_create(name=model_name)
    model.longname = long_name

# Send the pending changes to the database.
db.session.flush()

### Import Data from Labeling Party

In [None]:
import json
import os
from collections import defaultdict
from glob import glob

from sqlalchemy.exc import IntegrityError
    
# Caches shared across all models so each User / Document row is created once.
users = {}  # username -> User
documents = {}  # receipt path -> Document

for model in DataModel.query:
    print("Collecting data for {}...".format(model.name))
    data_classes = {}  # classno -> DataClass, scoped to the current model
    responses = {}  # (document, position, user) -> DataClass chosen by that user

    # glob() returns matches in arbitrary order; sort so rows are added to the
    # session in a reproducible order from run to run.
    for datadir in sorted(glob("party-outputs/[0-9]-*/")):
        # Directory names look like "<digit>-<username>". Split on the first
        # hyphen only, so a username that itself contains "-" is kept whole.
        username = os.path.basename(datadir.strip("/")).split("-", 1)[1]
        if username not in users:
            users[username] = User(username=username)
            db.session.add(users[username])
        user = users[username]

        # Label files hold JSON lines; read them as UTF-8 instead of relying on
        # the platform's default locale encoding.
        labels_path = os.path.join(datadir, model.name, "labels.dat")
        with open(labels_path, encoding="utf-8") as label_file:
            for line in label_file:
                # data[0] is the labeled file's path, data[1] a sequence of
                # (position, classno) pairs.
                data = json.loads(line)
                receipt_path = os.path.join("receipt_data/party", os.path.basename(data[0]))

                if receipt_path not in documents:
                    documents[receipt_path] = Document(path=receipt_path)
                    db.session.add(documents[receipt_path])
                document = documents[receipt_path]

                for position, classno in data[1]:
                    if classno not in data_classes:
                        data_classes[classno] = DataClass(model=model, classno=classno)
                        db.session.add(data_classes[classno])

                    data_class = data_classes[classno]

                    # A repeated (document, position, user) key keeps only the
                    # last class seen for it.
                    responses[(document, position, user)] = data_class

    print("Storing data for {}...".format(model.name))
    pending_labels = {}  # (document, position) -> PendingLabel
    for (document, position, user), data_class in responses.items():
        if (document, position) not in pending_labels:
            pending_labels[(document, position)] = PendingLabel(document=document, position=position, model=model)
            db.session.add(pending_labels[(document, position)])

        pl = pending_labels[(document, position)]

        # One response row per user, attached to the shared pending label.
        db.session.add(PendingLabelResponse(data_class=data_class, user=user, pending_label=pl))

    db.session.flush()

### Store Label for PendingLabels where there is agreement

In [None]:
# Promote a pending label to a Label when more than one response exists and
# every response picked the same data class.
pending_query = PendingLabel.query.options(db.joinedload(PendingLabel.responses))
for pl in pending_query:
    answers = pl.responses
    if len(answers) <= 1:
        # A single (or no) response cannot show agreement.
        continue

    chosen_class_ids = {answer.data_class_id for answer in answers}
    if len(chosen_class_ids) != 1:
        # Responses disagree; leave the label pending.
        continue

    # All responses share the same class, so any of them supplies the id.
    response = answers[0]
    agreed_label = Label(
        document_id=pl.document_id,
        position=pl.position,
        data_class_id=response.data_class_id,
    )
    db.session.add(agreed_label)

db.session.flush()

### Name the classes

In [None]:
# Show the (classno, name) pairs currently stored, for reference.
print(list(DataClass.query.values(DataClass.classno, DataClass.name)))

# Class names indexed by classno, one list per data model.
RECEIPT_CLASS_NAMES = [
    "other",
    "bill",
    "cc_slip",
    "closed_receipt",
]

PRICE_CLASS_NAMES = [
   "unknown",
   "subtotal",
   "total",
]

# Assign each stored class its name by looking its classno up in the list that
# belongs to the class's model.
for model_name, class_names in (
    ("receipts", RECEIPT_CLASS_NAMES),
    ("prices", PRICE_CLASS_NAMES),
):
    model = DataModel.query.filter_by(name=model_name).first()
    for data_class in model.classes:
        data_class.name = class_names[data_class.classno]

db.session.flush()

### Generate zipfile of receipts and (csv) labels

In [None]:
import csv
import io
import os
import zipfile
from tempfile import NamedTemporaryFile

def _rows_to_csv(rows):
    """Return *rows* serialized as CSV text using the csv module's defaults."""
    # newline="" keeps the writer's own line terminators untouched, as the csv
    # module documentation requires for the objects it writes to.
    buffer = io.StringIO(newline="")
    csv.writer(buffer).writerows(rows)
    return buffer.getvalue()


def write_zip(myzip, documents, model, include_position=True):
    """Add one model's labels, class list and labeled documents to a zip.

    Writes "<model.longname>/labels.csv" (one row per label of every class of
    the model) and "<model.longname>/classes.csv" (classno/name pairs ordered
    by classno). Each labeled document's file is stored under "receipts/"
    the first time it is encountered.

    The CSV text is built in memory and stored with ``writestr`` rather than
    going through a temporary file that is re-read by name while still open.

    Args:
        myzip: An open, writable ``zipfile.ZipFile``.
        documents: Dict mapping document id to document for files already
            stored in the archive. It is updated in place, so passing the same
            dict to several calls avoids storing a file twice.
        model: The data model whose classes and labels are exported.
        include_position: When true, add a "position" column to labels.csv.
    """
    header = ["filename", "label"]
    if include_position:
        header.append("position")
    label_rows = [header]

    for cls in model.classes:
        for label in cls.labels:
            # Prefer the cached document; only touch label.document when the
            # id has not been seen yet.
            document = documents[label.document_id] if label.document_id in documents else label.document
            path = document.path

            row = [os.path.basename(path), cls.classno]
            if include_position:
                row.append(label.position)
            label_rows.append(row)

            if document.id not in documents:
                # First time this document appears: store its file once.
                myzip.write(path, os.path.join("receipts", os.path.basename(path)))
                documents[document.id] = document

    myzip.writestr("{}/labels.csv".format(model.longname), _rows_to_csv(label_rows))

    classes = model.classes.order_by(DataClass.classno).values(DataClass.classno, DataClass.name)
    class_rows = [["class_id", "class_name"]]
    class_rows.extend([number, name] for number, name in classes)

    myzip.writestr("{}/classes.csv".format(model.longname), _rows_to_csv(class_rows))
    
# Export both models into data.zip. The shared `documents` dict lets the second
# export skip document files the first one already stored.
with zipfile.ZipFile("data.zip", "w") as myzip:
    documents = {}
    for model_name, with_position in (("receipts", False), ("prices", True)):
        write_zip(
            myzip,
            documents,
            DataModel.query.filter_by(name=model_name).first(),
            include_position=with_position,
        )

### Delete all data from database (utility)

In [None]:
# Bulk-delete every row of the imported data. Query.delete() issues a plain
# DELETE without ORM-level cascading, so rows that reference other tables are
# removed first: responses before pending labels, labels before the classes
# and documents they point at, and users last. Deleting parents first (the
# previous order) can fail on databases that enforce foreign keys.
PendingLabelResponse.query.delete()
PendingLabel.query.delete()
Label.query.delete()
DataClass.query.delete()
Document.query.delete()
User.query.delete()

### Print receipts with a given label (utility)

In [None]:
# Interactively page through every receipt labeled with the chosen class.
receipts = DataModel.query.filter_by(name="receipts").first()
target_class = receipts.classes.filter_by(name="closed_receipt").first()

for label in target_class.labels:
    print(label.document.path)
    # Use a context manager so each file handle is closed right after reading
    # instead of being left open for the rest of the session.
    with open(label.document.path) as receipt_file:
        print(receipt_file.read())

    # Wait for Enter between receipts; typing "stop" ends the listing.
    if input() == "stop":
        break
    