In [1]:
from receipts.app import init_app
from receipts.models import Document, Label, DataModel, DataClass, Disagreement, db

# Build the application object with the project's factory (receipts.app.init_app).
# NOTE(review): presumably this also binds `db` to the app so the queries in the
# cells below can run — confirm against receipts.app.
app = init_app()

In [3]:
# Ensure both data models exist and assign each one its human-readable long name.
for model_name, model_longname in (
    ("receipts", "Document Type"),
    ("prices", "Price Type"),
):
    model = DataModel.get_or_create(name=model_name)
    model.longname = model_longname

# Push the pending changes to the database session.
db.session.flush()

In [3]:
import json
import os
from collections import defaultdict
from glob import glob

# Import the collected labels for every data model.
#
# For each model, every "party-outputs/<digit>-*/<model name>/labels.dat" file is
# read. Each non-blank line is a JSON array whose first item is a receipt path and
# whose second item is a list of (position, label) pairs. All responses given for
# the same (receipt, position) are pooled across directories. When more than one
# response exists and they all agree, a Label row is added; otherwise (single
# response or conflicting responses) a Disagreement row holding the raw responses
# is added.
for model in DataModel.query:
    # (receipt_path, position) -> list of every response collected for that spot
    labels = defaultdict(list)

    # glob() yields matches in arbitrary order; sorting keeps the order in which
    # responses are pooled (and so the stored `responses` string) reproducible.
    for datadir in sorted(glob("party-outputs/[0-9]-*/")):
        labels_path = os.path.join(datadir, model.name, "labels.dat")

        with open(labels_path) as label_file:
            for line in label_file:
                # A blank (e.g. trailing) line carries no data and would make
                # json.loads raise, so skip it.
                if not line.strip():
                    continue
                data = json.loads(line)
                # Only the file name of the recorded path is kept; it is
                # re-rooted under the local receipt directory.
                receipt_path = os.path.join("receipt_data/party", os.path.basename(data[0]))
                for position, label in data[1]:
                    labels[(receipt_path, position)].append(label)

    # Per-model caches so get_or_create is called once per document / class.
    documents = {}
    data_classes = {}
    for count, ((receipt_path, position), responses) in enumerate(labels.items(), start=1):
        if count % 500 == 0:
            print("{} / {}".format(count, len(labels)))

        if receipt_path not in documents:
            documents[receipt_path] = Document.get_or_create(path=receipt_path)

        # Agreement requires at least two responses that are all identical.
        if len(set(responses)) == 1 and len(responses) > 1:
            classno = responses[0]

            if classno not in data_classes:
                data_classes[classno] = DataClass.get_or_create(model=model, classno=classno)

            db.session.add(Label(
                document=documents[receipt_path],
                position=position,
                data_class=data_classes[classno]))
        else:
            db.session.add(Disagreement(
                model=model,
                document=documents[receipt_path],
                position=position,
                responses=",".join(map(str, responses))))

    db.session.flush()

250 / 1904
500 / 1904
750 / 1904
1000 / 1904
1250 / 1904
1500 / 1904
1750 / 1904
2000 / 14658
2250 / 14658
2500 / 14658
2750 / 14658
3000 / 14658
3250 / 14658
3500 / 14658
3750 / 14658
4000 / 14658
4250 / 14658
4500 / 14658
4750 / 14658
5000 / 14658
5250 / 14658
5500 / 14658
5750 / 14658
6000 / 14658
6250 / 14658
6500 / 14658
6750 / 14658
7000 / 14658
7250 / 14658
7500 / 14658
7750 / 14658
8000 / 14658
8250 / 14658
8500 / 14658
8750 / 14658
9000 / 14658
9250 / 14658
9500 / 14658
9750 / 14658
10000 / 14658
10250 / 14658
10500 / 14658
10750 / 14658
11000 / 14658
11250 / 14658
11500 / 14658
11750 / 14658
12000 / 14658
12250 / 14658
12500 / 14658
12750 / 14658
13000 / 14658
13250 / 14658
13500 / 14658
13750 / 14658
14000 / 14658
14250 / 14658
14500 / 14658
14750 / 14658
15000 / 14658
15250 / 14658
15500 / 14658
15750 / 14658
16000 / 14658
16250 / 14658
16500 / 14658


In [6]:
import csv
import os
import zipfile
from tempfile import NamedTemporaryFile

def _zip_csv(myzip, arcname, rows):
    """Write ``rows`` as a CSV temp file and store it in ``myzip`` under ``arcname``."""
    # newline="" leaves line endings to the csv module, as its documentation
    # requires for files handed to csv.writer.
    with NamedTemporaryFile(mode="w", newline="") as csv_file:
        csv.writer(csv_file).writerows(rows)
        # The archive reads the file by name, so buffered rows must be flushed first.
        csv_file.flush()
        myzip.write(csv_file.name, arcname)


def write_zip(myzip, written_files, model, include_position=True):
    """Add one model's labels, class list and labelled files to an open zip archive.

    Three kinds of entries are written, in this order:

    * ``receipts/<basename>`` for each labelled document whose path is not yet
      in ``written_files``;
    * ``<model.longname>/labels.csv`` with one row per label
      (``filename``, ``label`` and, optionally, ``position``);
    * ``<model.longname>/classes.csv`` with one ``class_id``/``class_name`` row
      per class, ordered by class number.

    Args:
        myzip: An open, writable ``zipfile.ZipFile``.
        written_files: Set of document paths already stored in the archive.
            It is updated in place, so sharing one set between calls prevents
            the same file from being stored twice.
        model: Object exposing ``longname`` and ``classes``. ``classes`` is
            iterated (each item provides ``classno`` and ``labels``) and is also
            queried with ``order_by(...).values(...)``.
        include_position: When true, each label row also carries the label's
            ``position``.
    """
    header = ["filename", "label"]
    if include_position:
        header.append("position")
    label_rows = [header]

    for data_class in model.classes:
        for label in data_class.labels:
            path = label.document.path

            row = [os.path.basename(path), data_class.classno]
            if include_position:
                row.append(label.position)
            label_rows.append(row)

            # Store each document only once, however many labels point at it.
            if path not in written_files:
                myzip.write(path, os.path.join("receipts", os.path.basename(path)))
                written_files.add(path)

    _zip_csv(myzip, "{}/labels.csv".format(model.longname), label_rows)

    classes = model.classes.order_by(DataClass.classno).values(DataClass.classno, DataClass.name)
    class_rows = [["class_id", "class_name"]]
    class_rows.extend([number, name] for number, name in classes)
    _zip_csv(myzip, "{}/classes.csv".format(model.longname), class_rows)
    
# Bundle both label sets into a single archive. One `written_files` set is shared
# by the two calls so a receipt stored by the first call is not stored again by
# the second.
with zipfile.ZipFile("for_charniak.zip", "w") as myzip:
    written_files = set()

    # Document-level labels: no position column.
    receipts_model = DataModel.query.filter_by(name="receipts").first()
    write_zip(myzip, written_files, receipts_model, include_position=False)

    # Price labels: position column included (the default).
    prices_model = DataModel.query.filter_by(name="prices").first()
    write_zip(myzip, written_files, prices_model)

In [2]:
# Remove every row from the imported tables (DataModel rows are left alone).
# NOTE(review): Document rows are deleted before the Label/Disagreement rows that
# reference them; if foreign keys are enforced this order may fail — confirm.
for stale_table in (Document, Label, DataClass):
    stale_table.query.delete()

# Kept as the final expression so the cell still displays this delete's row count.
Disagreement.query.delete()

82

In [17]:
# Show the current (classno, name) pairs before any names are assigned below.
print(list(DataClass.query.values(DataClass.classno, DataClass.name)))

# Class names for each model; a name's list index is the class number it belongs to.
RECEIPT_CLASS_NAMES = ["other", "bill", "cc_slip", "closed_receipt"]
PRICE_CLASS_NAMES = ["unknown", "subtotal", "total"]

# Give every class of each model the name found at its class number.
for model_name, class_names in (
    ("receipts", RECEIPT_CLASS_NAMES),
    ("prices", PRICE_CLASS_NAMES),
):
    model = DataModel.query.filter_by(name=model_name).first()
    for data_class in model.classes:
        data_class.name = class_names[data_class.classno]

db.session.flush()

[(1, 'bill'), (0, 'other'), (3, 'closed_receipt'), (2, 'cc_slip'), (0, 'unknown'), (1, 'subtotal'), (2, 'total')]
