In [None]:
import duckdb

# Only the column names are needed here, so the query fetches just five rows.
preview_relation = duckdb.sql("SELECT * FROM '../data/staging/stg_expenditures.parquet' LIMIT 5")
preview_relation.df().columns

: 

In [2]:
# Pull the four free-text expenditure columns from the staging parquet file.
expenditure_query = """
SELECT
    expenditure_description,
    expenditure_purpose,
    expenditure_extra_description,
    expenditure_last_name_or_org
FROM '../data/staging/stg_expenditures.parquet'
    """
expenditure_df = duckdb.sql(expenditure_query).df()

# Show the first rows as a quick sanity check.
expenditure_df.head()

Unnamed: 0,expenditure_description,expenditure_purpose,expenditure_extra_description,expenditure_last_name_or_org
0,CREDIT CARD FEE,PROCESSING FEE,,WINRED
1,PRINT ADVERTISING,PRINT AND MANAGEMENT COSTS,,STUDIOONE CREATIVE
2,CREDIT CARD FEE,PROCESSING FEE,,WINRED
3,CAMPAIGN OFFICE EXPENSE,PRINTER CARTRIDGE,,OFFICE DEPOT
4,"MAILING,POSTAGE,BULK RATE",POST CARDS,,NEXT DAY FLYERS


In [3]:
# Combine description and purpose into one lower-cased text field per expenditure;
# the `concat_expenditure` column is what the classification cell further down scores.
# The two adjacent literals join into exactly the original single-line query.
concat_query = (
    "SELECT LOWER(CONCAT(expenditure_description, ' ', expenditure_purpose)) AS concat_expenditure "
    "FROM '../data/staging/stg_expenditures.parquet'"
)
concat_df = duckdb.sql(concat_query).df()

# Fixed seed so the displayed sample is the same on every Restart & Run All.
concat_df.sample(25, random_state=42)

Unnamed: 0,concat_expenditure
81709,print advertising slate ad-allocated-16 candid...
44678,direct contributions contribution
117087,direct contributions contribution
117054,ngp
49206,2022 contribution
20119,service fee
6202,print advertising fb ad boost
63891,computer costs advertising
88416,donation
66284,"mailing,postage,bulk rate postage/8th cd rlm news"


In [19]:
import transformers

# Hand-written example expenditure strings, grouped by category name.
# NOTE(review): no visible cell reads `data`; presumably kept as reference examples — confirm.
data = {
    "credit card and merchant fees": [
        "credit card fee credit card processing",
        "bank charges fee",
        "merchant account fees",
    ],
    "consulting, staff, and payroll": [
        "communications consulting",
        "payroll",
        "staff payment",
        "consultation, research consulting",
        "salaries & wages expenditure",
        "legal services",
    ],
    "paid media and advertising": [
        "media advertising",
        "postage",
        "print advertising mailing",
    ],
    "refunds": ["refund of contribution", "contribution refund"],
    "other": ["non-profit donation", "food expenses"],
}

def run_prediction(text, classifier, labels):
    """Classify ``text`` against ``labels`` and keep only the top-scoring label.

    Args:
        text: Input forwarded as the first argument to ``classifier``.
        classifier: Callable invoked as ``classifier(text, labels)``. Its result
            is indexed with ``'scores'`` and ``'labels'``, which are treated as
            parallel sequences.
        labels: Candidate labels forwarded as the second argument to
            ``classifier``.

    Returns:
        dict with ``predicted_label`` (the label at the position of the highest
        score; the first such position on ties) and ``predicted_prob`` (that
        highest score).
    """
    result = classifier(text, labels)
    scores = result['scores']
    # max() yields the first maximal position, i.e. the same one list.index()
    # would find when searching for the top score.
    top_position = max(range(len(scores)), key=lambda i: scores[i])
    return {
        "predicted_label": result['labels'][top_position],
        "predicted_prob": scores[top_position],
    }

# Candidate categories offered to the zero-shot model.
labels = [
    "paid media",
    "payroll",
    "food",
    "contribution",
    "consulting",
    "bank or credit card fees",
    "other campaign activities",
]
# Zero-shot classification pipeline using the facebook/bart-large-mnli model.
classifier = transformers.pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Smoke-test the helper on a single example string.
text = "consultation, research consulting"
prediction = run_prediction(text, classifier, labels)
prediction



{'predicted_label': 'consulting', 'predicted_prob': 0.8243721723556519}

In [20]:
# Score a small random subset; every row results in its own classifier call.
# The fixed seed keeps the sampled rows (and the predictions shown) stable across re-runs.
sample_df = concat_df.sample(50, random_state=42)
# Each entry of the new column is the full dict returned by run_prediction
# (label and probability), not just the label string.
sample_df['predicted_expenditure_label'] = sample_df['concat_expenditure'].apply(
    lambda x: run_prediction(x, classifier, labels)
)
sample_df.head()

Unnamed: 0,concat_expenditure,predicted_expenditure_label
11435,bank charges mi032422pp - pay pal fees,{'predicted_label': 'bank or credit card fees'...
124826,direct contribution,"{'predicted_label': 'contribution', 'predicted..."
50897,direct contributions direct contribution,"{'predicted_label': 'contribution', 'predicted..."
41575,payroll,"{'predicted_label': 'payroll', 'predicted_prob..."
27261,computer costs website design,"{'predicted_label': 'paid media', 'predicted_p..."


In [21]:
# Inspect the raw prediction dicts stored for the first five sampled rows.
first_five_predictions = sample_df['predicted_expenditure_label'].iloc[:5]
first_five_predictions.values

array([{'predicted_label': 'bank or credit card fees', 'predicted_prob': 0.55501788854599},
       {'predicted_label': 'contribution', 'predicted_prob': 0.8688294291496277},
       {'predicted_label': 'contribution', 'predicted_prob': 0.9573243856430054},
       {'predicted_label': 'payroll', 'predicted_prob': 0.6385149955749512},
       {'predicted_label': 'paid media', 'predicted_prob': 0.571456253528595}],
      dtype=object)

In [32]:
from pathlib import Path

import cloudpickle

# Bundle the helper function, the loaded pipeline and the label list into one artifact.
out_dict = {
    'run_prediction': run_prediction,
    'classifier': classifier,
    'labels': labels
}

model_path = Path('../data/ml_models/zero_shot_model.pkl')
# Create the target directory first; open() alone raises FileNotFoundError when it is missing.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): unpickling executes code — only load this file from a trusted location.
with open(model_path, 'wb') as f:
    cloudpickle.dump(out_dict, f)