In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary the standard way: fit on a tiny corpus.
texts = ["hello", "world", "this", "is", "me"]
cv = CountVectorizer().fit(texts)  # fit() hands back the fitted estimator

# Fitted term -> column-index mapping; expected output shown below.
cv.vocabulary_
# {'hello': 0, 'world': 4, 'this': 3, 'is': 1, 'me': 2}


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Pin the vocabulary explicitly at instantiation instead of learning it from data.
texts = ["hello", "world", "this", "is", "me"]
# A vocabulary passed to the constructor is only stored as the `vocabulary`
# parameter; the fitted `vocabulary_` attribute read in the next cell does not
# exist until the vectorizer has been fitted (or used to transform). Fitting with
# a fixed vocabulary keeps the supplied terms, indexed in list order.
cv = CountVectorizer(vocabulary=texts).fit(texts)


In [None]:
# Inspect the term -> index mapping of the vectorizer created in the previous cell.
# NOTE(review): `vocabulary_` is a fitted attribute; presumably this raises
# AttributeError while `cv` has not been fitted or used to transform. Confirm
# against the scikit-learn CountVectorizer documentation.
cv.vocabulary_
# ?cv.set_params  (IPython help lookup, left disabled)


In [None]:
import pandas as pd

# NOTE(review): this looks like a pasted example of the boolean search strings stored
# in the data's `query` column (the trailing "20" may be a count from pasted output);
# confirm. The variable itself is not referenced by any of the visible cells: the
# filter below uses a hard-coded substring instead.
query = """(super AND amg super) OR (super AND australian catholic super) OR (super AND australiansuper) OR (super AND aware super) OR (super AND cbus) OR (super AND caresuper) OR (super AND colonial first estate) OR (super AND eiss super) OR (super AND energysuper) OR (super AND equipsuper) OR (super AND firststate super growth) OR (super AND future super) OR (super AND guild super) OR (super AND hesta super fund) OR (super AND hostplus) OR (super AND intrust) OR (super AND lgiasuper)                  20
(banking AND suncorp) OR (banking AND teachers mutual bank) OR (banking AND westpac)"""


# Build a reproducible sample of 200 short r/AusFinance comments whose search query
# includes the superannuation terms.
# NOTE(review): the absolute, user-specific input path makes this cell non-portable;
# consider a configurable data directory.
df = (
    pd.read_csv("/Users/samhardy/Desktop/data/2021_reddit_all_industries.csv")
    .query('subreddit == "AusFinance"')
    # Literal substring match: with the default regex=True the parentheses are parsed
    # as a capture group (pandas warns about match groups) instead of being matched
    # as characters. na=False treats rows with a missing query as non-matching so the
    # boolean mask never contains NaN.
    .pipe(
        lambda x: x[
            x["query"].str.contains("(super AND amg super)", regex=False, na=False)
        ]
    )
    .dropna(subset=["body"])
    .drop_duplicates("body")
    # keep comments of 50 to 150 characters (both bounds inclusive)
    .pipe(lambda x: x[x["body"].str.len().between(50, 150)])
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# Write the filtered sample to CSV; the row index is left out of the file.
filtered_csv_path = "/Users/samhardy/Desktop/data/2021_reddit_filtered.csv"
df.to_csv(filtered_csv_path, index=False)


In [None]:
# Label dictionary for the superannuation topics: label -> list of keywords.
super_dict = dict(
    regulation=["asic", "government", "federal", "tax"],
    contribution=[
        "contribution",
        "concession",
        "personal",
        "after tax",
        "10%",
        "10.5%",
    ],
    covid=["covid", "lockdown", "downturn", "effect"],
    retirement=["retire", "house", "annuity", "age"],
    fund=[
        "unisuper",
        "aus super",
        "australian super",
        "sun super",
        "qsuper",
        "rest",
        "cbus",
    ],
)


In [None]:
# Per-label scores for one document; the cell output is their total.
res = dict(
    regulation=0.0878,
    contribution=0.6488,
    covid=0.0878,
    retirement=0.0878,
    fund=0.0878,
)

sum(res[label] for label in res)


In [None]:
from clear_bow.classifier import DictionaryClassifier

# Label -> keyword list, handed to the classifier as its label dictionary.
super_dict = dict(
    regulation=["asic", "government", "federal", "tax"],
    contribution=[
        "contribution",
        "concession",
        "personal",
        "after tax",
        "10%",
        "10.5%",
    ],
    covid=["covid", "lockdown", "downturn", "effect"],
    retirement=["retire", "house", "annuity", "age"],
    fund=[
        "unisuper",
        "aus super",
        "australian super",
        "sun super",
        "qsuper",
        "rest",
        "cbus",
    ],
)

# multi-class by default
dc = DictionaryClassifier(label_dictionary=super_dict)

# Score one document. Expected output:
#   {'regulation': 0.0878,
#    'contribution': 0.6488,
#    'covid': 0.0878,
#    'retirement': 0.0878,
#    'fund': 0.0878}
dc.predict_single("A 10% contribution is not enough for a well balanced super fund!")

# Batch scoring, kept disabled, followed by its expected output:
#   dc.predict_batch(
#       [
#           "A 10% contribution is not enough for a well balanced super fund!",
#           "Australian government should stay the hell out of my retirement plans!",
#       ]
#   )
#   [{'regulation': 0.0878,
#     'contribution': 0.6488,
#     'covid': 0.0878,
#     'retirement': 0.0878,
#     'fund': 0.0878},
#    {'regulation': 0.3222,
#     'contribution': 0.1185,
#     'covid': 0.1185,
#     'retirement': 0.3222,
#     'fund': 0.1185}]


In [None]:
# Serialise the classifier, then construct a new instance from the same path.
# NOTE(review): the path is passed as the first positional argument; confirm the
# constructor interprets it as the location to load from.
classifier_dir = "/Users/samhardy/Desktop/dict_classifier"
dc.to_disk(classifier_dir)

dc = DictionaryClassifier(classifier_dir)


In [None]:
# Call the private word-count helper directly on the example sentence
# (presumably the raw per-label keyword counts; confirm against the library).
example_text = "A 10% contribution is not enough for a well balanced super fund!"
dc._get_label_word_count(example_text)


In [None]:
import numpy as np

# Values of a per-label count dict as a plain list, in insertion order.
[
    *{
        "regulation": 0,
        "contribution": 2,
        "covid": 0,
        "retirement": 0,
        "fund": 0,
    }.values()
]


In [None]:
# Score every comment: one row of per-label scores per document in `df`.
preds = pd.DataFrame(df["body"].apply(dc.predict_single).tolist())

# Up to three example comments per label whose score exceeds 0.3. They are collected
# into a dict because a bare expression inside a loop body is neither displayed nor
# stored by the notebook; only the cell's final top-level expression is shown.
examples_by_label = {
    label: df.loc[preds[label] > 0.3, "body"].head(3).tolist()
    for label in super_dict
}
examples_by_label


In [None]:
# Hand-picked example comments, one inner list per group.
# NOTE(review): the groups appear to follow the label order of `super_dict`
# (regulation, contribution, covid, retirement, fund); confirm.
x = [
    [
        "And before Asic politely reminded them they cannot provide financial advice",
        "Taxes are not fees. They are a federal government tax.",
        "That's some other government's problem.",
    ],
    [
        "Mate you get 15% tax on concessional contributions in super.\n\nSo no surprise it adds up to over 15%.",
        "Personal insurance and health insurance cover different things too.",
        "Is the ‘income’ figure for this before or after tax?",
    ],
    [
        "Message from Covid 19: “don’t touch your face, don’t touch your super”",
        "It would be better than having an imaginary jetski that I can't use after the lockdown",
    ],
    [
        "Not really unless you will retire fairly soon",
        "If you need a house, you need a house. I would suggest trying to top it back up when you can.",
        "Message from Covid 19: “don’t touch your face, don’t touch your super”",
    ],
    [
        "I'm not a Financial expert but Australian super are one of the bigger better options I guess",
        "No but considering moving I’m with unisuper at the moment their a closed fund",
        "Sun Super all the way",
    ],
]

# Flatten the groups into a single list, preserving group and within-group order.
example_docs = []
for group_docs in x:
    example_docs.extend(group_docs)


In [None]:
import random

# Shuffle in place with a dedicated, seeded generator so that a fresh
# Restart & Run All always produces the same order (42 matches the random_state
# used when sampling `df`). Re-running this cell alone shuffles the list again.
random.Random(42).shuffle(example_docs)

example_docs


In [None]:
def _transform_predict_dict(self, pred_dict):
    # if all word counts are 0
    if all(x == 0 for x in pred_dict.values()):
        prob_dict = {k: 0.0 for k in pred_dict.keys()}
        prob_dict["no_label"] = 1.0
        return prob_dict

    elif self.classifier_type == "multi_class":
        return dict(
            zip(pred_dict.keys(), self._softmax_array(list(pred_dict.values())))
        )

    elif self.classifier_type == "multi_label":
        return dict(
            zip(pred_dict.keys(), self._sigmoid_array(list(pred_dict.values())))
        )
