# Adding keyword labels to O&M data
This notebook demonstrates the use of the `pvops.text.classify.get_attributes_from_keywords` function for adding asset labels based on O&M notes.

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

from pvops.text import utils, preprocess
from pvops.text.classify import get_attributes_from_keywords
from pvops.text.visualize import visualize_classification_confusion_matrix

# Step 0: Get sample data, remap assets

In [None]:
# Load the sample O&M tickets, then remap their asset labels so they are
# easier to compare against the keyword-derived labels.
om_df = pd.read_csv('example_data/example_ML_ticket_data.csv')

# Column names handed to the pvops helpers.
col_dict = dict(
    data="CompletionDesc",
    eventstart="Date_EventStart",
    save_data_column="processed_data",
    save_date_column="processed_date",
    attribute_col="Asset",
    predicted_col="Keyword_Asset",
    remapping_col_from="in",
    remapping_col_to="out_",
)

# Shorten a few remapping targets before the remapping table is applied.
asset_renames = {
    'met station': 'met',
    'energy storage': 'battery',
    'energy meter': 'meter',
}
remapping_df = pd.read_csv('example_data/remappings_asset.csv')
remapping_df['out_'] = remapping_df['out_'].replace(asset_renames)

om_df = utils.remap_attributes(
    om_df,
    remapping_df,
    col_dict,
    allow_missing_mappings=True,
)
om_df.head()

# Step 1: Text preprocessing

In [None]:
# preprocessing steps
# Lower-case the asset labels with the vectorized string accessor; unlike a
# row-wise `.lower()` call, this leaves missing labels as NaN instead of raising.
om_df[col_dict['attribute_col']] = om_df[col_dict['attribute_col']].str.lower()

# Run the pvops text preprocessor with an empty stopword list.
om_df = preprocess.preprocessor(
    om_df,
    lst_stopwords=[],
    col_dict=col_dict,
    print_info=False,
    extract_dates_only=False,
)

# Overwrite the raw text column with the processed text so the following
# steps operate on the cleaned notes. The processed column name is read from
# col_dict rather than hardcoded (it resolves to 'processed_data').
DATA_COL = col_dict['data']
om_df[DATA_COL] = om_df[col_dict['save_data_column']]

# replace terms
# The mapping tables are read from the same relative `example_data/` directory
# as the other sample files, so the notebook does not depend on where the
# repository was cloned.
equipment_df = pd.read_csv('example_data/mappings_equipment.csv')
pv_terms_df = pd.read_csv('example_data/mappings_pv_terms.csv')
pv_reference_df = pd.concat([equipment_df, pv_terms_df])
om_df = utils.remap_words_in_text(
    om_df=om_df,
    remapping_df=pv_reference_df,
    remapping_col_dict=col_dict,
)

om_df.head()

# Step 2: Search for keywords to use as labels

In [None]:
# Derive asset labels from keywords, using the equipment mapping table as the
# keyword reference.
om_df = get_attributes_from_keywords(
    om_df=om_df,
    col_dict=col_dict,
    reference_df=equipment_df,
)
om_df.head()

# Step 3: Metrics

In [None]:
# Compute the share of labelled entries and the accuracy of those labels.
PREDICT_COL = col_dict['predicted_col']
LABEL_COL = col_dict['attribute_col']

# Fraction of entries that received a keyword label. This is measured before
# the missing predictions are filled in below.
label_count = om_df[PREDICT_COL].notna().sum() / len(om_df)

# Treat 'other' ground-truth labels as 'unknown'.
om_df[LABEL_COL] = om_df[LABEL_COL].replace({'other': 'unknown'})

# accuracy_score needs complete label columns, so fill the gaps with 'unknown'.
for column in (LABEL_COL, PREDICT_COL):
    om_df[column] = om_df[column].fillna('unknown')

acc_score = accuracy_score(om_df[LABEL_COL], om_df[PREDICT_COL])

msg = f'{label_count:.2%} of entries had a keyword of interest, with {acc_score:.2%} accuracy.'
print(msg)

# Step 4: Visualization

In [None]:
# Plot the confusion matrix for the asset labels; the columns to compare are
# passed via col_dict.
# NOTE(review): presumably reads col_dict['attribute_col'] and
# col_dict['predicted_col'] — confirm against the pvops implementation.
title = 'Confusion Matrix of Actual and Predicted Asset Labels'
visualize_classification_confusion_matrix(om_df, col_dict, title)