In [8]:
# for reading in data
import sqlite3
import pandas as pd

# data preprocessing
from pvops.text.preprocess import text_remove_numbers_stopwords
from nltk import corpus
# English stopword list from NLTK; it is handed to the asset-label cleaner and to
# the TF-IDF vectorizer further down.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) -- confirm for a fresh environment.
stopwords = corpus.stopwords.words('english')

# machine learning pipeline: vectorizer, classification models, and scoring
from pvops.text.classify import classification_deployer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer

#### Read in data

In [60]:
PATH_TO_PVROM = '../../../pvrom-dtb/PVROM/PVROM.db'

# Read the whole OM table into a DataFrame. The connection is closed as soon as
# the read finishes (also on error) so the database handle does not stay open
# for the rest of the kernel session.
con = sqlite3.connect(PATH_TO_PVROM)
try:
    cur = con.cursor() #instantiate a cursor obj
    cur.execute("SELECT name FROM sqlite_master WHERE type='table';") # get all tables in dtb
    table_names = [row[0] for row in cur.fetchall()]

    # sqlite3.connect() quietly creates an empty database when the path does not
    # point at an existing file, so fail here with a message that names the path
    # instead of a bare "no such table" error. SQLite table names are
    # case-insensitive, hence the lowercase comparison.
    if 'om' not in (name.lower() for name in table_names):
        raise ValueError(
            f"No 'OM' table found in {PATH_TO_PVROM!r}; tables present: {table_names}"
        )

    OM_df = pd.read_sql_query("SELECT * FROM OM", con)
finally:
    con.close()

#### Prepare data for supervised classification

In [61]:
# pull out the corrective maintenance records
# (na=False treats rows with a missing WOType as "not corrective", so no separate null mask is needed)
CM_df = OM_df[OM_df['WOType'].str.contains('Corrective', na=False)]

# for training, keep only the rows where the asset and general description is listed
# (.copy() so the new columns below are added to an independent frame, not a view of CM_df)
CM_nonnull = CM_df.dropna(subset=['Asset','GeneralDesc']).copy()

# clean the asset label, make it lowercase
# Only the 'Asset' column is needed, so map over that Series rather than over
# every row of the frame; this also keeps the result a Series when the frame is empty.
CM_nonnull['AssetCleaned'] = (
    CM_nonnull['Asset']
    .apply(lambda asset: text_remove_numbers_stopwords(asset, stopwords))
    .str.lower()
)

# get the event description to be lowercase
CM_nonnull['GeneralDescLower'] = CM_nonnull['GeneralDesc'].str.lower()

# define our x as the general event description, and y as whether the asset is related to inverters
# (y is True wherever the cleaned asset label contains the substring 'inv')
x = CM_nonnull['GeneralDescLower']
y = CM_nonnull['AssetCleaned'].str.contains('inv')

In [63]:
# Report how many corrective-maintenance records have no asset label.
is_asset_missing = CM_df['Asset'].isna()
count_total = CM_df.shape[0]
count_missing_asset = int(is_asset_missing.sum())
share_missing_pct = count_missing_asset / count_total * 100
print(f'{count_missing_asset}/{count_total} ({share_missing_pct:.1f}%) of CM records are missing the asset')

3053/39805 (7.7%) of CM records are missing the asset


#### Fitting

In [64]:
# fit the tf-idf vectorizer on the whole corpus, including the rows w/o asset that we plan to gapfill using our model
# NOTE(review): if classification_deployer runs a cross-validated search over the
# pipeline, scikit-learn presumably clones and refits this vectorizer on each
# training fold, in which case the fit below does not affect the reported scores -- confirm.
# `subset` is given as a list to match the other dropna call in this notebook;
# a bare string is only accepted by newer pandas releases.
all_descr = CM_df.dropna(subset=['GeneralDesc'])['GeneralDesc'].str.lower().astype('str')
# unigrams + bigrams, sublinear term-frequency scaling, NLTK English stopwords removed
vectorizer = TfidfVectorizer(min_df=1, stop_words=stopwords, ngram_range=(1,2), sublinear_tf=True)
vectorizer.fit(all_descr);

In [65]:
# Seed for the classifiers that use randomness, so re-running the cell gives the same scores.
RANDOM_STATE = 0

# Set to True to search the custom hyperparameter grid defined below (slow).
# When False, every classifier is fit once per fold with its default hyperparameters.
RUN_GRID_SEARCH = False

# 'clf' is a placeholder step.
# NOTE(review): presumably classification_deployer swaps each classifier into it -- confirm.
pipeline_steps = [('tfidf', vectorizer),
                  ('clf', None)]

classifiers = {'LinearSVC' : LinearSVC(random_state=RANDOM_STATE),
               'LogisticRegression' : LogisticRegression(random_state=RANDOM_STATE),
               'RandomForestClassifier' : RandomForestClassifier(random_state=RANDOM_STATE),
               'MultinomialNB' : MultinomialNB()}

# pvops.text.defaults didn't have the hyperparameters I was looking for, so I made a custom definition
param_grid = {'LinearSVC' : {'clf__C': [0.01, 0.1, 1, 10, 100]},
              'LogisticRegression' : {'clf__C' : [0.01, 0.1, 1, 10, 100]},
              'RandomForestClassifier' : {'clf__n_estimators': [4, 16, 64],
                                          'clf__criterion': ['gini', 'entropy'],
                                          'clf__max_depth': [1, 2, 4]},
              'MultinomialNB' : {'clf__alpha' : [0.01, 0.1, 1, 10, 100]}}

# Keep param_grid intact and derive what is actually searched from the switch above,
# instead of overwriting the grid with empty dicts.
search_space = param_grid if RUN_GRID_SEARCH else {name : {} for name in param_grid}

results = classification_deployer(x, y,
                                    n_splits=5,
                                    classifiers=classifiers,
                                    search_space=search_space,
                                    pipeline_steps=pipeline_steps,
                                    scoring=make_scorer(accuracy_score))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Better score (0.888) found on classifier: LinearSVC
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Better score (0.890) found on classifier: LogisticRegression
Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 