In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_selection, model_selection
import joblib
import nltk
import re

In [2]:
# Load a previously saved vectorizer object from disk.
# NOTE(review): presumably an already-fitted TF-IDF vectorizer (judging by the
# file name, and because later cells only call transform()) — confirm.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load artifacts that come from a trusted source.
vectorizer = joblib.load('tfidf_vectorizer.joblib')

In [3]:
# Read the source CSV (path is relative to the notebook's working directory).
df = pd.read_csv(r'soc_202311261432.csv')
# Replace every missing value, in every column, with the literal string 'N/A'.
# NOTE(review): this also affects JOB_DUTIES and SOC_CODE, which later cells use
# as model input and label, so a missing entry becomes the text/label 'N/A' —
# confirm that is intended rather than dropping those rows.
df = df.fillna('N/A')

In [4]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    '''
    Preprocess a string.

    The input is coerced to ``str``, lowercased and stripped; every character
    that is neither a word character nor whitespace is removed; the result is
    split on whitespace, optionally filtered against a stopword collection,
    optionally stemmed and/or lemmatised, and finally re-joined with single
    spaces.

    :parameter
        :param text: object - the text to clean (converted with str())
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmatisation is to be applied
        :param lst_stopwords: list - list of stopwords to remove (None keeps all words)
    :return
        cleaned text
    '''
    ## clean (convert to lowercase, strip, then drop punctuation and other
    ## non-word / non-whitespace characters)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords (a set gives constant-time membership tests instead of
    ## scanning the whole stopword list for every token)
    if lst_stopwords is not None:
        stopwords = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [5]:
# English stopword list taken from NLTK's stopwords corpus.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) before this cell runs — confirm.
lst_stopwords = nltk.corpus.stopwords.words("english")

In [6]:
# Clean every JOB_DUTIES entry (stemming off, lemmatisation on, stopwords from
# lst_stopwords removed) and store the result in a new 'text_clean' column.
# Series.apply forwards the extra keyword arguments to the function.
df['text_clean'] = df['JOB_DUTIES'].apply(
    utils_preprocess_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)

In [7]:
# Split cleaned texts (features) and SOC codes (labels) into train and test
# parts: 20% is held out for testing, and the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df['text_clean'],
    df['SOC_CODE'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Convert the training texts into the loaded vectorizer's feature matrix.
# Only transform() is called (no fit), so the vectorizer is used as loaded.
X_train_vectorized = vectorizer.transform(X_train)

In [9]:
# Feature names, in the same column order as X_train_vectorized.
X_names = vectorizer.get_feature_names_out()
# A feature is kept for a class only when its score (1 - p) exceeds this limit,
# i.e. when the chi-squared p-value is below 0.01.
p_value_limit = 0.99
# Features whose name is this many characters or fewer are discarded.
min_feature_len = 10

# Collect the selected rows per class and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop (and re-sorting and
# re-filtering the whole accumulated frame every iteration) is quadratic.
selected_per_class = []
for cat in np.unique(y_train):
    # One-vs-rest chi-squared test: the target is "belongs to `cat`" or not.
    _, p = feature_selection.chi2(X_train_vectorized, y_train == cat)
    scored = pd.DataFrame({'feature': X_names, 'score': 1 - p, 'y': cat})
    keep = (scored['score'] > p_value_limit) & (scored['feature'].str.len() > min_feature_len)
    selected = scored[keep]
    if not selected.empty:
        selected_per_class.append(selected)

if selected_per_class:
    # Order by class, then by descending score within each class.
    df_features = pd.concat(selected_per_class).sort_values(
        ['y', 'score'], ascending=[True, False]
    )
else:
    # Nothing selected (or no classes at all): keep the expected columns so
    # later cells that index df_features['feature'] / ['y'] still work.
    df_features = pd.DataFrame(columns=['feature', 'score', 'y'])

# Distinct selected feature names across all classes, in first-seen order.
X_names = df_features['feature'].unique().tolist()

In [12]:
# For six consecutive classes (positions 179-184 of the unique training labels)
# print the class code, its SOC title, the number of selected features and the
# first ten selected feature names.
for cat in np.unique(y_train)[179:185]:
    print('# {}:'.format(cat))

    # Title is read from the first row of the full frame carrying this code.
    soc_title = df.loc[df['SOC_CODE'] == cat, 'SOC_TITLE'].iloc[0]
    print('{}:'.format(soc_title))

    # Feature names selected for this class, in df_features order.
    cat_features = df_features.loc[df_features['y'] == cat, 'feature']
    print('  . selected features:', len(cat_features))
    print('  . top features:', ','.join(cat_features.values[:10]))
    print(' ')

# 37-3011.00:
Landscaping and Groundskeeping Workers:
  . selected features: 2461
  . top features: 373011 onetonlineorg,able perform,according instruction,aerating weeding,aerator trimmer,apply mulch,area employment,area jobsites,asked demonstrate,assist sprinkler
 
# 37-3012.00:
Pesticide Handlers, Sprayers, and Applicators, Vegetation:
  . selected features: 46
  . top features: according formula,application,area assist,assist cleaning,backpack sprayer,designated area,herbicide application,job training,spray spread,sprayer spreader
 
# 37-3013.00:
Tree Trimmers and Pruners:
  . selected features: 75
  . top features: appearance health,branch tree,climbing rigging,clipper power,dead excess,excess branch,gain access,ground person,ground tree,hand pruner
 
# 37-3019.00:
Grounds Maintenance Workers, All Other:
  . selected features: 33
  . top features: andor power,care maintenance,continuously,debris ensure,ensure work,general landscaping,landscaping tree,maintenance service,perform wi

In [35]:
# !pip install shap

In [36]:
# classifier = joblib.load('random_forest_model.joblib')

In [37]:
# import shap

# # Get a sample instance for explanation
# text = "Reline or repair interior of refractory vessels with refractory clay and other refractory material. Chip slag from linings of ladles or remove linings when beyond repair, using hammers and chisels and other equipment. Mix specified amounts of sand, clay, mortar powder, and water to form refractory clay or mortar, using shovels or mixing machines. Tighten locknuts holding refractory stopper assemblies together, spread mortar on jackets to seal sleeve joints, and dry mortar. Remove worn or damaged block refractory linings of furnaces, kilns, and cyclones using hand tools and other equipment. Climb scaffolding, carrying hoses, and spray surfaces with refractory mixtures, using spray equipment. Construct and installation of refractory material laying by hand and spraying equipment. May perform other duties and tasks per SOC Code 49-9045."
# text_vectorized = vectorizer.transform([text])

# # Initialize the SHAP explainer
# explainer = shap.Explainer(classifier)

# # Get SHAP values for the sample instance
# shap_values = explainer.shap_values(text_vectorized)

# # Get feature names
# feature_names = vectorizer.get_feature_names_out()

# # Identify important features based on SHAP values
# important_features = [feature_names[i] for i in range(len(shap_values[0])) if abs(shap_values[0][i]) > 0.95]

# # Print the predicted class and important features
# predicted_class = classifier.predict(text_vectorized)[0]
# print(f"Predicted Class: {predicted_class}")
# print("Important Features:", important_features)
