In [2]:
import pandas as pd

In [3]:
replacements = {'IT Audit / IT Compliance': 'Risk/Legal/Compliance',
               'IT': 'IT',
               'Engineering': 'Engineering',
               'Purchasing': 'Procurement',
               'Legal': 'Risk/Legal/Compliance',
               'Finance': 'Non-ICP',
               'Marketing': 'Non-ICP',
               'Sales': 'Non-ICP',
               'Unknown': 'Non-ICP',
               'Facilities': 'Non-ICP',
               'Human Resource': 'Non-ICP',
               'Management': 'Non-ICP',
               'Services': 'Non-ICP',
               'Operations': 'Non-ICP',
               'Administration': 'Non-ICP',
               'Corporate': 'Non-ICP',
               'Support': 'Non-ICP',
               'Education': 'Non-ICP',
               'Public Sector': 'Non-ICP',
               'Procurement': 'Procurement',
               'Medical': 'Non-ICP',
               'Other': 'Non-ICP',
               'IT Audit / IT Compliance': 'Risk/Legal/Compliance',
               'Information Security': 'IT',
               'IT - Security': 'IT',
               'Help Desk / Desktop Services': 'IT',
               'Information Technology': 'IT',
               'Infrastructure': 'Non-ICP',
               'Customer Service / Support': 'Non-ICP',
               'Emerging Technology / Innovation': 'IT'}

In [4]:
# Create dictionary for Job Role within IT function
replacements_role = {'Information Security': 'Information Security',
               'information security': 'Information Security',
               'Networking': 'Networking',
               'IT General': 'IT General',
               'None Technical': 'IT General',
               'Help Desk': 'IT General',
               'Governance Risk Compliance': 'IT General',
               'Program Management': 'IT General',
               'Data': 'IT General',
               'IT Facilities': 'IT General',
               'Operations': 'IT General',
               'Communications': 'IT General',
               'Integration': 'IT General',
               'Vendor Management': 'IT General',
               'Training': 'IT General',
               'Business Continuity': 'IT General',
               'Other': 'IT General',
               'Development': 'Development',
                'Security': 'Information Security',
                    'Business Systems': 'Systems'}

In [5]:
# Create dictionary for Job Level within all ICP functions
replacements_level = {'Manager': 'Manager',
                     'Contributor': 'Contributor',
                      'contributor': 'Contributor',
                     'Director': 'Director',
                     'C-Level': 'C-level',
                      'C-level': 'C-level',
                     'Executive': 'Executive',
                     'Unknown': pd.NA,
                      'Non-Manager': 'Contributor',
                      'VP-level': 'Executive',
                      'VP-Level': 'Executive',
                      'Decision maker': 'Manager',
                      'Team Lead': 'Manager',
                      'VP/Director': 'Director',
                      'Engineer/Admin': 'Contributor',
                      'CxO': 'C-level',
                      'VP': 'Executive',
                      'Director / C-Level': 'C-level',
                      'Individual Contributor': 'Contributor',
                      'Director Level': 'Director',
                      'contribtuor': 'Contributor',
                      'Management': 'Manager',
                      'Director of Enterprise Cloud Business': 'Director',
                      'Admin': 'Contributor'
                     }

In [6]:
def replace_role(row):
    if pd.isnull(row['Job Function']): 
        return 'N/A'
    elif 'IT' not in row['Job Function']:
        return 'N/A'
    else:
        return row['Job Role']

In [24]:
df = pd.read_csv('df_filtered.csv')
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df['Job Function'] = df['Job Function'].replace(replacements)
df['Job Function'] = df['Job Function'].fillna('Non-ICP')
df['Job Role'] = df.apply(replace_role, axis=1)
df['Job Level'] = df['Job Level'].replace(replacements_level)
df.loc[df['Job Function']=='Non-ICP', 'Job Level'] = pd.NA
df['Job Level'] = df['Job Level'].fillna(pd.NA)
df = df.dropna(subset=['Job Level'])



In [33]:
# Remove any stop words from the titles
from gensim.parsing.preprocessing import remove_stopwords
df['Title'] = df['Title'].apply(remove_stopwords)

In [34]:
import gensim 
#Some more preprocessing that will remove punctuations. 
tokenised_titles = df['Title'].apply(gensim.utils.simple_preprocess)

In [35]:
tokenised_titles.head()

0            [manager, cybersecurity]
1    [manager, information, security]
2         [user, experience, analyst]
3               [network, specialist]
4     [director, privacy, compliance]
Name: Title, dtype: object

In [36]:
#add the tokens as a column in df
df.insert(1, 'Title_Tokens', tokenised_titles)
df.head()

Unnamed: 0,Title,Title_Tokens,Job Role,Job Function,Job Level
0,manager-cybersecurity,"[manager, cybersecurity]",Information Security,IT,Manager
1,"manager, information security","[manager, information, security]",Information Security,IT,Manager
2,user experience analyst,"[user, experience, analyst]",,Engineering,Contributor
3,network specialist,"[network, specialist]",Networking,IT,Contributor
4,director privacy compliance,"[director, privacy, compliance]",Information Security,IT,Director


In [37]:
#initialize and train model to create word vectors
model = gensim.models.Word2Vec(window=2,min_count=1, workers=4)
model.build_vocab(df['Title_Tokens'])#this is a required step before training the model 
model.train(df['Title_Tokens'], total_examples=model.corpus_count, epochs=model.epochs) #default vector_size = 100


(3574035, 8613425)

In [38]:
model.wv.most_similar("manager")

[('director', 0.7396880984306335),
 ('leader', 0.7161996960639954),
 ('supervisor', 0.676173210144043),
 ('lead', 0.6538949608802795),
 ('manger', 0.6534761190414429),
 ('manage', 0.6509885191917419),
 ('coordinator', 0.6432304382324219),
 ('mgr', 0.6425690650939941),
 ('specialist', 0.6325701475143433),
 ('analyst', 0.6188406348228455)]

In [39]:
model.wv.most_similar("analyst")

[('specialist', 0.822262704372406),
 ('analysts', 0.7638948559761047),
 ('analystè', 0.7218160033226013),
 ('mgr', 0.661284863948822),
 ('enginner', 0.6546992659568787),
 ('manger', 0.6518502235412598),
 ('intern', 0.6489644050598145),
 ('engineer', 0.6476449370384216),
 ('technician', 0.646087110042572),
 ('magento', 0.6405235528945923)]

In [40]:
model.wv.most_similar("network")

[('noc', 0.6769733428955078),
 ('video', 0.6375817656517029),
 ('carrier', 0.6299111247062683),
 ('datacenter', 0.6148328185081482),
 ('ip', 0.6115643978118896),
 ('networks', 0.6115432381629944),
 ('construction', 0.6008782386779785),
 ('cyberdeense', 0.6006216406822205),
 ('maintenance', 0.5952282547950745),
 ('flight', 0.5790308117866516)]

In [41]:
import numpy as np

#below function creates a vector for a title by adding the vector of each word in it
def get_title_vec(title_tokens):
    if len(title_tokens) == 0:
        return np.zeros(100)

    # create a list of vectors of all the tokens in a title
    vectors = [ model.wv[token] for token in title_tokens]
    #sum all the vectors in the list
    return np.sum(vectors, axis=0)

title_vecs = df['Title_Tokens'].map(get_title_vec)
#Add title vecs as a column in df 
df.insert(2, 'Title_Vec', title_vecs)
df.head()



Unnamed: 0,Title,Title_Tokens,Title_Vec,Job Role,Job Function,Job Level
0,manager-cybersecurity,"[manager, cybersecurity]","[0.16279401, 0.7852854, -0.57887846, -0.300722...",Information Security,IT,Manager
1,"manager, information security","[manager, information, security]","[-0.58274347, 0.3675683, -0.20381153, -1.50329...",Information Security,IT,Manager
2,user experience analyst,"[user, experience, analyst]","[-0.35012373, 0.25453788, -0.5823362, -2.70408...",,Engineering,Contributor
3,network specialist,"[network, specialist]","[-1.201381, 0.30538723, -0.57608116, -0.110826...",Networking,IT,Contributor
4,director privacy compliance,"[director, privacy, compliance]","[-0.4904934, -0.32818222, -1.7037625, -1.49155...",Information Security,IT,Director


In [42]:
#create training and validation sets for FUNCTION, ROLE, LEVEL for training/crossvalidating classification models 
from sklearn.model_selection import train_test_split

trainX,validX,trainy,validy = train_test_split(df['Title_Vec'], df['Job Function'],train_size=0.80,random_state=123)

it_function = df[df['Job Function'].str.contains('IT')]
it_function['Job Role'] = it_function['Job Role'].replace(replacements_role)
it_function['Job Role'] = it_function['Job Role'].fillna('IT General')
trainX_role, validX_role, trainy_role, validy_role = train_test_split(it_function['Title_Vec'], it_function['Job Role'], train_size=0.80, random_state=123)
trainX_level, validX_level, trainy_level, validy_level = train_test_split(it_function['Title_Vec'], it_function['Job Level'], train_size=0.80, random_state=123)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [20]:
# Start with Bayes Naive Classifier to determine Job Function 
from sklearn.naive_bayes import GaussianNB as NBC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

nbc = NBC()
# Parameters to tune for NBC
nbc_params = {'var_smoothing': [0.1, 0.5, 1.0, 2.0]}

rand_search_nbc = RandomizedSearchCV(NBC(),nbc_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)

rand_search_nbc.fit(trainX.tolist(),trainy)
print(f"Classification Report with hold-out sample for best fit of NBC model:...\n")
print(classification_report(validy,rand_search_nbc.predict(validX.tolist())))
print("------------------------------------------------------")
print("Best estmator for NBC model for JOB FUNCTION: {}".format(rand_search_nbc.best_estimator_), )



Classification Report with hold-out sample for best fit of NBC model:...

                       precision    recall  f1-score   support

          Engineering       0.17      0.88      0.29      9123
                   IT       0.91      0.52      0.66     88874
              Non-ICP       0.47      0.23      0.31     11682
          Procurement       0.04      0.68      0.08       222
Risk/Legal/Compliance       0.02      0.47      0.04       176

             accuracy                           0.52    110077
            macro avg       0.33      0.56      0.28    110077
         weighted avg       0.80      0.52      0.59    110077

------------------------------------------------------
Best estmator for NBC model for JOB FUNCTION: GaussianNB(priors=None, var_smoothing=0.1)


In [49]:
rand_search_nbc_role = RandomizedSearchCV(NBC(),nbc_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_nbc_role.fit(trainX_role.tolist(),trainy_role)
print(f"  Report with hold-out sample for best fit of NBC model:...\n")
print(classification_report(validy_role,rand_search_nbc_role.predict(validX_role.tolist())))
print("------------------------------------------------------")
print("Best estmator for NBC model for JOB ROLE: {}".format(rand_search_nbc_role.best_estimator_), )



  Report with hold-out sample for best fit of NBC model:...

                      precision    recall  f1-score   support

         Development       0.15      0.28      0.19      1982
          IT General       0.38      0.15      0.21     14579
Information Security       0.75      0.47      0.58     39367
          Networking       0.49      0.83      0.61     29474
             Systems       0.23      0.36      0.28      1984

            accuracy                           0.53     87386
           macro avg       0.40      0.42      0.38     87386
        weighted avg       0.57      0.53      0.51     87386

------------------------------------------------------
Best estmator for NBC model for JOB ROLE: GaussianNB(priors=None, var_smoothing=0.1)


In [50]:
rand_search_nbc_level = RandomizedSearchCV(NBC(),nbc_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_nbc_level.fit(trainX_level.tolist(),trainy_level)
print(f"  Report with hold-out sample for best fit of NBC model:...\n")
print(classification_report(validy_level,rand_search_nbc_level.predict(validX_level.tolist())))
print("------------------------------------------------------")
print("Best estmator for NBC model for JOB LEVEL: {}".format(rand_search_nbc_level.best_estimator_), )



  Report with hold-out sample for best fit of NBC model:...

              precision    recall  f1-score   support

     C-level       0.71      0.82      0.76      9497
 Contributor       0.81      0.86      0.84     28226
    Director       0.78      0.85      0.82     18855
   Executive       0.54      0.41      0.46     11641
     Manager       0.87      0.78      0.82     19167

    accuracy                           0.78     87386
   macro avg       0.74      0.74      0.74     87386
weighted avg       0.77      0.78      0.77     87386

------------------------------------------------------
Best estmator for NBC model for JOB LEVEL: GaussianNB(priors=None, var_smoothing=0.1)


In [22]:
from sklearn.tree import DecisionTreeClassifier as DTC

dt_params = {'criterion':['gini'],
             'max_depth':[10, 20, 50],
             'min_samples_split':[2,5,10], # required to split
             'max_features':['sqrt','log2'],
             'random_state':[123]
            }

rand_search_DTC = RandomizedSearchCV(DTC(),dt_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_DTC.fit(trainX.tolist(),trainy)
print(f"Classification Report with hold-out sample for best fit of DTC model:...\n")
print(classification_report(validy,rand_search_DTC.predict(validX.tolist())))
print("------------------------------------------------------")
print("Best estmator for DTC model for JOB FUNCTION: {}".format( rand_search_DTC.best_estimator_), )
    

Classification Report with hold-out sample for best fit of DTC model:...

                       precision    recall  f1-score   support

          Engineering       0.87      0.92      0.89      9123
                   IT       0.97      0.97      0.97     88874
              Non-ICP       0.84      0.83      0.83     11682
          Procurement       0.63      0.66      0.64       222
Risk/Legal/Compliance       0.71      0.77      0.74       176

             accuracy                           0.95    110077
            macro avg       0.80      0.83      0.81    110077
         weighted avg       0.95      0.95      0.95    110077

------------------------------------------------------
Best estmator for DTC model for JOB FUNCTION: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=50, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min

In [55]:
rand_search_dtc_role = RandomizedSearchCV(DTC(),dt_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_dtc_role.fit(trainX_role.tolist(),trainy_role)
print(f"  Report with hold-out sample for best fit of DTC model:...\n")
print(classification_report(validy_role,rand_search_dtc_role.predict(validX_role.tolist())))
print("------------------------------------------------------")
print("Best estmator for DTC model for JOB ROLE: {}".format(rand_search_dtc_role.best_estimator_), )

  Report with hold-out sample for best fit of DTC model:...

                      precision    recall  f1-score   support

         Development       0.71      0.71      0.71      1982
          IT General       0.79      0.61      0.69     14579
Information Security       0.89      0.90      0.89     39367
          Networking       0.85      0.94      0.89     29474
             Systems       0.74      0.66      0.70      1984

            accuracy                           0.85     87386
           macro avg       0.80      0.76      0.78     87386
        weighted avg       0.85      0.85      0.85     87386

------------------------------------------------------
Best estmator for DTC model for JOB ROLE: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=50, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_sampl

In [56]:
rand_search_dtc_level = RandomizedSearchCV(DTC(),dt_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_dtc_level.fit(trainX_level.tolist(),trainy_level)
print(f"  Report with hold-out sample for best fit of DTC model:...\n")
print(classification_report(validy_level,rand_search_dtc_level.predict(validX_level.tolist())))
print("------------------------------------------------------")
print("Best estmator for DTC model for JOB LEVEL: {}".format(rand_search_dtc_level.best_estimator_), )



  Report with hold-out sample for best fit of DTC model:...

              precision    recall  f1-score   support

     C-level       0.93      0.93      0.93      9497
 Contributor       0.95      0.95      0.95     28226
    Director       0.95      0.95      0.95     18855
   Executive       0.92      0.92      0.92     11641
     Manager       0.94      0.93      0.94     19167

    accuracy                           0.94     87386
   macro avg       0.94      0.94      0.94     87386
weighted avg       0.94      0.94      0.94     87386

------------------------------------------------------
Best estmator for DTC model for JOB LEVEL: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=50, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presor

In [30]:
rf_params = {'n_estimators':[10,25],
             'criterion':['gini'],
             'max_depth':[10, 20, 50],
             'min_samples_split':[2,5,10], # required to split
             'max_features':['sqrt','log2'],
             'max_samples':[0.10,0.50],
             'random_state':[123]
            }

In [24]:
from sklearn.ensemble import RandomForestClassifier as RFC

rand_search_rfc_function = RandomizedSearchCV(RFC(),rf_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_rfc_function.fit(trainX.tolist(),trainy)
print(f"Classification Report with hold-out sample for best fit of rfc model:...\n")
print(classification_report(validy,rand_search_rfc_function.predict(validX.tolist())))
print("------------------------------------------------------")
print("Best estmator for RFC model for JOB FUNCTION: {}".format(rand_search_rfc_function.best_estimator_), )



Classification Report with hold-out sample for best fit of rfc model:...

                       precision    recall  f1-score   support

          Engineering       0.92      0.91      0.91      9123
                   IT       0.96      0.98      0.97     88874
              Non-ICP       0.90      0.79      0.84     11682
          Procurement       0.83      0.57      0.68       222
Risk/Legal/Compliance       0.97      0.71      0.82       176

             accuracy                           0.95    110077
            macro avg       0.92      0.79      0.84    110077
         weighted avg       0.95      0.95      0.95    110077

------------------------------------------------------
Best estmator for RFC model for JOB FUNCTION: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=0.5,
                       min_impurity_decrease=

In [25]:
rand_search_rfc_role = RandomizedSearchCV(RFC(),rf_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_rfc_role.fit(trainX_role.tolist(),trainy_role)
print(f"  Report with hold-out sample for best fit of RFC model:...\n")
print(classification_report(validy_role,rand_search_rfc_role.predict(validX_role.tolist())))
print("------------------------------------------------------")
print("Best estmator for RFC model for JOB ROLE: {}".format(rand_search_rfc_role.best_estimator_), )

  Report with hold-out sample for best fit of RFC model:...

                      precision    recall  f1-score   support

         Development       0.81      0.69      0.74      2067
          IT General       0.81      0.60      0.69     14611
Information Security       0.88      0.90      0.89     40182
          Networking       0.85      0.95      0.90     29988
             Systems       0.87      0.61      0.71      2019

            accuracy                           0.86     88867
           macro avg       0.84      0.75      0.79     88867
        weighted avg       0.86      0.86      0.85     88867

------------------------------------------------------
Best estmator for RFC model for JOB ROLE: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=0.5,
                       min_impurity_decrease=0.0, min_impurity_split=No

In [44]:

rand_search_rfc_level = RandomizedSearchCV(RFC(),rf_params, # positional arguments, model and parameter grid
                                    n_iter=10,
                                    scoring='f1_macro', 
                                    cv=5,
                                    random_state=123,
                                    n_jobs=-1)
    
rand_search_rfc_level.fit(trainX_level.tolist(),trainy_level)
print(f"  Report with hold-out sample for best fit of RFC model:...\n")
print(classification_report(validy_level,rand_search_rfc_level.predict(validX_level.tolist())))
print("------------------------------------------------------")
print("Best estmator for RFC model for JOB LEVEL: {}".format(rand_search_rfc_level.best_estimator_), )

  Report with hold-out sample for best fit of RFC model:...

              precision    recall  f1-score   support

     C-level       0.94      0.93      0.94      9497
 Contributor       0.96      0.96      0.96     28226
    Director       0.96      0.97      0.96     18855
   Executive       0.94      0.93      0.94     11641
     Manager       0.95      0.96      0.96     19167

    accuracy                           0.95     87386
   macro avg       0.95      0.95      0.95     87386
weighted avg       0.95      0.95      0.95     87386

------------------------------------------------------
Best estmator for RFC model for JOB LEVEL: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='log2',
                       max_leaf_nodes=None, max_samples=0.5,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
  