In [1]:
#!pip install pandas
#!pip install nltk
# !pip install langdetect
# !pip install scikit-learn gensim
import pandas as pd


In [2]:
df = pd.read_csv('Historical Lead Records.csv', encoding='latin1')

df.head()

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,Manager-Cybersecurity,Information Security,IT,Manager
1,"Manager, Information Security",Information Security,IT,Manager
2,User Experience Analyst,Development,Engineering,Contributor
3,Network Specialist,Networking,IT,Contributor
4,Director of Privacy and Compliance,Information Security,IT,Director


In [3]:
#make title column lowercase
df['Title'] = df['Title'].str.lower()

df.shape[0]

865671

In [4]:
#data relableing to most frequent F/R/L
def most_frequent(x):
    if not x.empty and len(x.value_counts()) > 0:
        return x.value_counts().index[0]
    else:
        return None

grouped = df.groupby('Title').agg(most_frequent)
grouped.reset_index(inplace=True)

merged_df = pd.merge(df, grouped, on='Title', suffixes=('', '_most_frequent'))

merged_df.drop(['Job Role', 'Job Function', 'Job Level'], axis=1, inplace=True)

merged_df.rename(columns={'Job Role_most_frequent': 'Job Role', 'Job Function_most_frequent': 'Job Function', 'Job Level_most_frequent': 'Job Level'}, inplace=True)


In [5]:
na_counts = df.isna().sum()
print(na_counts)

Title           12690
Job Role         4502
Job Function     5083
Job Level       24546
dtype: int64


In [6]:
#1616 rows are completely empty
before_dropping = df.shape[0]
empty_rows= df.isna().all(axis=1).sum()
df = df.dropna(how='all')
after_dropping = df.shape[0]


print (before_dropping)
print (empty_rows)
print (after_dropping)

865671
1616
864055


In [7]:
question_mark = df['Title'].str.contains('\?').sum()
question_mark

148

In [8]:
# #replacing all question marks with empty strings
df['Title'] = df['Title'].str.replace('?', '')
question_mark1 = df['Title'].str.contains('\?').sum()
question_mark1

0

In [9]:
df = df.dropna(subset=['Title'])
df.shape[0]

852981

In [12]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from joblib import Parallel, delayed
import numpy as np

def is_english(text):
    try:
        return detect(str(text)) == 'en'
    except LangDetectException:
        return False

def filter_english_titles(df):
    return df[df['Title'].apply(is_english)]

num_jobs = 4
chunks = np.array_split(df, num_jobs)
df_filtered_chunks = Parallel(n_jobs=num_jobs)(
    delayed(filter_english_titles)(chunk) for chunk in chunks
)
df_filtered = pd.concat(df_filtered_chunks)



In [4]:
df_filtered.shape[0]

550384

In [5]:
df_filtered.head()

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,manager-cybersecurity,Information Security,IT,Manager
1,"manager, information security",Information Security,IT,Manager
2,user experience analyst,Development,Engineering,Contributor
3,network specialist,Networking,IT,Contributor
4,director of privacy and compliance,Information Security,IT,Director


In [14]:
replacements = {'IT Audit / IT Compliance': 'Risk/Legal/Compliance',
               'IT': 'IT',
               'Engineering': 'Engineering',
               'Purchasing': 'Procurement',
               'Legal': 'Risk/Legal/Compliance',
               'Finance': 'Non-ICP',
               'Marketing': 'Non-ICP',
               'Sales': 'Non-ICP',
               'Unknown': 'Non-ICP',
               'Facilities': 'Non-ICP',
               'Human Resource': 'Non-ICP',
               'Management': 'Non-ICP',
               'Services': 'Non-ICP',
               'Operations': 'Non-ICP',
               'Administration': 'Non-ICP',
               'Corporate': 'Non-ICP',
               'Support': 'Non-ICP',
               'Education': 'Non-ICP',
               'Public Sector': 'Non-ICP',
               'Procurement': 'Procurement',
               'Medical': 'Non-ICP',
               'Other': 'Non-ICP',
               'IT Audit / IT Compliance': 'Risk/Legal/Compliance',
               'Information Security': 'IT',
               'IT - Security': 'IT',
               'Help Desk / Desktop Services': 'IT',
               'Information Technology': 'IT',
               'Infrastructure': 'Non-ICP',
               'Customer Service / Support': 'Non-ICP',
               'Emerging Technology / Innovation': 'IT'}

In [15]:
# Create dictionary for Job Role within IT function
replacements_role = {'Information Security': 'Information Security',
               'information security': 'Information Security',
               'Networking': 'Networking',
               'IT General': 'IT General',
               'None Technical': 'IT General',
               'Help Desk': 'IT General',
               'Governance Risk Compliance': 'IT General',
               'Program Management': 'IT General',
               'Data': 'IT General',
               'IT Facilities': 'IT General',
               'Operations': 'IT General',
               'Communications': 'IT General',
               'Integration': 'IT General',
               'Vendor Management': 'IT General',
               'Training': 'IT General',
               'Business Continuity': 'IT General',
               'Other': 'IT General',
               'Development': 'Development',
                'Security': 'Information Security',
                    'Business Systems': 'Systems'}

In [16]:
df_filtered['Job Function'] = df_filtered['Job Function'].replace(replacements)
df_filtered['Job Function'] = df_filtered['Job Function'].fillna('Non-ICP')

In [17]:
df_filtered['Job Function'].unique()

array(['IT', 'Engineering', 'Procurement', 'Risk/Legal/Compliance',
       'Non-ICP'], dtype=object)

In [18]:
def replace_role(row):
    if pd.isnull(row['Job Function']): 
        return 'N/A'
    elif 'IT' not in row['Job Function']:
        return 'N/A'
    else:
        return row['Job Role']

df_filtered['Job Role'] = df_filtered.apply(replace_role, axis=1)

In [19]:
#non_it = df_filtered[~df_filtered['Job Function'].str.contains('IT')]
#non_it.head()
df_filtered.head()

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,manager-cybersecurity,Information Security,IT,Manager
1,"manager, information security",Information Security,IT,Manager
2,user experience analyst,,Engineering,Contributor
3,network specialist,Networking,IT,Contributor
4,director of privacy and compliance,Information Security,IT,Director


In [20]:
# since the cleaning steps above take a while to run, output df_filtered to csv
df_filtered.to_csv('df_filtered.csv', index=False)
df_filtered = pd.read_csv('df_filtered.csv')

In [21]:
df = df_filtered.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [22]:
# Remove any stop words from the titles
from gensim.parsing.preprocessing import remove_stopwords
df['Title'] = df['Title'].apply(remove_stopwords)

In [23]:
import gensim 
#Some more preprocessing that will remove punctuations. 
tokenised_titles = df['Title'].apply(gensim.utils.simple_preprocess)

In [107]:
tokenised_titles.head()

0            [manager, cybersecurity]
1    [manager, information, security]
2         [user, experience, analyst]
3               [network, specialist]
4     [director, privacy, compliance]
Name: Title, dtype: object

In [108]:
#add the tokens as a column in df
df.insert(1, 'Title_Tokens', tokenised_titles)
df.head()

Unnamed: 0,Title,Title_Tokens,Job Role,Job Function,Job Level
0,manager-cybersecurity,"[manager, cybersecurity]",Information Security,IT,Manager
1,"manager, information security","[manager, information, security]",Information Security,IT,Manager
2,user experience analyst,"[user, experience, analyst]",,Engineering,Contributor
3,network specialist,"[network, specialist]",Networking,IT,Contributor
4,director privacy compliance,"[director, privacy, compliance]",Information Security,IT,Director


In [109]:
#initialize and train model
model = gensim.models.Word2Vec(window=2,min_count=1, workers=4)
model.build_vocab(df['Title_Tokens'])#this is a required step before training the model 
model.train(df['Title_Tokens'], total_examples=model.corpus_count, epochs=model.epochs) #default vector_size = 100


(4458227, 9898370)

In [110]:
model.wv.most_similar("manager")

[('director', 0.7358762621879578),
 ('mgr', 0.6793808341026306),
 ('leader', 0.674602746963501),
 ('supervisor', 0.6448173522949219),
 ('manger', 0.6439144015312195),
 ('specialist', 0.6277974247932434),
 ('analyst', 0.6192675828933716),
 ('lead', 0.612668514251709),
 ('head', 0.6092585921287537),
 ('coordinator', 0.588858962059021)]

In [111]:
model.wv.most_similar("analyst")

[('specialist', 0.7811944484710693),
 ('analysts', 0.7446349263191223),
 ('technician', 0.6567827463150024),
 ('intern', 0.6476014852523804),
 ('anaylst', 0.6301019191741943),
 ('engineer', 0.6207007169723511),
 ('manager', 0.6192675232887268),
 ('manger', 0.6185704469680786),
 ('sisso', 0.6107224225997925),
 ('suny', 0.6050832867622375)]

In [112]:
model.wv.most_similar("network")

[('noc', 0.6227026581764221),
 ('networks', 0.5699936151504517),
 ('ip', 0.5679224133491516),
 ('installation', 0.5656102895736694),
 ('grid', 0.5579527616500854),
 ('contracing', 0.5548722743988037),
 ('datacenter', 0.553804337978363),
 ('wintel', 0.5414943099021912),
 ('infrastracture', 0.5413138270378113),
 ('variance', 0.5400795340538025)]

In [113]:
import numpy as np

#below function creates a vector for a title by adding the vector of each word in it
def get_title_vec(title_tokens):
    # create a list of vectors of all the tokens in a title
    vectors = [ model.wv[token] for token in title_tokens]
    #sum all the vectors in the list
    return np.sum(vectors, axis=0)

title_vecs = df['Title_Tokens'].map(get_title_vec)
#Add title vecs as a column in df 
df.insert(2, 'Title_Vec', title_vecs)
df.head()

Unnamed: 0,Title,Title_Tokens,Title_Vec,Job Role,Job Function,Job Level
0,manager-cybersecurity,"[manager, cybersecurity]","[-0.12915187, 1.4431016, -0.97067034, -0.44038...",Information Security,IT,Manager
1,"manager, information security","[manager, information, security]","[-0.80308545, 3.709479, -1.2591271, -0.8867336...",Information Security,IT,Manager
2,user experience analyst,"[user, experience, analyst]","[0.08790761, 1.834218, -0.5568815, -3.3963838,...",,Engineering,Contributor
3,network specialist,"[network, specialist]","[-0.2251893, 3.0408993, 0.011231333, 0.5964745...",Networking,IT,Contributor
4,director privacy compliance,"[director, privacy, compliance]","[-0.9724124, 3.0888586, -1.6333616, 0.85276854...",Information Security,IT,Director


In [114]:
import nltk
#nltk.download('stopwords')

In [115]:
# First set up DTM
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stops = stopwords.words('english')

# Vectorize titles 
vec = CountVectorizer(token_pattern = r'\b[a-zA-Z_]{3,}[a-zA-Z]*\b', # DO NOT REMOVE THIS LINE!!!!!!!!!
                      max_df=0.5,
                      lowercase=True, 
                      stop_words=list(stops), 
                      max_features=1000, ngram_range=(1,2)) 
dtm = vec.fit_transform(df['Title'])

In [154]:
# Create DTM for titles that are in IT
it_function = df[df['Job Function'].str.contains('IT')]
it_function['Job Role'] = it_function['Job Role'].replace(replacements_role)
it_function['Job Role'] = it_function['Job Role'].fillna('IT General')

dtm_it = vec.fit_transform(it_function['Title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  it_function['Job Role'] = it_function['Job Role'].replace(replacements_role)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  it_function['Job Role'] = it_function['Job Role'].fillna('IT General')


In [156]:
# Create training and validation samples
from sklearn.model_selection import train_test_split

trainX,validX,trainy,validy = train_test_split(dtm, df['Job Function'],train_size=0.80,random_state=123)
trainX_role, validX_role, trainy_role, validy_role = train_test_split(dtm_it, it_function['Job Role'], train_size=0.80, random_state=123)

In [132]:
# Start with Bayes Naive Classifier to determine Job Function 
from sklearn.naive_bayes import GaussianNB as NBC
from sklearn.metrics import confusion_matrix, classification_report
nbc = NBC()

In [124]:
# Parameters to tune for NBC
nbc_params = {'var_smoothing': [0.1, 0.5, 1.0, 2.0]}


In [160]:
from sklearn.model_selection import RandomizedSearchCV
final_mods = []
for mod,params in zip([NBC()],[nbc_params]): # zip creates a list of tuples we can use to iterate
    rand_search = RandomizedSearchCV(mod,params, # positional arguments, model and parameter grid
                                     n_iter=50,
                                     scoring='f1_macro', 
                                     cv=5,
                                     random_state=123,
                                     n_jobs=-1)
    
    rand_search.fit(trainX.toarray(),trainy)
    print(f"Classification Report with hold-out sample for best fit of this model:...\n")
    print(type(mod))
    print(classification_report(validy,rand_search.predict(validX.toarray())))
    print("------------------------------------------------------")
    final_mods.append(rand_search)

1 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\tools\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\tools\Anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\tools\Anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 263, in fit
    return self._partial_fit(
           ^^^^^^^^^^^^^^^^^^
  File "C:\tools\Anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 489, in _partial_

Classification Report with hold-out sample for best fit of this model:...

<class 'sklearn.naive_bayes.GaussianNB'>
                       precision    recall  f1-score   support

          Engineering       0.64      0.82      0.72      9123
                   IT       0.89      0.95      0.92     88874
              Non-ICP       0.84      0.20      0.32     11682
          Procurement       0.60      0.17      0.26       222
Risk/Legal/Compliance       0.73      0.34      0.47       176

             accuracy                           0.86    110077
            macro avg       0.74      0.50      0.54    110077
         weighted avg       0.86      0.86      0.84    110077

------------------------------------------------------


In [161]:
for mod,params in zip([NBC()],[nbc_params]): # zip creates a list of tuples we can use to iterate
    rand_search = RandomizedSearchCV(mod,params, # positional arguments, model and parameter grid
                                     n_iter=50,
                                     scoring='f1_macro', 
                                     cv=5,
                                     random_state=123,
                                     n_jobs=-1)
    
    rand_search.fit(trainX_role.toarray(),trainy_role)
    print(f"Classification Report with hold-out sample for best fit of this model:...\n")
    print(type(mod))
    print(classification_report(validy_role,rand_search.predict(validX_role.toarray())))
    print("------------------------------------------------------")
    final_mods.append(rand_search)



Classification Report with hold-out sample for best fit of this model:...

<class 'sklearn.naive_bayes.GaussianNB'>
                      precision    recall  f1-score   support

         Development       0.30      0.67      0.42      2067
          IT General       0.70      0.41      0.52     14611
Information Security       0.83      0.79      0.81     40182
          Networking       0.81      0.85      0.83     29988
             Systems       0.28      0.78      0.42      2019

            accuracy                           0.75     88867
           macro avg       0.58      0.70      0.60     88867
        weighted avg       0.77      0.75      0.75     88867

------------------------------------------------------


In [162]:
print(f"Best estimator for NBC for Job Function: {final_mods[0].best_estimator_} \n\n\n")
rand_search = final_mods[0].best_estimator_
print(f"Classification report: {classification_report(validy,rand_search.predict(validX.toarray()))}")
print("------------------------------------------------------")

print(f"Best estimator for NBC for Job Role for IT function only: {final_mods[1].best_estimator_} \n\n\n")
rand_search = final_mods[1].best_estimator_
print(f"Classification report: {classification_report(validy_role,rand_search.predict(validX_role.toarray()))}")
print("------------------------------------------------------")



Best estimator for NBC for Job Function: GaussianNB(var_smoothing=1.0) 



Classification report:                        precision    recall  f1-score   support

          Engineering       0.64      0.82      0.72      9123
                   IT       0.89      0.95      0.92     88874
              Non-ICP       0.84      0.20      0.32     11682
          Procurement       0.60      0.17      0.26       222
Risk/Legal/Compliance       0.73      0.34      0.47       176

             accuracy                           0.86    110077
            macro avg       0.74      0.50      0.54    110077
         weighted avg       0.86      0.86      0.84    110077

------------------------------------------------------
Best estimator for NBC for Job Role for IT function only: GaussianNB(var_smoothing=0.1) 



Classification report:                       precision    recall  f1-score   support

         Development       0.30      0.67      0.42      2067
          IT General       0.70      

In [None]:
#  From here - create two additional Naive Bayes Classifiers
# (1) Tune a Naive Bayes Classifier to identify Job Role for IT Function
# (2) Tune Naive Bayes Classifier to identify Job Level for IT and non-IT (but not non-ICP)


# Also worth checking performance (if time) - is it better if there is just one Naive Bayes Classifier for all job title
# to determine function, role, level? 

In [None]:
# Once final models are decided, need to create code to input CSV file with Record ID, Job Title
# Output will be Record ID, Job Title, Job Function, Job Role, Job Level  