In [1]:
#!pip install pandas
#!pip install nltk
# !pip install langdetect
# !pip install scikit-learn gensim
import pandas as pd


In [50]:
df = pd.read_csv('Historical Lead Records.csv', encoding='latin1')

df.head()

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,Manager-Cybersecurity,Information Security,IT,Manager
1,"Manager, Information Security",Information Security,IT,Manager
2,User Experience Analyst,Development,Engineering,Contributor
3,Network Specialist,Networking,IT,Contributor
4,Director of Privacy and Compliance,Information Security,IT,Director


In [51]:
#make title column lowercase
df['Title'] = df['Title'].str.lower()

In [52]:
#data relableing to most frequent F/R/L
def most_frequent(x):
    if not x.empty and len(x.value_counts()) > 0:
        return x.value_counts().index[0]
    else:
        return None

grouped = df.groupby('Title').agg(most_frequent)
grouped.reset_index(inplace=True)

merged_df = pd.merge(df, grouped, on='Title', suffixes=('', '_most_frequent'))

merged_df.drop(['Job Role', 'Job Function', 'Job Level'], axis=1, inplace=True)

merged_df.rename(columns={'Job Role_most_frequent': 'Job Role', 'Job Function_most_frequent': 'Job Function', 'Job Level_most_frequent': 'Job Level'}, inplace=True)


KeyboardInterrupt: 

In [6]:
#1616 rows are completely empty
before_dropping = df.shape[0]
empty_rows= df.isna().all(axis=1).sum()
df = df.dropna(how='all')
after_dropping = df.shape[0]


print (before_dropping)
print (empty_rows)
print (after_dropping)

865671
1616
864055


In [7]:
question_mark = df['Title'].str.contains('\?').sum()
question_mark

148

In [8]:
# #replacing all question marks with empty strings
df['Title'] = df['Title'].str.replace('?', '')
question_mark1 = df['Title'].str.contains('\?').sum()
question_mark1

0

In [9]:
df = df.dropna(subset=['Title'])
df.shape[0]

852981

In [12]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from joblib import Parallel, delayed
import numpy as np

def is_english(text):
    try:
        return detect(str(text)) == 'en'
    except LangDetectException:
        return False

def filter_english_titles(df):
    return df[df['Title'].apply(is_english)]

num_jobs = 4
chunks = np.array_split(df, num_jobs)
df_filtered_chunks = Parallel(n_jobs=num_jobs)(
    delayed(filter_english_titles)(chunk) for chunk in chunks
)
df_filtered = pd.concat(df_filtered_chunks)



In [4]:
df_filtered.shape[0]

550384

In [2]:
# since the cleaning steps above take a while to run, output df_filtered to csv
#df_filtered.to_csv('df_filtered.csv', index=False)
df_filtered = pd.read_csv('df_filtered.csv')

In [4]:
replacements = {'IT Audit / IT Compliance': 'Risk/Legal/Compliance',
               'IT': 'IT',
               'Engineering': 'Engineering',
               'Purchasing': 'Procurement',
               'Legal': 'Risk/Legal/Compliance',
               'Finance': 'Non-ICP',
               'Marketing': 'Non-ICP',
               'Sales': 'Non-ICP',
               'Unknown': 'Non-ICP',
               'Facilities': 'Non-ICP',
               'Human Resource': 'Non-ICP',
               'Management': 'Non-ICP',
               'Services': 'Non-ICP',
               'Operations': 'Non-ICP',
               'Administration': 'Non-ICP',
               'Corporate': 'Non-ICP',
               'Support': 'Non-ICP',
               'Education': 'Non-ICP',
               'Public Sector': 'Non-ICP',
               'Procurement': 'Procurement',
               'Medical': 'Non-ICP',
               'Other': 'Non-ICP',
               'IT Audit / IT Compliance': 'Risk/Legal/Compliance',
               'Information Security': 'IT',
               'IT - Security': 'IT',
               'Help Desk / Desktop Services': 'IT',
               'Information Technology': 'IT',
               'Infrastructure': 'Non-ICP',
               'Customer Service / Support': 'Non-ICP',
               'Emerging Technology / Innovation': 'IT'}

In [5]:
# Create dictionary for Job Role within IT function
replacements_role = {'Information Security': 'Information Security',
               'information security': 'Information Security',
               'Networking': 'Networking',
               'IT General': 'IT General',
               'None Technical': 'IT General',
               'Help Desk': 'IT General',
               'Governance Risk Compliance': 'IT General',
               'Program Management': 'IT General',
               'Data': 'IT General',
               'IT Facilities': 'IT General',
               'Operations': 'IT General',
               'Communications': 'IT General',
               'Integration': 'IT General',
               'Vendor Management': 'IT General',
               'Training': 'IT General',
               'Business Continuity': 'IT General',
               'Other': 'IT General',
               'Development': 'Development',
                'Security': 'Information Security',
                    'Business Systems': 'Systems'}

In [6]:
# Create dictionary for Job Level within all ICP functions
replacements_level = {'Manager': 'Manager',
                     'Contributor': 'Contributor',
                      'contributor': 'Contributor',
                     'Director': 'Director',
                     'C-Level': 'C-level',
                      'C-level': 'C-level',
                     'Executive': 'Executive',
                     'Unknown': pd.NA,
                      'Non-Manager': 'Contributor',
                      'VP-level': 'Executive',
                      'VP-Level': 'Executive',
                      'Decision maker': 'Manager',
                      'Team Lead': 'Manager',
                      'VP/Director': 'Director',
                      'Engineer/Admin': 'Contributor',
                      'CxO': 'C-level',
                      'VP': 'Executive',
                      'Director / C-Level': 'C-level',
                      'Individual Contributor': 'Contributor',
                      'Director Level': 'Director',
                      'contribtuor': 'Contributor',
                      'Management': 'Manager',
                      'Director of Enterprise Cloud Business': 'Director',
                      'Admin': 'Contributor'
                     }

In [7]:
df_filtered['Job Function'] = df_filtered['Job Function'].replace(replacements)
df_filtered['Job Function'] = df_filtered['Job Function'].fillna('Non-ICP')

In [36]:
df_filtered['Job Function'].unique()

array(['IT', 'Engineering', 'Procurement', 'Risk/Legal/Compliance',
       'Non-ICP'], dtype=object)

In [8]:
df_filtered['Job Level'] = df_filtered['Job Level'].replace(replacements_level)
df_filtered.loc[df_filtered['Job Function']=='Non-ICP', 'Job Level'] = pd.NA
df_filtered['Job Level'] = df_filtered['Job Level'].fillna(pd.NA)

In [9]:
def replace_role(row):
    if pd.isnull(row['Job Function']): 
        return 'N/A'
    elif 'IT' not in row['Job Function']:
        return 'N/A'
    else:
        return row['Job Role']

df_filtered['Job Role'] = df_filtered.apply(replace_role, axis=1)

In [11]:
#non_it = df_filtered[~df_filtered['Job Function'].str.contains('IT')]
#non_it.head()
df_filtered.head()

Unnamed: 0,Title,Job Role,Job Function,Job Level
0,manager-cybersecurity,Information Security,IT,Manager
1,"manager, information security",Information Security,IT,Manager
2,user experience analyst,,Engineering,Contributor
3,network specialist,Networking,IT,Contributor
4,director of privacy and compliance,Information Security,IT,Director


In [12]:
df_filtered['Job Level'].unique()

array(['Manager', 'Contributor', 'Director', 'C-level', 'Executive', <NA>],
      dtype=object)

In [10]:
df = df_filtered.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [14]:
# Remove any stop words from the titles
from gensim.parsing.preprocessing import remove_stopwords
df['Title'] = df['Title'].apply(remove_stopwords)

In [15]:
import gensim 
#Some more preprocessing that will remove punctuations. 
tokenised_titles = df['Title'].apply(gensim.utils.simple_preprocess)

In [16]:
#add the tokens as a column in df
df.insert(1, 'Title_Tokens', tokenised_titles)
df.head()

Unnamed: 0,Title,Title_Tokens,Job Role,Job Function,Job Level
0,manager-cybersecurity,"[manager, cybersecurity]",Information Security,IT,Manager
1,"manager, information security","[manager, information, security]",Information Security,IT,Manager
2,user experience analyst,"[user, experience, analyst]",,Engineering,Contributor
3,network specialist,"[network, specialist]",Networking,IT,Contributor
4,director privacy compliance,"[director, privacy, compliance]",Information Security,IT,Director


In [18]:
#initialize and train model
model = gensim.models.Word2Vec(window=2,min_count=1, workers=4)
model.build_vocab(df['Title_Tokens'])#this is a required step before training the model 
model.train(df['Title_Tokens'], total_examples=model.corpus_count, epochs=model.epochs) #default vector_size = 100


(4459757, 9898370)

In [73]:
model.wv.most_similar("manager")

[('director', 0.7374807000160217),
 ('leader', 0.6985123753547668),
 ('supervisor', 0.6763476133346558),
 ('mgr', 0.6627412438392639),
 ('manger', 0.6499350666999817),
 ('lead', 0.6399224996566772),
 ('specialist', 0.6316721439361572),
 ('head', 0.6070242524147034),
 ('analyst', 0.6062723994255066),
 ('coordinator', 0.6008464097976685)]

In [74]:
model.wv.most_similar("analyst")

[('analysts', 0.7701191306114197),
 ('specialist', 0.7637343406677246),
 ('intern', 0.6739217638969421),
 ('ofms', 0.6251246929168701),
 ('technician', 0.6185065507888794),
 ('enginner', 0.6131559014320374),
 ('spec', 0.6095673441886902),
 ('engineer', 0.6072471141815186),
 ('manager', 0.6062723398208618),
 ('manger', 0.5876338481903076)]

In [75]:
model.wv.most_similar("network")

[('noc', 0.6242579817771912),
 ('datacenter', 0.582085132598877),
 ('ip', 0.5743829011917114),
 ('networks', 0.5661808848381042),
 ('installation', 0.5608255863189697),
 ('operation', 0.5464958548545837),
 ('networking', 0.545594334602356),
 ('voice', 0.5450116395950317),
 ('carrier', 0.5387231707572937),
 ('grid', 0.5374865531921387)]

In [19]:
import numpy as np

#below function creates a vector for a title by adding the vector of each word in it
def get_title_vec(title_tokens):
    # create a list of vectors of all the tokens in a title
    vectors = [ model.wv[token] for token in title_tokens]
    #sum all the vectors in the list
    return np.sum(vectors, axis=0)

title_vecs = df['Title_Tokens'].map(get_title_vec)
#Add title vecs as a column in df 
df.insert(2, 'Title_Vec', title_vecs)
df.head()

Unnamed: 0,Title,Title_Tokens,Title_Vec,Job Role,Job Function,Job Level
0,manager-cybersecurity,"[manager, cybersecurity]","[-0.48963156, 1.7655406, -0.9755714, -1.016060...",Information Security,IT,Manager
1,"manager, information security","[manager, information, security]","[-0.39355576, 3.932649, -1.0444752, -1.3062502...",Information Security,IT,Manager
2,user experience analyst,"[user, experience, analyst]","[-0.9645748, 2.373046, -1.3231623, -2.7005827,...",,Engineering,Contributor
3,network specialist,"[network, specialist]","[-0.06601313, 2.3728542, -1.2169973, 0.4478888...",Networking,IT,Contributor
4,director privacy compliance,"[director, privacy, compliance]","[-0.16704276, 3.9424748, -0.60280246, -0.21131...",Information Security,IT,Director


In [11]:
import nltk
#nltk.download('stopwords')

In [12]:
# First set up DTM
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stops = stopwords.words('english')

# Vectorize titles 
vec = CountVectorizer(token_pattern = r'\b[a-zA-Z]{3,}[a-zA-Z]*\b',
                      max_df=0.5,
                      lowercase=True, 
                      stop_words=list(stops), 
                      max_features=1000, ngram_range=(1,2)) 
dtm = vec.fit_transform(df['Title'])

In [13]:
# Create DTM for titles that are in IT
it_function = df[df['Job Function'].str.contains('IT')]
it_function['Job Role'] = it_function['Job Role'].replace(replacements_role)
it_function['Job Role'] = it_function['Job Role'].fillna('IT General')

dtm_it = vec.fit_transform(it_function['Title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  it_function['Job Role'] = it_function['Job Role'].replace(replacements_role)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  it_function['Job Role'] = it_function['Job Role'].fillna('IT General')


In [14]:
# Create DTM for levels that are in ICP
icp_titles = df[df['Job Function']!='Non-ICP']
icp_titles = icp_titles.dropna(subset=['Job Level'])
dtm_icp = vec.fit_transform(icp_titles['Title'])

In [15]:
# Create training and validation samples
from sklearn.model_selection import train_test_split

trainX,validX,trainy,validy = train_test_split(dtm, df['Job Function'],train_size=0.80,random_state=123)
trainX_role, validX_role, trainy_role, validy_role = train_test_split(dtm_it, it_function['Job Role'], train_size=0.80, random_state=123)
trainX_level, validX_level, trainy_level, validy_level = train_test_split(dtm_icp, icp_titles['Job Level'], train_size=0.80, random_state=123)

In [16]:
# Start with Bayes Naive Classifier to determine Job Function 
from sklearn.naive_bayes import GaussianNB as NBC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix, classification_report
nbc = NBC()
rf = RFC()

In [19]:
# Parameters to tune for NBC
nbc_params = {'var_smoothing': [0.1, 0.5, 1.0, 2.0]}

rf_params = {'n_estimators':[10],
             'criterion':['gini','log_loss'],
             'max_depth':[None,10],
             'min_samples_split':[2,5], # required to split
             'max_features':[None,'sqrt','log2'],
             'max_samples':[None,0.10,0.75],
             'random_state':[123]
            }

from sklearn.model_selection import RandomizedSearchCV


In [19]:
################## JOB FUNCTION ######################
final_mods_function = []
for mod,params in zip([RFC()],[rf_params]):
#for mod,params in zip([NBC(), RFC()],[nbc_params, rf_params]): # zip creates a list of tuples we can use to iterate
    rand_search = RandomizedSearchCV(mod,params, # positional arguments, model and parameter grid
                                     n_iter=50,
                                     scoring='f1_macro', 
                                     cv=5,
                                     random_state=123,
                                     n_jobs=-1)
    
    rand_search.fit(trainX.toarray(),trainy)
    print(f"Classification Report with hold-out sample for best fit of this model:...\n")
    print(type(mod))
    print(classification_report(validy,rand_search.predict(validX.toarray())))
    print("------------------------------------------------------")
    final_mods_function.append(rand_search)

Classification Report with hold-out sample for best fit of this model:...

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
                       precision    recall  f1-score   support

          Engineering       0.91      0.95      0.93      9123
                   IT       0.97      0.98      0.97     88874
              Non-ICP       0.89      0.80      0.84     11682
          Procurement       0.76      0.81      0.79       222
Risk/Legal/Compliance       0.96      0.75      0.84       176

             accuracy                           0.96    110077
            macro avg       0.90      0.86      0.88    110077
         weighted avg       0.96      0.96      0.96    110077

------------------------------------------------------


In [20]:
print(f"Best estimator for Random Forest for JOB FUNCTION: {final_mods_function[0].best_estimator_} \n\n\n")
rand_search_function = final_mods_function[0].best_estimator_

Best estimator for Random Forest for JOB FUNCTION: RandomForestClassifier(max_features='sqrt', n_estimators=10, random_state=123) 





In [21]:
import joblib
best_params_function = rand_search_function.best_params_
joblib.dump(best_params_function, 'best_model_job_function.pkl')

AttributeError: 'RandomForestClassifier' object has no attribute 'best_params_'

In [20]:
################## JOB ROLE FOR IT ONLY ######################
final_mods_role = []
for mod,params in zip([RFC()],[rf_params]):
#for mod,params in zip([NBC(), RFC()],[nbc_params, rf_params]): # zip creates a list of tuples we can use to iterate
    rand_search = RandomizedSearchCV(mod,params, # positional arguments, model and parameter grid
                                     n_iter=50,
                                     scoring='f1_macro', 
                                     cv=5,
                                     random_state=123,
                                     n_jobs=-1)
    
    rand_search.fit(trainX_role.toarray(),trainy_role)
    print(f"Classification Report with hold-out sample for best fit of this model:...\n")
    print(type(mod))
    print(classification_report(validy_role,rand_search.predict(validX_role.toarray())))
    print("------------------------------------------------------")
    final_mods_role.append(rand_search)

Classification Report with hold-out sample for best fit of this model:...

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
                      precision    recall  f1-score   support

         Development       0.78      0.76      0.77      2067
          IT General       0.82      0.64      0.72     14611
Information Security       0.90      0.90      0.90     40182
          Networking       0.86      0.96      0.91     29988
             Systems       0.82      0.74      0.77      2019

            accuracy                           0.87     88867
           macro avg       0.84      0.80      0.81     88867
        weighted avg       0.87      0.87      0.87     88867

------------------------------------------------------


In [21]:
print(f"Best estimator for Random Forest for JOB ROLE FOR IT ONLY: {final_mods_role[0].best_estimator_} \n\n\n")
#rand_search_role = final_mods_role[0].best_estimator_
#best_params_role = rand_search_role.best_params_
#joblib.dump(best_params_role, 'best_model_job_role.pkl')

Best estimator for Random Forest for JOB ROLE FOR IT ONLY: RandomForestClassifier(max_features=None, n_estimators=10, random_state=123) 





In [22]:
final_mods_level = []
################## JOB LEVEL FOR ICP ONLY ######################
for mod,params in zip([NBC(), RFC()],[nbc_params, rf_params]): # zip creates a list of tuples we can use to iterate
    rand_search = RandomizedSearchCV(mod,params, # positional arguments, model and parameter grid
                                     n_iter=50,
                                     scoring='f1_macro', 
                                     cv=5,
                                     random_state=123,
                                     n_jobs=-1)
    
    rand_search.fit(trainX_level.toarray(),trainy_level)
    print(f"Classification Report with hold-out sample for best fit of this model:...\n")
    print(type(mod))
    print(classification_report(validy_level,rand_search.predict(validX_level.toarray())))
    print("------------------------------------------------------")
    final_mods_level.append(rand_search)



Classification Report with hold-out sample for best fit of this model:...

<class 'sklearn.naive_bayes.GaussianNB'>
              precision    recall  f1-score   support

     C-level       0.66      0.92      0.77      9596
 Contributor       0.85      0.91      0.88     34166
    Director       0.89      0.95      0.92     19362
   Executive       0.91      0.52      0.66     12198
     Manager       0.94      0.84      0.89     21266

    accuracy                           0.85     96588
   macro avg       0.85      0.83      0.82     96588
weighted avg       0.87      0.85      0.85     96588

------------------------------------------------------
Classification Report with hold-out sample for best fit of this model:...

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
              precision    recall  f1-score   support

     C-level       0.94      0.94      0.94      9596
 Contributor       0.95      0.96      0.95     34166
    Director       0.97      0.97      0.97 

In [23]:
print(f"Best estimator for NBC for JOB LEVEL FOR ICP ONLY: {final_mods_level[0].best_estimator_} \n\n\n")
print(f"Best estimator for Random Forest for JOB LEVEL FOR ICP ONLY: {final_mods_level[1].best_estimator_} \n\n\n")

#rand_search_level = final_mods_level[1].best_estimator_
#best_params_level = rand_search_level.best_params_
#joblib.dump(best_params_level, 'best_model_job_level.pkl')

Best estimator for NBC for JOB LEVEL FOR ICP ONLY: GaussianNB(var_smoothing=0.1) 



Best estimator for Random Forest for JOB LEVEL FOR ICP ONLY: RandomForestClassifier(max_features='sqrt', n_estimators=10, random_state=123) 





In [None]:
# Once final models are decided, need to create code to input CSV file with Record ID, Job Title
# Output will be Record ID, Job Title, Job Function, Job Role, Job Level  