In [218]:
import pandas as pd
import json
import dateparser
from datetime import datetime
import math
import re
from collections import Counter
from tqdm import tqdm
import string
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix

# Data Processing

### Data Loading

In [2]:
class CandidateRecord():
    def __init__(self, educations, experiences, seniority):
        self.educations = educations
        self.experiences = experiences
        self.seniority = seniority

class EducationRecord():
    def __init__(self, school, description, degree, time):
        self.school = school
        self.description = description
        self.degree = degree
        self.time = time

class ExperienceRecord():
    def __init__(self, work, description, title, time, skills):
        self.employer = work
        self.description = description
        self.title = title
        self.skills = skills
        self.time = time


In [3]:
train_filepath = "./data/seniority.train"
test_filepath = "./data/seniority.test"

def load_data(filepath):
    candidates = []
    with open(train_filepath, "r") as train_file:
        for line in train_file:
            record = json.loads(line.strip())
            educations = [EducationRecord(e['school'], e['description'], e['degree'], e['time']) for e in record['education']]
            experiences =[ExperienceRecord(e['work'], e['description'], e['title'], e['time'], e['skills']) for e in record['experience']]
            candidate = CandidateRecord(educations, experiences, record['seniority_level'])
            candidates.append(candidate)

    return candidates

train_candidates = load_data(train_filepath)
test_candidates = load_data(test_filepath)



### Data Exploration

In [4]:
len(train_candidates), len(test_candidates)

(29969, 29969)

In [5]:
train_sen = [c.seniority for c in train_candidates]
test_sen = [c.seniority for c in test_candidates]

In [7]:
Counter(train_sen), Counter(test_sen)

(Counter({'Entry': 9927,
          'Senior': 3564,
          'Mid-Level': 8947,
          'Intern': 928,
          'Manager': 3892,
          'Director': 1768,
          None: 294,
          'Vice President': 447,
          'CXO': 202}),
 Counter({'Entry': 9927,
          'Senior': 3564,
          'Mid-Level': 8947,
          'Intern': 928,
          'Manager': 3892,
          'Director': 1768,
          None: 294,
          'Vice President': 447,
          'CXO': 202}))

### Data Cleaning

#### Time data

Time data seems to have various formats and arbitrary values. We will have to clean these to extract duration of work experience or figure out if the candidate has completed their degree.

The following code was written iteratively:
- First I eye-balled the time data for various samples
- Then I looked at text-only time data
- Then I looked at unknown time data (empty or "n/a" or "unknown")
- Then I added education/experience count to check if total count makes sense
    - This revealed that for some education/experience records, we have a single time



In [9]:
error_times = set()
textual_times = set()
textual_times_count = 0
empty_times_count = 0
total_times_count = 0
education_count = 0
for c in train_candidates:
    for e in c.educations:
        education_count += 1
        if e.time == []:
            empty_times_count += 2
        for time in e.time:
            total_times_count += 1
            try:
                if bool(re.match(r'^\D*$', time)):
                    textual_times.add(time)
                    textual_times_count += 1
            except Exception as e:
                error_times.add(time)

textual_times, textual_times_count, empty_times_count, total_times_count, education_count

({'', 'notKnown'}, 17054, 22866, 72403, 48242)

In [10]:
error_times = set()
textual_times = set()
textual_times_count = 0
empty_times_count = 0
total_times_count = 0
unknown_times_count = 0
experience_count = 0
for c in train_candidates:
    for e in c.experiences:
        experience_count += 1
        if e.time == []:
            empty_times_count += 2
            unknown_times_count += 2
        for time in e.time:
            total_times_count += 1
            try:
                if time in ['', 'N/A', 'notKnown']:
                    unknown_times_count += 1
                if bool(re.match(r'^\D*$', time)):
                    textual_times.add(time)
                    textual_times_count += 1
            except Exception as e:
                error_times.add(time)

textual_times, unknown_times_count, textual_times_count, empty_times_count, total_times_count, experience_count

({'',
  'Aujourd’hui',
  'Current',
  'N/A',
  'Present',
  'Presente',
  'Till Date',
  'current',
  'notKnown',
  'present'},
 989,
 27095,
 28,
 293478,
 152830)

So, for ~12% (0.5 * 23k/95k) of the education data and for ~0.16% (0.5 * 1k/300k) of the experience data, we do not have a valid time.

Assumptions:
1. I think we can ignore the time of graduation altogether since we do not have plan to extract any feature based on it. If we do want to check if, say, the person has completed their degree, then we would need a valid end time.

2. We can treat the 0.16% missing times for experience data as missing data. Can choose to ignore it considering its low volume.

3. I ran dateparser.parse() on all the above times and it ran successfully. Thus, I am assuming we either have valid dates or "text-only" dates that are shown above. If there is any date like "2022abc", then we count it as just another invalid date.

In [11]:
a = dateparser.parse("2022abc")
a == None

True

### College Degree 

In [29]:
degrees = []
for c in train_candidates:
    for e in c.educations:
        degrees.append(e.degree)
    
len(degrees), degrees[:10]

(48242,
 ['Master of Business Administration - MBA',
  'Bachelors of Science in Business Administration with a Concentration in Marketing',
  '',
  'Master of Science in Finance',
  'B.A in Applied Mathematics',
  'GED',
  'M.S.',
  'Bachelor of Science',
  'Bachelor of Arts',
  ''])

Need to remove punctuation to get terms like "be", "bsc", "msc" instead of "b", "e", "sc", "m"

The above list is also cluttered by department names. We can use a standand list of departments at stop words to fix this.

In [66]:
# Source: https://www.mydegreeguide.com/types-of-degrees/types-of-bachelor-degree/
# The dept list given above is augmented with the individual tokens found in the dept names itself e.g. "business administration" => "business", "administration"
depts = ["architecture", "arts", "business", "business administration", "science in business", "canon law", "computer science", "science in computer science", "criminal justice", "science in criminal justice", "divinity", "education", "science in education", "wireless engineering", "engineering", "science in engineering", "science in aerospace engineering", "science in agricultural engineering", "science in biological systems", "science in biosystems and agricultural engineering", "science in biological engineering", "biomedical engineering", "science in biomedical engineering", "science in chemical engineering", "science in chemical and biomolecular engineering", "science in chemical and materials engineering", "civil engineering", "science in civil engineering", "science in civil and infrastructure engineering", "computer engineering", "science in computer engineering", "science in computer science and engineering", "science in electrical and computer engineering", "electrical engineering", "science in electrical engineering", "science in engineering management", "science in environmental engineering", "fiber engineering", "science in industrial engineering", "science in manufacturing engineering", "science in manufacturing systems engineering", "science in materials science and engineering", "science in materials engineering", "mechanical engineering", "science in mechanical engineering", "science in metallurgical engineering", "science in mining engineering", "science in systems", "software engineering", "science in software engineering", "systems engineering", "science in systems engineering", "engineering technology", "science in engineering technology", "science in civil engineering technology", "science in computer engineering technology", "science in construction engineering technology", "science in drafting design technology", "science in electrical/electronics technology", "science in electrical engineering technology", "science in electro-mechanical engineering technology", "science in mechanical engineering technology", "fine arts", "forestry", "science in forest research", "hebrew letters", "journalism", "laws", "liberal studies", "literature", "marine science", "music", "nursing", "science in nursing", "pharmacy", "philosophy", "religious education", "science", "science in chemistry", "technology", 'administration', 'aerospace', 'agricultural', 'and', 'biological', 'biomedical', 'biomolecular', 'biosystems', 'canon', 'chemical', 'chemistry', 'civil', 'computer', 'construction', 'criminal', 'design', 'drafting', 'electrical', 'electro', 'electronics', 'environmental', 'fiber', 'fine', 'forest', 'hebrew', 'in', 'industrial', 'infrastructure', 'justice', 'law', 'letters', 'liberal', 'management', 'manufacturing', 'marine', 'materials', 'mechanical', 'metallurgical', 'mining', 'religious', 'research', 'software', 'studies', 'systems', 'wireless']

In [70]:
degrees = [s.translate(str.maketrans('', '', string.punctuation)) for s in degrees]

degree_vectorizer = CountVectorizer(stop_words=depts + ["of"], ngram_range=(1, 2), max_features=50)
cv_fit = degree_vectorizer.fit_transform(degrees)
popular_degrees = degree_vectorizer.get_feature_names_out()
popular_degree_freq = cv_fit.toarray().sum(axis=0)

sorted(zip(popular_degree_freq, popular_degrees), reverse=True)

[(10343, 'bachelor'),
 (4202, 'master'),
 (3740, 'degree'),
 (2705, 'bs'),
 (2643, 'diploma'),
 (2572, 'bachelors'),
 (1947, 'mba'),
 (1889, 'masters'),
 (1653, 'ba'),
 (1434, 'certificate'),
 (1403, 'ms'),
 (1370, 'school'),
 (1265, 'high'),
 (1235, 'high school'),
 (1162, 'bachelors degree'),
 (1085, 'information'),
 (1016, 'school diploma'),
 (866, 'associate'),
 (708, 'bachelor degree'),
 (707, 'associates'),
 (690, 'masters degree'),
 (648, 'certification'),
 (629, 'marketing'),
 (565, 'be'),
 (546, 'economics'),
 (514, 'applied'),
 (509, 'bachelor bs'),
 (477, 'phd'),
 (477, 'bsc'),
 (473, 'finance'),
 (418, 'general'),
 (396, 'master ms'),
 (394, 'international'),
 (386, 'commerce'),
 (376, 'btech'),
 (363, 'communication'),
 (346, 'mathematics'),
 (345, 'accounting'),
 (336, 'communications'),
 (301, 'ged'),
 (300, 'master degree'),
 (294, 'associates degree'),
 (286, 'bba'),
 (277, 'certified'),
 (277, 'bachelor commerce'),
 (274, 'psychology'),
 (271, 'msc'),
 (269, 'doctor')


This looks good!

In [119]:
sum(degree_vectorizer.transform(["bs", "ms"])).A

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0]], dtype=int64)


For titles, we don't need to remove any punctuation or other terms. We can simply count the n-grams.

In [50]:
titles = []
for c in train_candidates:
    for e in c.experiences:
        titles.append(e.title)
    
len(titles), titles[:50]

(152830,
 ['General Laborer',
  'Produce Associate',
  'Dietary Aide',
  'Sr. Supply Planner',
  'Supply Chain Manager',
  'Master Scheduler',
  'Master Scheduler',
  'Master Scheduler',
  'Master Scheduler',
  'Material Resource Planner',
  'Utility Worker',
  'Ramp Agent',
  'Lead',
  'Roasting lead',
  'Second Night Lead',
  '',
  'Fixed Income & Absolute Return Team Intern',
  'Real Estate Team Intern',
  'Financial Service Department Intern',
  'Shipping Clerk',
  'Unemployed',
  'Lift Driver/Module Operator',
  'Unemployed',
  'Order Picker/Unloader',
  'Global Environmental Expert',
  'Global Environmental Program Manager',
  'Global Environmental Program Manager',
  'Senior Health, Safety & Environment Manager',
  'Environmental Specialist, Latin America',
  'Manager, Safety & Environment',
  'Senior Pricing Manager',
  'Sr. Pricing & Wireless Solution Manager',
  'International Market Systems Engineer',
  '',
  'Engineering Section Manager',
  'Senior Systems Engineer',
  'Sen

In [104]:
title_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3), max_features=100)
cv_fit = title_vectorizer.fit_transform(titles)
popular_titles = title_vectorizer.get_feature_names_out()
popular_title_freq = cv_fit.toarray().sum(axis=0)

sorted(zip(popular_title_freq, popular_titles), reverse=True)

[(28073, 'manager'),
 (13544, 'engineer'),
 (11089, 'senior'),
 (9008, 'sales'),
 (8814, 'analyst'),
 (8733, 'software'),
 (8295, 'assistant'),
 (8027, 'intern'),
 (7810, 'director'),
 (6626, 'consultant'),
 (6365, 'business'),
 (5754, 'software engineer'),
 (5445, 'marketing'),
 (5143, 'project'),
 (4796, 'account'),
 (4630, 'associate'),
 (4569, 'development'),
 (4518, 'lead'),
 (4214, 'executive'),
 (4020, 'specialist'),
 (3583, 'operations'),
 (3425, 'developer'),
 (3356, 'sr'),
 (3039, 'coordinator'),
 (3038, 'research'),
 (3036, 'project manager'),
 (2894, 'management'),
 (2880, 'technical'),
 (2799, 'product'),
 (2697, 'customer'),
 (2599, 'services'),
 (2596, 'service'),
 (2574, 'supervisor'),
 (2494, 'data'),
 (2371, 'engineering'),
 (2329, 'account manager'),
 (2325, 'team'),
 (2292, 'support'),
 (2261, 'representative'),
 (2228, 'president'),
 (2111, 'program'),
 (2023, 'systems'),
 (1965, 'financial'),
 (1844, 'technician'),
 (1831, 'operator'),
 (1786, 'business developmen

This list looks decent. Most of the terms give an indication of the seniority of the corresponding role. I tried `max_features = 200` and saw not so informative terms at the end, so switched to 100 and we can see that the last few terms also have good indicative information.

In [122]:
sum(title_vectorizer.transform(["software engineer", "machine learning engineer"])).A

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [128]:
skills = []
for c in train_candidates:
    for e in c.experiences:
        if e.skills:
            skills += e.skills
    
len(skills), skills[:50]

(1361721,
 ['production',
  'labor',
  'floors',
  'safety',
  'damages',
  'customer service',
  'cleaning',
  'truck',
  'holidays',
  'fruit',
  'associate',
  'display',
  'production',
  'breakfast',
  'lunch',
  'customer service',
  'supply chain',
  'replenishment',
  'team',
  'operational metrics',
  'wip',
  'metrics',
  'components',
  'customer service',
  'supply chain',
  'replenishment',
  'team',
  'operational metrics',
  'metrics',
  'components',
  'wip',
  'range planning',
  'manufacturing',
  'projects',
  'planning',
  'material planning',
  'process',
  'procurement',
  'customer service',
  'manufacturing',
  'diverse team',
  'project',
  'inventory accuracy',
  'delivery',
  'price',
  'purchasing',
  'buyer',
  'vendors',
  'planning',
  'inventory'])

In [138]:
skills_vectorizer_unigram = CountVectorizer(max_features=200)
cv_fit = skills_vectorizer_unigram.fit_transform(skills)
popular_skills = skills_vectorizer_unigram.get_feature_names_out()
popular_skill_freq = cv_fit.toarray().sum(axis=0)

sorted(zip(popular_skill_freq, popular_skills), reverse=True)

[(51347, 'management'),
 (44808, 'sales'),
 (24902, 'business'),
 (23901, 'marketing'),
 (19089, 'data'),
 (17660, 'process'),
 (17017, 'analysis'),
 (15267, 'development'),
 (14516, 'customer'),
 (14225, 'operations'),
 (13284, 'training'),
 (11384, 'strategy'),
 (10694, 'financial'),
 (10072, 'service'),
 (10001, 'quality'),
 (9407, 'product'),
 (9321, 'planning'),
 (9285, 'clients'),
 (9098, 'software'),
 (8656, 'reports'),
 (8403, 'inventory'),
 (8287, 'solutions'),
 (8250, 'projects'),
 (8218, 'team'),
 (8144, 'project'),
 (7804, 'design'),
 (7773, 'production'),
 (7720, 'leadership'),
 (7555, 'implementation'),
 (7345, 'revenue'),
 (7197, 'research'),
 (6828, 'maintenance'),
 (6720, 'budget'),
 (6552, 'web'),
 (6545, 'support'),
 (6355, 'engineering'),
 (6355, 'cost'),
 (6320, 'application'),
 (6313, 'services'),
 (6046, 'employees'),
 (6018, 'integration'),
 (5894, 'media'),
 (5736, 'safety'),
 (5702, 'database'),
 (5544, 'office'),
 (5361, 'delivery'),
 (5348, 'tools'),
 (5311,

If we looked at skills n-grams in range (1, 3), we notice that they are dominated by unigrams. Thus, I have decided to take 200 unigrams and 100 bi/tri-grams (most of which are bigrams).

In [139]:
skills_vectorizer_bitri = CountVectorizer(max_features=100, ngram_range=(2, 3))
cv_fit = skills_vectorizer_bitri.fit_transform(skills)
popular_skills = skills_vectorizer_bitri.get_feature_names_out()
popular_skill_freq = cv_fit.toarray().sum(axis=0)

sorted(zip(popular_skill_freq, popular_skills), reverse=True)

[(4944, 'customer service'),
 (2753, 'social media'),
 (2602, 'business development'),
 (2275, 'supply chain'),
 (2120, 'cross functional'),
 (1738, 'project management'),
 (1723, 'sales team'),
 (1578, 'product development'),
 (1390, 'customer satisfaction'),
 (1273, 'human resources'),
 (1245, 'business requirements'),
 (1239, 'quality assurance'),
 (1204, 'management system'),
 (1193, 'data analysis'),
 (1183, 'process improvement'),
 (1141, 'software development'),
 (1126, 'business process'),
 (1087, 'change management'),
 (1061, 'continuous improvement'),
 (1006, 'account management'),
 (992, 'technical support'),
 (960, 'quality control'),
 (935, 'market research'),
 (880, 'business intelligence'),
 (877, 'performance management'),
 (851, 'risk management'),
 (818, 'business units'),
 (790, 'in house'),
 (784, 'professional services'),
 (782, 'product management'),
 (746, 'employee relations'),
 (742, 'senior management'),
 (736, 'market share'),
 (716, 'strategic planning'),
 (

In [143]:
sum(skills_vectorizer_unigram.transform(["technology", "engineer", "communication"])).A[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

We can use the tokenizer from Spacy if needed. Note we are using the English tokenizer here but it seems to do a decent job on French sentences too.

I tried using this tokenizer instead of the default tokenizer in CountVectorizer but the combination was not playing well around punctuations. And the default tokenizer does a decent job anyway, so I decided to keep it as is.

In [84]:
from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer

doc = tokenizer("inventory.\n\u25cf Unloaded freights/pallets from trailers.\n\u25cf Stocked and organized shelves according to procedures and policies.\n\u25cf Staged promotional")
frenchdoc = tokenizer("\u2022 V\u00e9rifier les ingr\u00e9dients d'adh\u00e9sif pr\u00e9par\u00e9s par le journalier (deuxi\u00e8me compte des ingr\u00e9dients).\n\u2022 Ins\u00e9rer les ingr\u00e9dients dans le m\u00e9langeur selon la proc\u00e9dure \u00e9tablie, afin d'obtenir un produit de premi\u00e8re qualit\u00e9 et ainsi rencontrer les normes de tests \u00e9tablies.\n\u2022 Proc\u00e9der au set-up de la ligne d'extrusion, c'est-\u00e0-dire couteau, rouleau, ")


In [90]:
doc, len(doc), doc[0], type(doc[0])

(inventory.
 ● Unloaded freights/pallets from trailers.
 ● Stocked and organized shelves according to procedures and policies.
 ● Staged promotional,
 27,
 inventory,
 spacy.tokens.token.Token)

In [87]:
frenchdoc, len(frenchdoc), frenchdoc[1]

(• Vérifier les ingrédients d'adhésif préparés par le journalier (deuxième compte des ingrédients).
 • Insérer les ingrédients dans le mélangeur selon la procédure établie, afin d'obtenir un produit de première qualité et ainsi rencontrer les normes de tests établies.
 • Procéder au set-up de la ligne d'extrusion, c'est-à-dire couteau, rouleau, ,
 66,
 Vérifier)

# Feature Engineering

Here are some features we can consider for a classification task:
1. Years of Work Experience
    - Fairly straightforward numerical feature
    - We will merge overlapping work experiences for simplicity -- can perhaps do a weighted sum in some fashion
2. Education level
    - We can have a feature corresponding to an education at each level (certificate, diploma, associate, bachelor, master, phd)
    - We can also use a CounterVectorizer -- that should capture the above unigram terms
3. Job Title is a strong indicator
    - Look for the presence of certain terms, perhaps perform one-hot encoding on the n-grams
4. Skills
    - Can create feature vectors from skills -- same as above, we can start with a CountVectorizer for the n-grams
    - Skills are specific to the industry/field but can be a good indicator of seniority.
5. Work and education descriptions
    - Can chunk the descriptions and create averaged LLM embeddings of the descriptions
6. Company and College ranking based features
    - If we have some ranking of various companies, we can perhaps create a feature out of this
    - Manager or CEO at Google probably denotes higher seniority than counterparts at other companies


*Let's start with the first four features and see how it goes*

### Years of Work Experience

In [12]:
a = datetime(2022, 10, 10)
b = datetime(2021, 2, 2)
a < b

False

In [176]:
# Possible values of time: {'', 'Aujourd’hui', 'Current', 'N/A', 'Present', 'Presente', 'Till Date', 'current', 'notKnown', 'present'}
unknown_time_values = set(['', 'notKnown', 'N/A', None])
today_time_values = set(['Aujourd’hui', 'Current', 'Present', 'Presente', 'Till Date', 'current', 'present'])


def merge_overlapping_intervals(array):
    array.sort() # sorts by 1st element of the 2-item list
    merged = [array[0]]
    for current in array:
        previous = merged[-1]
        if current[0] <= previous[1]:
            previous[1] = max(previous[1], current[1])
        else:
            merged.append(current)

    return merged


def extract_yoe(candidate: CandidateRecord):
    work_time_ranges = []
    for e in candidate.experiences:
        # If insufficient/invalid times are recorded for a given experience, we ignore them
        if len(e.time) < 2 or any([t in unknown_time_values for t in e.time]):
            continue
        start_time = dateparser.parse(e.time[0]) if e.time[0] not in today_time_values else dateparser.parse('today')
        end_time = dateparser.parse(e.time[1]) if e.time[1] not in today_time_values else dateparser.parse('today')
        work_time_ranges.append([start_time, end_time])
    

    if len(work_time_ranges) == 0:
        return 0
    else:
        work_time_ranges = merge_overlapping_intervals(work_time_ranges)
        yoe_in_months = sum([math.ceil((w[1] - w[0]).days / 30) for w in work_time_ranges]) 
        return yoe_in_months



### Creating feature vectors for train and test sets

In [179]:
# For each candidate, we will extract the features and then create a DataFrame from the records
# Length of feature vector = 1 (yoe) + 50 (degree) + 100 (title) + 200 (skills_uni) + 100 (skills_bitri) = 451
from tqdm import tqdm

def extract_features(c):
    c_vector = []

    c_vector.append(extract_yoe(c))

    degrees = [e.degree for e in c.educations]
    degrees = [s.translate(str.maketrans('', '', string.punctuation)) for s in degrees]
    c_vector += sum(degree_vectorizer.transform(degrees)).A[0].tolist() if degrees else [0]*50
    titles = [e.title for e in c.experiences]
    c_vector += sum(title_vectorizer.transform(titles)).A[0].tolist() if titles else [0]*100

    c_skills = []
    for e in c.experiences:
        if e.skills:
            c_skills += e.skills
    c_vector += sum(skills_vectorizer_unigram.transform(c_skills)).A[0].tolist() if c_skills else [0]*200
    c_vector += sum(skills_vectorizer_bitri.transform(c_skills)).A[0].tolist() if c_skills else [0]*100

    return c_vector
    

train_df = []
for idx, c in enumerate(tqdm(train_candidates)):
    try:
        train_df.append(extract_features(c))
    except Exception as e:
        print("Encountered error while parsing train candidate:", idx)
        print(e)

test_df = []
for idx, c in enumerate(tqdm(test_candidates)):
    try:
        test_df.append(extract_features(c))
    except Exception as e:
        print("Encountered error while parsing test candidate:", idx)
        print(e)
    

100%|██████████| 29969/29969 [24:43<00:00, 20.21it/s]  
100%|██████████| 29969/29969 [34:31<00:00, 14.47it/s] 


In [180]:
len(train_df), len(train_df[0])

(29969, 451)

### Create train and test datasets

In [203]:
label_map = {
    None: -1,
    "Intern": 0,
    "Entry": 1,
    "Mid-Level": 2,
    "Senior": 3,
    "Manager": 4,
    "Director": 5,
    "Vice President": 6,
    "CXO": 7
}

y_train = [label_map[c.seniority] for c in train_candidates]
y_test = [label_map[c.seniority] for c in test_candidates]

In [204]:
bad_train_indices = [idx for idx, label in enumerate(y_train) if label == -1]
bad_test_indices = [idx for idx, label in enumerate(y_test) if label == -1]

len(bad_train_indices), len(bad_test_indices)

(294, 294)

In [200]:
X_train = pd.DataFrame(train_df)
X_test = pd.DataFrame(test_df)

X_train = X_train.drop(index=bad_train_indices)
X_test = X_test.drop(index=bad_test_indices)


In [205]:
y_train = [label for idx, label in enumerate(y_train) if idx not in bad_train_indices]
y_test = [label for idx, label in enumerate(y_test) if idx not in bad_test_indices]

In [206]:

X_train.shape, X_test.shape, len(y_train), len(y_test)

((29675, 451), (29675, 451), 29675, 29675)

# Model Training

In [250]:
clf = XGBClassifier(n_estimators=300, max_depth=15, learning_rate=0.03, n_jobs=-1, gpu_id=0, tree_method="gpu_hist")
clf.fit(X_train, y_train)

# Evaluation

In [251]:
train_preds = clf.predict(X_train)
accuracy_score(y_train, train_preds), balanced_accuracy_score(y_train, train_preds)

(0.978904802021904, 0.9793757609132243)

In [252]:
preds = clf.predict(X_test)


In [253]:

accuracy_score(y_test, preds), balanced_accuracy_score(y_test, preds)

(0.978904802021904, 0.9793757609132243)

In [254]:
confusion_matrix(y_test, preds)

array([[ 903,   21,    2,    2,    0,    0,    0,    0],
       [  13, 9649,  232,   10,   21,    1,    1,    0],
       [   2,  132, 8694,   39,   71,    9,    0,    0],
       [   0,    6,   10, 3542,    5,    1,    0,    0],
       [   0,    6,   15,    1, 3869,    1,    0,    0],
       [   0,    4,    2,    1,    1, 1760,    0,    0],
       [   0,    3,    2,    1,    1,    0,  440,    0],
       [   0,    2,    7,    0,    1,    0,    0,  192]], dtype=int64)