# 1. Downloading the Resume Corpus

In [1]:
!git clone https://github.com/florex/resume_corpus.git
!mv resume_corpus resume_corpus_repo
!mkdir corpus && unzip resume_corpus_repo/resumes_corpus.zip -d corpus
!pip install scikit-multilearn

Cloning into 'resume_corpus'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 50 (delta 16), reused 0 (delta 0), pack-reused 23[K
Unpacking objects: 100% (50/50), done.
mkdir: cannot create directory ‘corpus’: File exists


# 2. Reading the corpus

In [2]:
import os
import pandas as pd

resume_text, resume_labels = {}, {}
for filename in os.listdir('corpus'):
  index = filename.replace('.lab', '').replace('.txt', '')
  with open('corpus/' + filename, encoding='latin1') as f:
    if filename.endswith('txt'):
      resume_text[index] = f.read()
    elif filename.endswith('lab'):
      resume_labels[index] = f.read().splitlines()

txt_series = pd.Series(resume_text, name='text')
lbl_series = pd.Series(resume_labels, name='labels')
df = pd.concat([txt_series, lbl_series], axis=1)

# 3. One-Hot Encoding

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
one_hot_encoding = pd.DataFrame(mlb.fit_transform(df['labels']), index=df.index, columns=mlb.classes_)
df = pd.concat([df.drop('labels', 1), one_hot_encoding], axis=1)

  after removing the cwd from sys.path.


# 4. Sampling

In [4]:
def check_dataframe():
  row_sum=df.iloc[:,1:].sum(axis=1)

  print("Total number of articles = ", len(df))
  print("Total number of articles without label = ", row_sum[row_sum==0].count())
  print("Total labels = ", row_sum.sum())
  print("\nCount by label:")
  print(df.iloc[:,1:].sum())
  print('-------------------------------------------')

check_dataframe()
df = df.sample(n=20000, random_state=1)
check_dataframe()

Total number of articles =  29783
Total number of articles without label =  748
Total labels =  52972

Count by label:
Database_Administrator     3299
Front_End_Developer        3977
Java_Developer             3252
Network_Administrator      4460
Project_manager            4550
Python_Developer           2836
Security_Analyst           3022
Software_Developer        15013
Systems_Administrator      5969
Web_Developer              6594
dtype: int64
-------------------------------------------
Total number of articles =  20000
Total number of articles without label =  473
Total labels =  35554

Count by label:
Database_Administrator     2244
Front_End_Developer        2677
Java_Developer             2158
Network_Administrator      2970
Project_manager            3073
Python_Developer           1897
Security_Analyst           2032
Software_Developer        10074
Systems_Administrator      4014
Web_Developer              4415
dtype: int64
-------------------------------------------


# 5. Text Cleanup

In [5]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def remove_html_tags(text):
  return re.sub("<.*?>", "", text)

def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

def remove_non_alphabetical_symbols(text):
    text = re.sub("[^a-zA-Z]", " ", text) 
    return ' '.join(text.split()).lower()

def stemming(text):
    return ' '.join([stemmer.stem(x) for x in text.split()])
 
df['text'] = df['text']\
              .apply(remove_html_tags)\
              .apply(remove_stopwords)\
              .apply(remove_non_alphabetical_symbols)\
              .apply(stemming)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 6. Training and Testing Models

In [6]:
from sklearn.model_selection import train_test_split

X, Y = df['text'], df.iloc[:,1:]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [7]:
import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import hamming_loss, accuracy_score, classification_report, multilabel_confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from sklearn.linear_model import LogisticRegression

def evaluate_model(pipeline):
  print('----------------------------------------------------------------------')
  print("Initializing model training at", datetime.datetime.now())
  pipeline.fit(x_train, y_train)
  print("Initializing model testing at", datetime.datetime.now())
  predictions = pipeline.predict(x_test)
  print('MultiLabel Confusion Matrix:')
  print(multilabel_confusion_matrix(y_test, predictions))
  print('Accuracy (Exact Match Ratio) Score =', accuracy_score(y_test, predictions))
  print('Hamming Loss =', hamming_loss(y_test, predictions))
  print('Classification Report:')
  print(classification_report(y_test, predictions, digits=4, zero_division=0, target_names=y_test.keys()))
  print('----------------------------------------------------------------------')
 
evaluate_model(Pipeline([('tfidf', TfidfVectorizer(lowercase=False, ngram_range=(1, 1))),('brlr', BinaryRelevance(LogisticRegression(solver='sag', random_state=1)))]))
evaluate_model(Pipeline([('tfidf', TfidfVectorizer(lowercase=False, ngram_range=(1, 1))),('cclr', ClassifierChain(LogisticRegression(solver='sag', random_state=1)))]))
evaluate_model(Pipeline([('tfidf', TfidfVectorizer(lowercase=False, ngram_range=(1, 1))),('lbps', LabelPowerset(LogisticRegression(max_iter=200, random_state=1)))]))

----------------------------------------------------------------------
Initializing model training at 2022-08-21 22:10:09.476122
Initializing model testing at 2022-08-21 23:05:10.958220
MultiLabel Confusion Matrix:
[[[3524   16]
  [ 141  319]]

 [[3387   55]
  [ 134  424]]

 [[3542   33]
  [ 104  321]]

 [[3356   77]
  [ 196  371]]

 [[3354   53]
  [ 191  402]]

 [[3621   20]
  [  79  280]]

 [[3562   29]
  [ 119  290]]

 [[1928   72]
  [ 115 1885]]

 [[3106  112]
  [ 220  562]]

 [[2934  191]
  [ 283  592]]]
Accuracy (Exact Match Ratio) Score = 0.56075
Hamming Loss = 0.056
Classification Report:
                        precision    recall  f1-score   support

Database_Administrator     0.9522    0.6935    0.8025       460
   Front_End_Developer     0.8852    0.7599    0.8177       558
        Java_Developer     0.9068    0.7553    0.8241       425
 Network_Administrator     0.8281    0.6543    0.7310       567
       Project_manager     0.8835    0.6779    0.7672       593
      Pytho