# Experiment using LSTM

## Library Installing and Importing

In [1]:
#!pip install -r requirements.txt
#!pip install pandas
#!pip install numpy
#!pip install nltk
#!pip install tensorflow
#!pip install sklearn
#!pip install matplotlib

In [2]:
#import nltk
#nltk.download()

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

## Data Exploration

In [4]:
df = pd.read_csv('Resume.csv')

In [5]:
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
df = df[['Category','Resume_str']]
df.head()

Unnamed: 0,Category,Resume_str
0,HR,HR ADMINISTRATOR/MARKETING ASSOCIATE\...
1,HR,"HR SPECIALIST, US HR OPERATIONS ..."
2,HR,HR DIRECTOR Summary Over 2...
3,HR,HR SPECIALIST Summary Dedica...
4,HR,HR MANAGER Skill Highlights ...


In [7]:
df['Category'] = df['Category'].astype('category')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Category    2484 non-null   category
 1   Resume_str  2484 non-null   object  
dtypes: category(1), object(1)
memory usage: 22.7+ KB


In [9]:
df.value_counts()

Category              Resume_str                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

## Preprocessing

In [10]:
df['Text'] = df['Resume_str']

In [11]:
import string
def clean_text(series):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    cleaned_texts = []
    for text in series:
        # Tokenization
        tokens = word_tokenize(text.lower())
        # Removing punctuation and numeric values
        no_punct_tokens = [token for token in tokens if token not in string.punctuation and not token.isnumeric()]
        # Removing stop words
        no_stopwords_tokens = [token for token in no_punct_tokens if token not in stop_words]
        # Lemmatization
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in no_stopwords_tokens]
        # Join tokens back into a string
        cleaned_text = ' '.join(lemmatized_tokens)
        cleaned_texts.append(cleaned_text)
    return cleaned_texts

In [12]:
df['Text'] = clean_text(df['Text'])

In [13]:
df.head()

Unnamed: 0,Category,Resume_str,Text
0,HR,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,hr administrator/marketing associate hr admini...
1,HR,"HR SPECIALIST, US HR OPERATIONS ...",hr specialist u hr operation summary versatile...
2,HR,HR DIRECTOR Summary Over 2...,hr director summary year experience recruiting...
3,HR,HR SPECIALIST Summary Dedica...,hr specialist summary dedicated driven dynamic...
4,HR,HR MANAGER Skill Highlights ...,hr manager skill highlight hr skill hr departm...


### Dataset splitting

In [14]:
train_sentences, test_sentence, train_labels, test_label = train_test_split(df["Text"].to_numpy(),
                                                                            df["Category"].to_numpy(),
                                                                            test_size=0.2, # dedicate 10% of samples to validation set
                                                                            random_state=42) # random state for reproducibility

In [15]:
# Check the lengths
len(train_sentences), len(train_labels), len(test_label), len(test_sentence)

(1987, 1987, 497, 497)

In [16]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_sentences,
                                                                            train_labels,
                                                                            test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42)

In [17]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels), len(test_sentence), len(test_label)

(1788, 1788, 199, 199, 497, 497)

In [18]:
# Find average number of tokens (words) in training Tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

595

### Text-Vectorization

In [19]:
from tensorflow.keras.layers import TextVectorization
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 595 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [20]:
text_vectorizer.adapt(train_sentences)

In [21]:
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
bartender experience bartender 05/2018 company name – city state marketing public relation year maintain proper adequate set-up bar daily basis responsible maintaining stock preparing storing garnish juice perishable ensure product quality attend stand ups prior function obtain detail order execute event flawlessly set breakdown mobile bar per hotel standard greets guest warm genuine hospitality promote up-sells product handle production consumption sheet assist cash bar set closing process perform general cleaning task adhere health department regulation bartender byron nelson golf tournament may dallas texas greet guest stock bar complete pre-opening requirement maintain clean bar area prepare serve alcoholic beverage according standard recipe comply basic safety sanitation requirement organizational product operate club 's po work large small event minimal supervision team environment complete pre-closing requirement assure state local law club policy procedure servic

<tf.Tensor: shape=(1, 595), dtype=int64, numpy=
array([[3084,   20, 3084, 5008,    3,    6,  165,    4,    2,   29,  130,
          98,   47,  115,  297, 1905,  879, 1159,   80,  393,   57,  266,
         731,  525, 5387, 2974, 7203, 8151,   70,   26,   48, 1436, 2908,
        2442, 1015,  283,  817,  328,   78, 1123,   77, 8322,  300, 3357,
        1016, 1159,  289, 1030,  120, 6808,  367, 3158, 4803, 1597,  649,
           1,   26,  894,  109, 3077,  572,  178,  154, 1159,  300,  457,
          23,  436,  123, 1071,  225, 2781,  100,   53,  330, 3084, 6905,
        6761, 2025, 2947,  177, 2290,  848, 1828,  367,  731, 1159,  369,
           1,  136,  115,  609, 1159,  104,  240,  795, 3147,  887,  615,
         120, 1209, 2031,  395,  110,  984,  136,  215,   26, 1302,  488,
          22, 1021,   25,  565,  659,   77, 2765,  597,   13,  137,  369,
           1,  136, 2267,    2,  277,  341,  488,   65,   56,    8, 3147,
         887,  453,  908,  119,  367, 1800,  887,  102, 1691, 23

### Vocabulary Exploration

In [22]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'state', 'company', 'city']
Bottom 5 least common words: ['groomed', 'grir', 'greensboro', 'grandmother', 'grace']


### Embedding

In [23]:
from tensorflow.keras import layers

In [24]:
embedding = layers.Embedding(input_dim=max_vocab_length, output_dim=128, input_length=max_length,embeddings_initializer="uniform",name="embedding_1")
embedding

<keras.src.layers.core.embedding.Embedding at 0x22c27c85050>

In [25]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
g healthcare recruiter skill ad autism benefit billing coach coaching client client data processing delivery driver filing financial statement hiring inventory managing meeting access office network networking payroll personal coaching presentation pricing psychology recruiting researching research sale seminar staffing phone workshop experience healthcare recruiter city state company name jul mar managed full cycle recruiting process meet staffing goal developed strong relationship partnered hiring manager maximize effectiveness recruiting process tracked reported key metric partnered hiring manager interview team ensure job requirement clearly understood presented candidate coordinated interview process built network pipeline healthcare professional matched hospital partner culture sourced screened candidate meet high volume open position professional recruiting specialist city state company name jun dec sought interviewed reference checked placed contract employee com

<tf.Tensor: shape=(1, 595, 128), dtype=float32, numpy=
array([[[-0.04262102, -0.02050346,  0.04417828, ..., -0.01312882,
         -0.02120622, -0.04865101],
        [-0.02862908,  0.00402002,  0.01230298, ...,  0.04039789,
         -0.02662076, -0.02035967],
        [-0.04343648,  0.01945369,  0.02147225, ..., -0.01585481,
          0.00538943,  0.0351915 ],
        ...,
        [ 0.04506316,  0.02121254,  0.00491712, ..., -0.00324609,
         -0.02950019,  0.00708749],
        [ 0.04506316,  0.02121254,  0.00491712, ..., -0.00324609,
         -0.02950019,  0.00708749],
        [ 0.04506316,  0.02121254,  0.00491712, ..., -0.00324609,
         -0.02950019,  0.00708749]]], dtype=float32)>

In [26]:
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.04262102, -0.02050346,  0.04417828, -0.00113196, -0.02694937,
       -0.0133898 ,  0.01660541, -0.0026873 , -0.03173751,  0.04982923,
        0.04340024,  0.01507382,  0.04156459, -0.0231294 ,  0.0035365 ,
       -0.02414248,  0.0224316 ,  0.02051516,  0.0175018 , -0.03597834,
        0.03619237,  0.00591649,  0.0206244 ,  0.04674444, -0.04331241,
        0.04669333, -0.01317254, -0.040807  , -0.0216924 ,  0.01455827,
       -0.04554909, -0.01081259,  0.04933539,  0.03068426,  0.00581487,
       -0.03090394,  0.03950337,  0.04312036,  0.03167242, -0.0452873 ,
       -0.00629815, -0.04019935,  0.02979029, -0.02090092,  0.01655101,
        0.00512508,  0.02155492,  0.04694438,  0.03098785, -0.00189707,
        0.01805428, -0.04998851,  0.0445906 ,  0.02075892, -0.01209487,
       -0.03162821, -0.01901965,  0.00246859, -0.0461311 , -0.00467088,
       -0.03375901, -0.04902704, -0.00069468,  0.02606202,  0.02133126,
       -0.030029

### Model Building

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
                    ])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

In [28]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 56.78%


In [29]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
print(baseline_preds[:20])
print(val_labels[:20])

['SALES' 'AVIATION' 'INFORMATION-TECHNOLOGY' 'FITNESS' 'PUBLIC-RELATIONS'
 'PUBLIC-RELATIONS' 'PUBLIC-RELATIONS' 'PUBLIC-RELATIONS'
 'PUBLIC-RELATIONS' 'BANKING' 'FINANCE' 'BUSINESS-DEVELOPMENT' 'CHEF'
 'BANKING' 'FINANCE' 'BUSINESS-DEVELOPMENT' 'FINANCE' 'TEACHER'
 'ENGINEERING' 'FINANCE']
['AUTOMOBILE' 'CONSULTANT' 'INFORMATION-TECHNOLOGY' 'FITNESS'
 'PUBLIC-RELATIONS' 'DIGITAL-MEDIA' 'DIGITAL-MEDIA' 'PUBLIC-RELATIONS'
 'ARTS' 'BANKING' 'FINANCE' 'BUSINESS-DEVELOPMENT' 'CHEF' 'APPAREL'
 'BANKING' 'PUBLIC-RELATIONS' 'AVIATION' 'TEACHER' 'AGRICULTURE' 'TEACHER']


In [30]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [31]:
# Get baseline results
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 56.78391959798995,
 'precision': 0.5880908623833776,
 'recall': 0.5678391959798995,
 'f1': 0.5215564615323471}

Further library importing

In [32]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [33]:
X = df['Text'].values
y = df['Category'].astype('object').values

### Encoding

In [34]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert numerical labels to one-hot encoded vectors
y_one_hot = to_categorical(y_encoded)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)

# Create a TextVectorization layer
max_features = 5000
sequence_length = 100
vectorizer = TextVectorization(max_tokens=max_features, output_mode='int', output_sequence_length=sequence_length)
vectorizer.adapt(X_train)

# Build the LSTM model
num_classes = len(label_encoder.classes_)

### Compute class weights based on class distribution

Because the dataset is highly imbalanced, we need highier weight for low sample data.

In [35]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train.argmax(axis=1)), y=y_train.argmax(axis=1))
class_weights_dict = dict(enumerate(class_weights))

In [36]:
class_weights_dict

{0: 0.9302434456928839,
 1: 0.9408143939393939,
 2: 1.5053030303030304,
 3: 1.0752164502164503,
 4: 0.9740196078431372,
 5: 2.7597222222222224,
 6: 0.8624131944444444,
 7: 0.8999094202898551,
 8: 4.139583333333333,
 9: 0.8902329749103942,
 10: 0.8807624113475178,
 11: 1.061431623931624,
 12: 0.8714912280701754,
 13: 0.9408143939393939,
 14: 1.1660798122065728,
 15: 0.8535223367697594,
 16: 0.8362794612794613,
 17: 0.844812925170068,
 18: 0.8714912280701754,
 19: 0.8999094202898551,
 20: 0.8807624113475178,
 21: 0.8807624113475178,
 22: 0.9516283524904214,
 23: 1.0348958333333333}

### LSTM Model Building

In [37]:
model = Sequential([
    vectorizer,
    Embedding(input_dim=len(vectorizer.get_vocabulary()) + 1, output_dim=32, input_length=sequence_length),
    LSTM(64),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')  # Output categories based on number of classes
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, class_weight=class_weights_dict)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22c27c93f50>

In [39]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 2.6588, Test Accuracy: 0.2052


Thank you