In [1]:
from pymongo import MongoClient
import silence_tensorflow.auto
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import contractions
import re
import nltk
import pandas as pd
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# initializing Stop words libraries
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
client = MongoClient('mongodb://localhost:27017/') 

# Access the database
db = client['job-resume-db']  

# Access the collection
collection = db['job-descriptions'] 

In [3]:
documents = collection.find({})  

In [4]:
job_descriptions_df = pd.DataFrame(documents)

In [5]:
job_descriptions_df.head()

Unnamed: 0,_id,index,category,description
0,649cd598eed36cf2eea926a8,0,product manager,Title: Mobile Marketing Executive Full Descrip...
1,649cd598eed36cf2eea926a9,1,product manager,Title: Product Manager IT Channel Harrogate **...
2,649cd598eed36cf2eea926aa,2,product manager,Title: PR Account Executive/PR Account Manager...
3,649cd598eed36cf2eea926ab,3,product manager,Title: Product Marketing Manager Full Descript...
4,649cd598eed36cf2eea926ac,4,product manager,Title: PR Account Manager Full Description: PR...


In [6]:
job_descriptions = job_descriptions_df['description']

In [7]:
# Open the text file and read the lines into a list
with open('noisy_words.txt', 'r') as f:
    words = f.readlines()

# Remove any newline characters from each word
noisy_words = [word.strip() for word in words]

In [8]:
def clean_text(text):
    desc = contractions.fix(text)
    # for word in noisy_words:
    #     desc = re.sub(r'\b' + word + r'\b', '', desc)
    desc = re.sub("[!@.$\'\'':()]", "", desc)
    tokens = nltk.word_tokenize(desc)
    cleaned_tokens = [word for word in tokens if word not in noisy_words]
    cleaned_description = ' '.join(cleaned_tokens)
    return cleaned_description

In [9]:
def tokenize_and_tag(desc):
    tokens = nltk.word_tokenize(desc.lower())
    filtered_tokens = [w for w in tokens if not w in stop_words]
    tagged = nltk.pos_tag(filtered_tokens)
    return tagged

In [10]:
def extract_POS(tagged):
    #pattern 1
    grammar1 = ('''Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}''')
    chunkParser = nltk.RegexpParser(grammar1)
    tree1 = chunkParser.parse(tagged)

    # typical noun phrase pattern appending to be concatted later
    g1_chunks = []
    for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
        g1_chunks.append(subtree)

    #pattern 2
    grammar2 = ('''NP2: {<IN>?<JJ|NN>*<NNS|NN>} ''')
    chunkParser = nltk.RegexpParser(grammar2)
    tree2 = chunkParser.parse(tagged)

    # variation of a noun phrase pattern to be pickled for later analyses
    g2_chunks = []
    for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
        g2_chunks.append(subtree)

    #pattern 3
    grammar3 = (''' VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}''')
    chunkParser = nltk.RegexpParser(grammar3)
    tree3 = chunkParser.parse(tagged)

    # verb-noun pattern appending to be concatted later
    g3_chunks = []
    for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
        g3_chunks.append(subtree)


    # pattern 4
    # any number of a singular or plural noun followed by a comma followed by the same noun, noun, noun pattern
    grammar4 = ('''Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*} ''')
    chunkParser = nltk.RegexpParser(grammar4)
    tree4 = chunkParser.parse(tagged)

    # common pattern of listing skills appending to be concatted later
    g4_chunks = []
    for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
        g4_chunks.append(subtree)

    return g1_chunks, g2_chunks, g3_chunks, g4_chunks

In [11]:
def training_set(chunks):
    '''creates a dataframe that easily parsed with the chunks data '''
    df = pd.DataFrame(chunks)
    df.fillna('X', inplace = True)

    train = []
    for row in df.values:
        phrase = ''
        for tup in row:
            # needs a space at the end for seperation
            phrase += tup[0] + ' '
        phrase = ''.join(phrase)
        # could use padding tages but encoder method will provide during
        # tokenizing/embeddings; X can replace paddding for now
        train.append( phrase.replace('X', '').strip())

    df['phrase'] = train

    #returns 50% of each dataframe to be used if you want to improve execution time
    # return df.phrase.sample(frac = 0.5)
    # Update: only do 50% if running on excel
    return df.phrase

def strip_commas(df):
    '''create new series of individual n-grams'''
    grams = []
    for sen in df:
        sent = sen.split(',')
        for word in sent:
            grams.append(word)
    return pd.Series(grams)

In [12]:
def generate_phrases(desc):
    desc = clean_text(desc)
    tagged = tokenize_and_tag(desc)
    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)
    c = training_set(g4_chunks)
    separated_chunks4 = strip_commas(c)
    phrases = pd.concat([training_set(g1_chunks),
                          training_set(g2_chunks),
                          training_set(g3_chunks),
                          separated_chunks4],
                            ignore_index = True )
    return phrases

In [13]:
extracted_POS_from_all_job_descriptions = []

# Assuming job_descriptions is a list of all your job descriptions
for desc in tqdm(job_descriptions, desc='Generating Phrases: '):
    phrases = generate_phrases(desc)
    extracted_POS_from_all_job_descriptions.append(phrases)

# Concatenate all series
all_phrases = pd.concat(extracted_POS_from_all_job_descriptions, ignore_index=True)

Generating Phrases:   0%|          | 0/3530 [00:00<?, ?it/s]

In [14]:
all_phrases.isnull().sum()

0

In [15]:
all_phrases

0                              title
1         mobile marketing executive
2                   full description
3                       appointments
4                         passionate
                     ...            
516607                        please
516608                        posted
516609                waterlooville 
516610             hampshire client 
516611                              
Length: 516612, dtype: object

In [14]:
noisy_words = list(set(noisy_words))

def preprocess_phrases(text):
    # remove leading and trailing spaces
    text = text.strip()
    # remove leading and trailing slashes
    text = text.strip('/')
    # remove noisy words
    text = ' '.join(word for word in text.split() if word not in noisy_words)
    return text

In [15]:
# Keep rows that are not in the noisy_words list
filtered_phrases = all_phrases[~all_phrases.isin(noisy_words)]

# Keep rows that don't contain special characters except '-', '/', and '.'
filtered_phrases = filtered_phrases[filtered_phrases.apply(lambda x: bool(re.match('^[a-zA-Z-/\. ]*$', x)))]

filtered_phrases = filtered_phrases.apply(lambda row: preprocess_phrases(row))

# Remove empty strings
filtered_phrases = filtered_phrases[filtered_phrases != '']

# Remove duplicates
filtered_phrases = filtered_phrases.drop_duplicates()

In [16]:
# Keep only phrases with at most 3 words
filtered_phrases = filtered_phrases[filtered_phrases.apply(lambda x: len(x.split(' ')) <= 3)]


In [21]:
#filtered_phrases.to_csv('phrases.csv',index=False) # Collect the phrases

## Training and Optimizing LSTM for classifying phrases as skill or no-skill

In [17]:
import pandas as pd
import silence_tensorflow.auto
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras_tuner.tuners import RandomSearch, BayesianOptimization
from keras_tuner.engine.hyperparameters import HyperParameters
import numpy as np
from tqdm.notebook import tqdm

import warnings

warnings.filterwarnings(action='ignore')

In [18]:
# Define custom column names
custom_columns = ['Text', 'Skills/No_Skills']

# Read the Excel file and specify the custom column names
skills_data = pd.read_excel('skills_no-skills_backup.xlsx', names=custom_columns)

In [19]:
skills_data.head()

Unnamed: 0,Text,Skills/No_Skills
0,a/b,skill
1,a/c reconciliations,skill
2,aaai qualifications,skill
3,aac,skill
4,aareon qlx,No-skill


In [20]:
# Drop rows with missing values
skills_data.dropna(inplace=True)

In [21]:
skills_data['Skills/No_Skills'].unique()

array(['skill', 'No-skill', 'No-Skill', 'Skill', 'no-skill'], dtype=object)

In [22]:
# Replace values in the 'Skills/No_Skills' column
skills_data['Skills/No_Skills'] = skills_data['Skills/No_Skills'].replace('No-skill', 'no-skill')\
.replace('Skill', 'skill')\
.replace('No-Skill','no-skill')

In [23]:
skills_data['Skills/No_Skills'].unique()

array(['skill', 'no-skill'], dtype=object)

In [24]:
def preprocess_text(text):
    # Replace '/' with space
    text = text.replace('/', ' ')

    # Remove extra spaces
    text = ' '.join(text.split())  # Remove extra spaces and replace with single space

    return text

In [25]:
# Apply the preprocessing function to the "Text" column, but keep the first 5 rows unchanged
first_five_rows = skills_data['Text'].iloc[:5]  # Extract the first five rows
rest_of_rows = skills_data['Text'].iloc[5:]  # Extract the rest of the rows

In [26]:
first_five_rows

0                    a/b
1    a/c reconciliations
2    aaai qualifications
3                    aac
4             aareon qlx
Name: Text, dtype: object

In [27]:
rest_of_rows

5                                       aat
6                                  aat acca
7                      aat close completion
8                       aat junior accounts
9                             aat qualified
                        ...                
67739                                rutter
67740                             rwi media
67741                          ryan henshaw
67742           ryanhaywardpeoplesourcecouk
67743    ryanhaywardpeoplesourcecouk source
Name: Text, Length: 67733, dtype: object

In [28]:
# Preprocess the rest of the rows
rest_of_rows = rest_of_rows.apply(preprocess_text)

In [29]:
first_five_rows

0                    a/b
1    a/c reconciliations
2    aaai qualifications
3                    aac
4             aareon qlx
Name: Text, dtype: object

In [30]:
# Combine the first five rows with the processed rest of the rows
skills_data['Text'] = pd.concat([first_five_rows, rest_of_rows])

In [16]:
# Saving the skills
skills_data[skills_data['Skills/No_Skills'] == 'skill'].to_csv('skills.csv',index=False)

In [31]:
texts = skills_data['Text'].values
labels = skills_data['Skills/No_Skills'].values

In [32]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(texts)

# Pad sequences
max_length = max([len(s.split()) for s in texts])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Encode labels
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_docs, encoded_labels, test_size=0.2, stratify=encoded_labels, random_state=42)

In [33]:
import pickle
tokenizer_information = {
    'tokenizer':tokenizer,
    'max_length':max_length
}

file_name = 'tokenizer_saved_information.pkl'
with open(file_name, 'wb') as f:
    pickle.dump(tokenizer_information, f)

In [21]:
def LSTM_Optimization(hp):
    embedding_dim = hp.Int('embedding_dims', min_value=50, max_value=300)
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(SpatialDropout1D(hp.Float('Spatial_Dropout', min_value=0.1, max_value=0.3, step=0.05)))

    # Add LSTM layers dynamically based on hyperparameter search
    for i in range(hp.Int('n_layers', 1, 5)):
        model.add(LSTM(hp.Int(f'lstm_{i}_units', min_value=8, max_value=100, step=32), return_sequences=True))
        model.add(Dropout(hp.Float(f'Dropout_rate_{i}', min_value=0, max_value=0.5, step=0.1)))
    model.add(LSTM(hp.Int('layer_2_neurons', min_value=8, max_value=100, step=32)))
    model.add(Dropout(hp.Float('Dropout_rate_last', min_value=0, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    # Optimize the Adam optimizer using Keras Tuner
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])  # Set learning rate choices
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=hp_learning_rate), metrics=['accuracy'])
    return model

In [22]:
import IPython
# Define the early stopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True,mode='max')
reduce_lr_on_plateau = ReduceLROnPlateau(
                                                         monitor = "val_loss",
                                                         factor = 0.1,
                                                         patience = 3,
                                                         verbose = 0,
                                                         mode = "min"
                                                         )

# define a callback to clear the training outputs at the end of every training step
class ClearTrainingOutput(tf.keras.callbacks.Callback):
  def on_train_end(*args, **kwargs):
    IPython.display.clear_output(wait = True)

In [23]:
# Perform RandomSearch with Keras Tuner
tuner = BayesianOptimization(LSTM_Optimization,
                              objective='val_accuracy',
                              max_trials=5,
                              directory='Optimizing LSTM',
                              project_name='skill_no-skill_classify')
tuner.search(X_train, y_train, epochs=100, validation_data=(X_test, y_test),
                          batch_size=512,
                          callbacks = [ClearTrainingOutput(),
                          early_stopping,
                          reduce_lr_on_plateau])

Trial 5 Complete [00h 01m 53s]
val_accuracy: 0.872453510761261

Best val_accuracy So Far: 0.8851491212844849
Total elapsed time: 00h 08m 41s


In [24]:
best_lstm_model = tuner.get_best_models(num_models=1)[0]

In [25]:
best_lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 190)           2376710   
                                                                 
 spatial_dropout1d (SpatialD  (None, 16, 190)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 16, 40)            36960     
                                                                 
 dropout (Dropout)           (None, 16, 40)            0         
                                                                 
 lstm_1 (LSTM)               (None, 16, 8)             1568      
                                                                 
 dropout_1 (Dropout)         (None, 16, 8)             0         
                                                        

In [26]:
# Evaluate the model
loss, accuracy = best_lstm_model.evaluate(X_test, y_test, verbose=1)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.8851


In [40]:
cleaned_job_descriptions = []

for desc in tqdm(job_descriptions, desc='Generating Phrases: '):
    desc = clean_text(desc)
    cleaned_job_descriptions.append(desc)

Generating Phrases:   0%|          | 0/3530 [00:00<?, ?it/s]

In [34]:
def extract_skills_using_lstm(trained_lstm_model, raw_job_description, tokenizer, max_length, noisy_words_path = 'noisy_words.txt'):
    # Tokenize and tag the new job description
    tagged_description = tokenize_and_tag(raw_job_description)

    # Extract parts of speech
    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged_description)

    c = training_set(g4_chunks)
    separated_chunks4 = strip_commas(c)
    phrases = pd.concat([training_set(g1_chunks),
                        training_set(g2_chunks),
                        training_set(g3_chunks),
                        separated_chunks4],
                            ignore_index = True)
    
    # Open the text file and read the lines into a list
    with open(noisy_words_path, 'r') as f:
        words = f.readlines()

    # Remove any newline characters from each word
    noisy_words = [word.strip() for word in words]
    
    noisy_words = list(set(noisy_words))

    def preprocess_phrases(text):
        # remove leading and trailing spaces
        text = text.strip()
        # remove leading and trailing slashes
        text = text.strip('/')
        # remove noisy words
        text = ' '.join(word for word in text.split() if word not in noisy_words)
        return text
    
    # Keep rows that are not in the noisy_words list
    filtered_phrases = phrases[~phrases.isin(noisy_words)]

    # Keep rows that don't contain special characters except '-', '/', and '.'
    filtered_phrases = filtered_phrases[filtered_phrases.apply(lambda x: bool(re.match('^[a-zA-Z-/\. ]*$', x)))]

    filtered_phrases = filtered_phrases.apply(lambda row: preprocess_phrases(row))

    # Remove empty strings
    filtered_phrases = filtered_phrases[filtered_phrases != '']

    # Remove duplicates
    filtered_phrases = filtered_phrases.drop_duplicates()

    # Keep only phrases with at most 3 words
    filtered_phrases = filtered_phrases[filtered_phrases.apply(lambda x: len(x.split(' ')) <= 3)]

    # create array of text
    text = np.array(filtered_phrases)

    # convert the array of text into sequence
    encoded_docs = tokenizer.texts_to_sequences(text)

    # pad the text sequence to make them equal length
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # perform the prediction using trained lstm model
    predictions = (trained_lstm_model.predict(padded_docs) > 0.5).astype('int32')

    # perform the prediction using lstm
    out = pd.DataFrame({'Phrase':filtered_phrases, 'Class':predictions.ravel()})

    # extract the skill from the predicted classification of phrases
    skills = out.loc[out['Class'] == 1]

    return skills['Phrase'].tolist()

In [44]:
skill_extraction_steps_information = {
    'clean_text':clean_text,
    'tokenize_and_tag':tokenize_and_tag,
    'extract_POS':extract_POS,
    'training_set':training_set,
    'tokenizer':tokenizer,
    'pad_sequences':pad_sequences,
    'max_length':max_length,
    'extract_skills_using_lstm':extract_skills_using_lstm
}

file_name = 'skill_extraction_steps_information.pkl'
with open(file_name, 'wb') as f:
    pickle.dump(skill_extraction_steps_information, f)

In [67]:
extract_skills_using_lstm(trained_lstm_model=best_lstm_model,
                          raw_job_description=cleaned_job_descriptions[2050],
                          tokenizer=tokenizer,
                          max_length=max_length)



['net/sql',
 'cms',
 'content management cms',
 'social networking',
 'new product features',
 'aspnet c',
 'html',
 'xhtml',
 'asp net developer',
 'sql server',
 'javascript',
 'css',
 'xml',
 'ajax',
 'asp net programmer',
 'programmer',
 'software engineer job',
 'net framework',
 'software developer',
 'intranet cms',
 'networking',
 'c']

## Saving the Model for later use

In [69]:
## Saving the trained LSTM model
best_lstm_model.save('trained_lstm_model_for_extracting_skills.hdf5')

In [None]:
#