In [3]:
!pip install gensim

import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from keras.models import Sequential
from keras.layers import Dense

# Load datasets
jd_df = pd.read_csv('/content/DataScientist.csv')
resume_df = pd.read_csv('/content/UpdatedResumeDataSet.csv')

# Preprocessing text
import re
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    return text

jd_df['Job Description'] = jd_df['Job Description'].apply(preprocess)
resume_df['Resume'] = resume_df['Resume'].apply(preprocess)

# Combine and tag documents
documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(resume_df['Resume'])]
jd_documents = [TaggedDocument(doc.split(), ['jd_{}'.format(i)]) for i, doc in enumerate(jd_df['Job Description'])]

# Train Doc2Vec model
model_d2v = Doc2Vec(documents + jd_documents, vector_size=50, window=2, min_count=1, workers=4)




In [4]:
resume_vectors = [model_d2v.infer_vector(doc.words) for doc in documents]
jd_vectors = [model_d2v.infer_vector(doc.words) for doc in jd_documents]


In [40]:
# Assume first job description matches with the first 100 resumes (simplification)
labels = [1 if i < 10 else 0 for i in range(len(resume_vectors))]  # Simplified example

# Split the data
X_train, X_test, y_train, y_test = train_test_split(resume_vectors, labels, test_size=0.3, random_state=42)


In [47]:
import numpy as np

# Convert lists to numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam

# Define the enhanced neural network model
model = Sequential()
model.add(Dense(256, input_dim=50, activation='relu'))  # Increased the number of neurons
model.add(BatchNormalization())  # Batch Normalization layer
model.add(Dropout(0.3))  # Dropout layer with 30% rate

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(1, activation='sigmoid'))  # Output layer remains the same

# Compile the model with advanced optimizer
optimizer = Adam(learning_rate=0.001)  # Customizable learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# Train the model
model.fit(X_train, y_train, epochs=30, validation_data=(X_test, y_test))

# Evaluate the model
train_loss, train_acc = model.evaluate(X_train, y_train)
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Train Accuracy:', train_acc)
print('Test Accuracy:', test_acc)






Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train Accuracy: 0.9881129264831543
Test Accuracy: 0.9653978943824768


In [48]:
def get_top_resumes_for_jd(jd_text, top_n=5):
    # Preprocess the JD text
    jd_text_clean = preprocess(jd_text)

    # Convert JD text to a vector
    jd_vector = model_d2v.infer_vector(jd_text_clean.split()).reshape(1, -1)

    # Create a matrix of JD vector repeated for each resume in X_test
    jd_matrix = np.repeat(jd_vector, len(X_test), axis=0)

    # Compute the model's predictions for these vectors
    similarity_scores = model.predict(jd_matrix).flatten()

    # Get the indices of the top matching resumes
    top_indices = np.argsort(similarity_scores)[-top_n:][::-1]

    # Retrieve the top matching resumes
    return resume_df.iloc[top_indices]




In [49]:
# Example job description input
input_jd = jd_df['Job Description'].iloc[1]
print(sample_jd)




at noom we use scientifically proven methods to help our users create healthier lifestyles and manage important conditions like type ii diabetes obesity and hypertension our engineering team is at the forefront of this challenge solving complex technical and ux problems on our mobile apps that center around habits behavior and lifestyle we are looking for a data scientist to join our data team and help us ensure that we apply the best approaches to data analysis and research artificial intelligence and machine learning what you ll like about us we work on problems that affect the lives of real people our users depend on us to make positive changes to their health and their lives we base our work on scientifically proven peer reviewed methodologies that are designed by medical professionals we are a data driven company through and through we re a respectful diverse and dynamic environment in which engineering is a first class citizen and where you ll be able to work on a variety of inte

In [50]:
top_matching_resumes = get_top_resumes_for_jd(input_jd)
print(top_matching_resumes)

               Category                                             Resume
288  Health and fitness  education details may 2014 diploma nutrition e...
143       Web Designing  education details january 2016 b sc informatio...
91             Advocate  skills legal writing efficient researcher lega...
92             Advocate  good grasping quality and skillful work educat...
93             Advocate  â hard working â quick learnereducation detail...
