# `Resume Screening Tool`
#### `Author - Tanmay Khandelwal`

In [None]:
 #Mounting Gdrive
from google.colab import drive
# Mounts Google Drive at /content/drive. Later cells read the dataset through the
# relative path drive/MyDrive/..., so the notebook presumably runs from /content
# (TODO confirm when running outside the default Colab working directory).
drive.mount('/content/drive')

In [None]:
# NOTE(review): `!cd` runs in a temporary subshell, so it does not change the
# notebook's working directory; the later read_csv call still uses a path that
# starts at drive/MyDrive/. Use the `%cd` magic if a persistent change is intended
# (and adjust the dataset path accordingly).
!cd drive/MyDrive/

In [None]:
import pandas as pd 
# Location of the resume dataset, relative to the notebook's working directory
DATASET_PATH = 'drive/MyDrive/ResumeDataSet.csv'
dataset_df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first ten rows of the dataset
dataset_df.head(n=10)

In [None]:
# Number of resumes per category
dataset_df["Category"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Horizontal bar chart of how many resumes fall into each category
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(y="Category", data=dataset_df, ax=ax)

In [None]:
import re
#Helper Function to clean the dataset
def cleanResumeText(resumeText):
    """Strip web/social-media noise and punctuation from a resume string.

    The substitutions run in this order: URLs, standalone "RT"/"cc" tokens,
    hashtags, mentions, ASCII punctuation, non-ASCII characters, and finally
    every run of whitespace is collapsed to a single space.

    Args:
        resumeText: Raw resume text (str).

    Returns:
        The cleaned text. Whitespace is collapsed, not stripped, so a single
        leading or trailing space may remain.
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries keep this from eating letters inside ordinary words
    # (e.g. the "cc" in "account" or the "RT" in "SUPPORT").
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # replace non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
# Store a cleaned copy of every resume next to the raw text, then preview the frame
dataset_df['cleaned_resume'] = dataset_df['Resume'].apply(cleanResumeText)
dataset_df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Split into train/test sets (80% used for training and 20% used for testing).
# random_state pins the shuffle so a fresh "Run All" reproduces the same split.
# stratify keeps every Category represented in both halves, which the label
# encoder fitted on the training split relies on (it requires at least two rows
# per category).
train_df, test_df = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=42,
    stratify=dataset_df["Category"],
)

In [None]:
# Row counts of the training and test splits
len(train_df), len(test_df)

In [None]:
# Pull the cleaned resume text out of each split as plain Python lists
train_sentences = train_df["cleaned_resume"].to_list()
test_sentences = test_df["cleaned_resume"].to_list()
len(train_sentences), len(test_sentences)

In [None]:
# Peek at the first ten cleaned training resumes
train_sentences[:10]

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Encode the Category labels as dense one-hot rows. The encoder is fitted on the
# training split only and reused for the test split so both share one column order.
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword
    one_hot_encoder = OneHotEncoder(sparse=False)
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["Category"].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df["Category"].to_numpy().reshape(-1, 1))

# Check what training labels look like
train_labels_one_hot

In [None]:
# Same preview as the earlier cell: first ten cleaned training resumes
train_sentences[:10]

In [None]:
# Download pretrained TensorFlow Hub USE
import tensorflow_hub as hub
# Frozen (non-trainable) Universal Sentence Encoder layer loaded from TensorFlow Hub
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
tf_hub_embedding_layer = hub.KerasLayer(
    USE_MODEL_URL,
    name="universal_sentence_encoder",
    trainable=False,
)

In [None]:
# Size the output layer from the one-hot label matrix instead of hard-coding the
# class count, so the model keeps matching the labels if the dataset changes.
num_classes = train_labels_one_hot.shape[1]

inputs = layers.Input(shape=[], dtype=tf.string)  # one raw string per example
pretrained_embedding = tf_hub_embedding_layer(inputs)  # embed each string with the pretrained encoder
x = layers.Dense(128, activation="relu")(pretrained_embedding)  # fully connected layer on top of the embedding
outputs = layers.Dense(num_classes, activation="softmax")(x)  # one probability per category
model = tf.keras.Model(inputs=inputs,
                       outputs=outputs)

# Compile the model (categorical cross-entropy pairs with the one-hot labels)
model.compile(loss="categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()


In [None]:
# Convert the training features and labels to NumPy arrays before fitting, then preview five of each
train_sentences, train_labels_one_hot = map(np.asarray, (train_sentences, train_labels_one_hot))
train_sentences[:5], train_labels_one_hot[:5]


In [None]:
# Force tf.functions to run eagerly during training. Prefer the stable
# `tf.config.run_functions_eagerly`; the `experimental_` name is a deprecated alias
# that is only used here as a fallback for TensorFlow builds that lack the new one.
_run_functions_eagerly = getattr(tf.config, "run_functions_eagerly", None)
if _run_functions_eagerly is None:
    _run_functions_eagerly = tf.config.experimental_run_functions_eagerly
_run_functions_eagerly(True)
# Fit feature extractor model for 10 epochs
model.fit(train_sentences, train_labels_one_hot, epochs=10)

In [None]:
# Convert the held-out split to NumPy arrays and score the trained model on it
test_sentences, test_labels_one_hot = (
    np.asarray(values) for values in (test_sentences, test_labels_one_hot)
)
model.evaluate(x=test_sentences, y=test_labels_one_hot)

In [None]:
# Softmax class probabilities predicted for every test resume
model_pred_probs = model.predict(x=test_sentences)
model_pred_probs

In [None]:
# Collapse each probability row to the index of its highest-scoring class
model_preds = tf.math.argmax(model_pred_probs, axis=1)
model_preds


In [None]:
# Optional: uncomment the next line to save the trained model to disk.
#model.save("model.h5") 


In [None]:
#Predicting on Resumes at random
def predict_on_resume(model, sentence):
    """Predict the class index for a single resume string.

    Args:
        model: Object with a Keras-style ``predict`` method that accepts a batch
            of strings and returns one row of class scores per input.
        sentence: Resume text to classify (str).
            NOTE(review): the training text was passed through cleanResumeText
            first; callers should presumably clean ``sentence`` the same way.

    Returns:
        The result of ``tf.argmax(..., axis=1)`` over the predicted scores, i.e.
        one class index for the single input.
    """
    # Pass an explicit 1-element string tensor: a bare Python list is Keras's
    # convention for "list of model inputs", not "batch of examples".
    batch = tf.constant([sentence], dtype=tf.string)
    pred_prob = model.predict(batch)
    pred_label = tf.argmax(pred_prob, axis=1)
    return pred_label

In [None]:
# Sample resume text used to try out the classifier on a single input
test_resume = "Education Details August 2010 to May 2017 BE Electronics Communication Jabalpur Madhya Pradesh Takshshila institute of technology Java developer Skill Details Java Javascript Exprience 6 monthsCompany Details company Wab It Softwere Pvt Ltd description Jr Java Developer"

In [None]:
# Classify the sample resume with the trained model
test_abstract_preds = predict_on_resume(sentence=test_resume, model=model)

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df["Category"].to_numpy())
# The model's output columns follow one_hot_encoder's category order, so decoding
# through label_encoder is only valid if both encoders ordered the categories the
# same way. Check that explicitly instead of relying on it silently.
assert list(label_encoder.classes_) == list(one_hot_encoder.categories_[0]), \
    "LabelEncoder and OneHotEncoder disagree on class order"
# Convert the predicted indices to plain ints before indexing the class array
test_abstract_pred_classes = [label_encoder.classes_[int(i)] for i in np.asarray(test_abstract_preds)]
print(f'Predicted Category for Resume: {test_abstract_pred_classes[0]}')