In [14]:
import os
import string
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support,confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
# Function to preprocess text
# Translation table that deletes every character in string.punctuation; built once.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Lazily-filled cache for the NLTK objects so they are created once, not per document.
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return ``(stop_words, stemmer)``, creating both on first use only."""
    if not _NLTK_RESOURCES:
        # Build the stop-word set first: if the corpus lookup raises, the cache
        # stays empty and the next call retries.
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['stemmer'] = PorterStemmer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise a document into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip every character found in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stop words, Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    stop_words, porter = _get_nltk_resources()

    words = word_tokenize(text.lower().translate(_PUNCTUATION_TABLE))

    return ' '.join(porter.stem(word) for word in words if word not in stop_words)


In [15]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a single column out of a DataFrame.

    Input that is not a ``pandas.DataFrame`` is passed through unchanged.
    """

    def __init__(self, column):
        # Label used to index a DataFrame in transform().
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.column]`` for DataFrame input, otherwise ``X`` itself."""
        if not isinstance(X, pd.DataFrame):
            return X
        return X[self.column]

In [16]:
# Load the resume dataset that is split below to train and evaluate the classifier.
# The path is relative to the notebook's working directory.
csv_path = "./Resume1.csv"
df = pd.read_csv(csv_path)

In [17]:
# Sanity check: (rows, columns) of the frame just loaded.
df.shape

(2484, 4)

In [18]:
# Split the dataset into train and test sets:
# 20% of the rows are held out for evaluation; random_state pins the shuffle
# so the same split is produced on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [19]:
# Assemble the resume classifier as three named steps:
# column selection -> TF-IDF features (text cleaned by preprocess_text) -> random forest.
pipeline_steps = [
    ('preprocessor', ColumnExtractor('Resume_str')),
    ('vectorizer', TfidfVectorizer(preprocessor=preprocess_text)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
]
text_clf = Pipeline(steps=pipeline_steps)

In [20]:
# Fit the pipeline on the training resumes and their category labels.
train_texts, train_labels = train_df['Resume_str'], train_df['Category']
text_clf.fit(train_texts, train_labels)

# Predicted category for every held-out resume.
predictions = text_clf.predict(test_df['Resume_str'])

In [21]:
# Ground-truth labels of the held-out rows, pulled out once and reused below.
y_true = test_df['Category']

# Compute every metric first, then print.
# NOTE(review): zero_division=1 scores an undefined precision/recall as 1.0,
# which raises the macro averages for classes that are never predicted —
# confirm this is the intended convention.
accuracy = accuracy_score(y_true, predictions)
conf_matrix_result = confusion_matrix(y_true, predictions)
report_text = classification_report(y_true, predictions, zero_division=1)
# Macro average: unweighted mean of the per-class precision, recall, and f1.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, predictions, average='macro', zero_division=1
)

print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix_result}')
print('\nClassification Report:\n', report_text)

# Macro-averaged summary
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.61
Confusion Matrix:
[[26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0 20  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  4  1  0  1  1  2]
 [ 1  0  1  0  0  0  0  0  0  0  0  0  0  0  0  1  1  0  0  0  1  1  0  2]
 [ 0  0  0  9  1  0  0  2  0  0  0  1  0  1  0  2  0  0  0  0  0  0  2  2]
 [ 0  0  0  1  2  0  0  0  0  1  1  0  0  1  0  2  1  0  0  1  1  0  1  6]
 [ 0  1  1  0  1  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  1  0  0  0  0  3  0  0  0]
 [ 1  0  0  0  1  0  0 14  0  0  0  0  0  0  0  1  3  0  0  0  3  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  1  0 13  0  0  0  0  1  1  1  0  2  1  2  3  2  0]
 [ 0  0  0  0  0  0  2  0  0  1 17  0  0  2  0  0  0  0  1  0  0  0  0  1]
 [ 0  1  0  0  1  0  0  0  0  2  0 26  0  2  0  0  0  1  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  1  0  1  0  1  4  0  1  1  1  0  2  1  6  1

In [22]:
# Load the resume file that is ranked against the job description below.
# NOTE: this rebinds `df` (previously the Resume1.csv frame); the work is done on
# a deep copy so `df_orig` keeps the data exactly as read from disk.
df_orig = pd.read_csv("./Resume.csv")
df = df_orig.copy(deep = True)

In [23]:
# 1. Input a Job Description
job_description = """
Job Title: IT Systems Administrator

Company: XYZ Corporation

Location: Anytown, USA

Job Type: Full-time

Responsibilities:
- Manage and maintain the organization's IT infrastructure, including servers, networks, and systems.
- Install, configure, and troubleshoot hardware and software components.
- Provide technical support to end-users, addressing IT-related issues and inquiries.
- Implement and monitor cybersecurity measures to ensure data integrity and system security.
- Collaborate with other IT professionals to develop and implement IT strategies and solutions.
- Conduct regular system audits and perform necessary upgrades and patches.
- Stay informed about emerging technologies and trends in the IT industry.

Qualifications:
- Bachelor's degree in Computer Science or related field.
- Proven experience as an IT Systems Administrator or similar role.
- Proficiency in system administration, network protocols, and cybersecurity best practices.
- Strong problem-solving skills and attention to detail.
- Excellent communication and interpersonal skills.

Benefits:
- Competitive salary and benefits package.
- Opportunities for professional development and training.
- Health insurance, retirement plans, and other employee perks.
- Positive and collaborative work environment.

How to Apply:
Interested candidates should submit their resume and cover letter to careers@xyzcorporation.com with the subject line "IT Systems Administrator Application."

"""

# 2. Preprocess the Job Description
# Kept for inspection and for any later cell that reads it. It is deliberately
# NOT what gets passed to the classifier (see step 3).
preprocessed_job_description = preprocess_text(job_description)

# 3. Use the Trained Model for Prediction
# text_clf's TfidfVectorizer already applies preprocess_text to every input.
# Passing already-preprocessed text would stem each token a second time, and
# Porter stemming is not idempotent, so the tokens could stop matching the
# vocabulary learned during training. Pass the raw text instead.
predicted_category = text_clf.predict([job_description])[0]

# 4. Retrieve Relevant Resumes
# Every row starts at 0.0; only rows scored later get a non-zero value.
df['Relevance Score'] = 0.0
relevant_resumes = df[df['Category'] == predicted_category]['Resume_str']

# Separate TF-IDF space used only for relevance scoring. It is fitted on the
# job description alone, so its vocabulary is exactly the job description's terms.
# NOTE(review): this vectorizer has no stop-word removal or stemming, so common
# words shared with a resume contribute to the similarity — confirm intended.
vectorizer = TfidfVectorizer()
job_desc_vector = vectorizer.fit_transform([job_description])

In [24]:
# Display the category the classifier assigned to the job description.
predicted_category

'INFORMATION-TECHNOLOGY'

In [25]:
# (rows, columns) of the frame being ranked, including the added 'Relevance Score' column.
df.shape

(3446, 170)

In [26]:
# 5. Score the resumes of the predicted category against the job description
# Select the rows straight from `df` so this cell gives the same result on a
# re-run, whatever `relevant_resumes` is currently bound to.
category_mask = df['Category'] == predicted_category
category_resumes = df.loc[category_mask, 'Resume_str']

# The vectorizer rejects an empty batch, so skip scoring when no row matches.
if not category_resumes.empty:
    # One batched transform and one similarity call replace the per-resume loop,
    # which also rescanned the whole 'Resume_str' column for every resume.
    resume_vectors = vectorizer.transform(category_resumes)
    # Result has shape (1, n_resumes); row 0 holds one score per selected resume.
    similarity_scores = cosine_similarity(job_desc_vector, resume_vectors)[0]
    # Scores are in the same order as the masked rows. Unlike matching on resume
    # text, this only touches rows of the predicted category.
    df.loc[category_mask, 'Relevance Score'] = similarity_scores

In [31]:
# Show resume text next to its score, highest score first.
# Rows that were never scored keep the initial 0.0 and sort to the bottom.
print(df[['Resume_str', 'Relevance Score']].sort_values(by='Relevance Score', ascending=False))

                                             Resume_str  Relevance Score
283            INFORMATION TECHNOLOGY CONSULTANT    ...         0.871372
236            INFORMATION TECHNOLOGY SPECIALIST/SYS...         0.867652
238            DIRECTOR OF INFORMATION TECHNOLOGY   ...         0.866165
253            DIRECTOR OF INFORMATION TECHNOLOGY   ...         0.864151
337            IT MANAGER             Highlights    ...         0.861474
...                                                 ...              ...
1231           CONSULTANT         Summary     Over 2...         0.000000
1232           IT CONSULTANT           Professional ...         0.000000
1233           CONSULTANT           Executive Profil...         0.000000
1234           CONSULTANT       Executive Profile   ...         0.000000
3445           STOREKEEPER II       Professional Sum...         0.000000

[3446 rows x 2 columns]


In [28]:
# Rows of the predicted category, best match first. The copy keeps the result
# independent of `df`.
relevant_resumes = (
    df[df['Category'] == predicted_category]
    .copy()
    .sort_values(by='Relevance Score', ascending=False)
)

print(f"Relevant Resumes for the job description in the category '{predicted_category}':")
# Walk the score and text columns in parallel; each resume is printed in full.
for score, resume_text in zip(relevant_resumes['Relevance Score'], relevant_resumes['Resume_str']):
    print(f"Relevance Score: {score:.2f}")
    print(f"Resume:\t{resume_text}\n{'='*50}\n")

Relevant Resumes for the job description in the category 'INFORMATION-TECHNOLOGY':
Relevance Score: 0.87
Resume:	         INFORMATION TECHNOLOGY CONSULTANT           Career Overview     Accomplished information technology professional with over 18 years of diverse technology, process analysis, project management, and information management experience.  Proven ability to successfully implement technology solutions, stay within time and budget constraints, and improve efficiency through proper risk management, task coordination, and resource utilization. Core Competencies Project Management Systems / Network Reporting Proposal Development Web Design & Development SOP & Policy Writing Systems & Process Consulting Document Management Technically-advanced information technology specialist successful in software administration and data communications.Experienced Computer Systems Analyst with diverse industry experience in banking, healthcare, insurance and government. Professional expertise 