In [2]:
import pandas as pd

clean_df = pd.read_csv('resume_pr/clean_resume_data.csv')
clean_df.shape

(2484, 3)

In [3]:
clean_df.head()

Unnamed: 0,ID,Category,Feature
0,16852973,HR,hr administrator marketing associate hr admini...
1,22323967,HR,hr specialist hr operations summary media prof...
2,33176873,HR,hr director summary years experience recruitin...
3,27018550,HR,hr specialist summary dedicated driven dynamic...
4,17812897,HR,hr manager skill highlights hr skills hr depar...


In [4]:
clean_df['Category'].value_counts()

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
ADVOCATE                  118
CHEF                      118
ENGINEERING               118
ACCOUNTANT                118
FINANCE                   118
FITNESS                   117
AVIATION                  117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

# Balance Dataset

In [5]:
from sklearn.utils import resample

max_count = clean_df['Category'].value_counts().max()

balanced_data = []

for category in clean_df['Category'].unique():
    category_data = clean_df[clean_df['Category'] == category]

    if len(category_data) < max_count:
        balanced_category_data = resample(category_data , replace=True , n_samples=max_count , random_state=42)
    else:
        balanced_category_data = resample(category_data , replace=False , n_samples=max_count , random_state=42)
    
    balanced_data.append(balanced_category_data)


balanced_df = pd.concat(balanced_data)
balanced_df.shape




(2880, 3)

In [6]:
balanced_df['Category'].value_counts()

Category
HR                        120
DESIGNER                  120
INFORMATION-TECHNOLOGY    120
TEACHER                   120
ADVOCATE                  120
BUSINESS-DEVELOPMENT      120
HEALTHCARE                120
FITNESS                   120
AGRICULTURE               120
BPO                       120
SALES                     120
CONSULTANT                120
DIGITAL-MEDIA             120
AUTOMOBILE                120
CHEF                      120
FINANCE                   120
APPAREL                   120
ENGINEERING               120
ACCOUNTANT                120
CONSTRUCTION              120
PUBLIC-RELATIONS          120
BANKING                   120
ARTS                      120
AVIATION                  120
Name: count, dtype: int64

In [7]:
balanced_df.isnull().sum()

ID          0
Category    0
Feature     1
dtype: int64

In [8]:
balanced_df[balanced_df['Feature'].isnull()]

Unnamed: 0,ID,Category,Feature
656,12632728,BUSINESS-DEVELOPMENT,


In [9]:
balanced_df.dropna(inplace=True)

# Train test split

In [10]:
from sklearn.model_selection import train_test_split
X = balanced_df['Feature']
y = balanced_df['Category']
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

In [11]:
X_train.count()

np.int64(2303)

# Encoding Text

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_train__tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test__tfidf = tfidf_vectorizer.transform(X_test)

X_test__tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 179711 stored elements and shape (576, 29780)>

# Train Random Forest Classifier

In [13]:
X_test__tfidf.shape

(576, 29780)

In [14]:
from sklearn.metrics import classification_report , accuracy_score , confusion_matrix
from sklearn.ensemble._forest import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train__tfidf , y_train)

y_pred = rf_classifier.predict(X_test__tfidf)
accuracy = accuracy_score(y_test , y_pred)
print("Accuracy:" , accuracy)

Accuracy: 0.8541666666666666


In [15]:
print(classification_report(y_test , y_pred))

                        precision    recall  f1-score   support

            ACCOUNTANT       0.80      0.95      0.87        21
              ADVOCATE       0.96      0.76      0.85        29
           AGRICULTURE       1.00      0.78      0.88        23
               APPAREL       0.82      0.86      0.84        21
                  ARTS       0.93      0.59      0.72        22
            AUTOMOBILE       1.00      1.00      1.00        19
              AVIATION       0.94      1.00      0.97        30
               BANKING       0.94      0.74      0.83        23
                   BPO       1.00      1.00      1.00        15
  BUSINESS-DEVELOPMENT       0.50      0.56      0.53        18
                  CHEF       0.96      0.96      0.96        28
          CONSTRUCTION       0.89      0.96      0.92        25
            CONSULTANT       1.00      0.65      0.78        31
              DESIGNER       0.93      0.96      0.95        28
         DIGITAL-MEDIA       0.80      

In [16]:
confusion_matrix = confusion_matrix(y_test , y_pred)
# print(confusion_matrix)

In [17]:
import re
def cleanResume(txt):
    cleanText = re.sub('http\S+\s', ' ', txt)
    cleanText = re.sub('RT|cc', ' ', cleanText)
    cleanText = re.sub('#\S+\s', ' ', cleanText)
    cleanText = re.sub('@\S+', '  ', cleanText)  
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)
    return cleanText

  cleanText = re.sub('http\S+\s', ' ', txt)
  cleanText = re.sub('#\S+\s', ' ', cleanText)
  cleanText = re.sub('@\S+', '  ', cleanText)
  cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
  cleanText = re.sub('\s+', ' ', cleanText)


In [18]:
def predict_category(resume_text):
    # resume_text = cleanResume(resume_text)
    resume_tfidf = tfidf_vectorizer.transform([resume_text])
    predicted_category = rf_classifier.predict(resume_tfidf)[0]
    return predicted_category

In [19]:
text = "Highly motivated and results-oriented Human Resources professional with [X] years of experience in employee relations, talent acquisition, HR policies, and organizational development. Seeking to leverage my expertise in [specific skills or focus areas] to contribute to [Company Name]'s growth and foster a positive and productive work environment."
print(predict_category(text))

HR


In [20]:
resume_file = """Objective:
Dedicated and results-oriented Banking professional with a strong background in financial analysis and customer service seeking opportunities to contribute to a reputable financial institution. Eager to leverage expertise in risk management, investment strategies, and relationship building to drive business growth and client satisfaction.

Education:
- Bachelor of Business Administration in Finance, XYZ University, GPA: 3.8/4.0
- Certified Financial Analyst (CFA) Level I Candidate

Skills:
- Proficient in financial modeling and analysis using Excel, Bloomberg Terminal, and other financial software
- Extensive knowledge of banking products and services, including loans, mortgages, and investment products
- Strong understanding of regulatory compliance and risk management practices in the banking industry
- Excellent communication and interpersonal skills, with a focus on building rapport with clients and colleagues
- Ability to work efficiently under pressure and adapt to changing market conditions

Experience:
Financial Analyst | ABC Bank
- Conducted financial analysis and risk assessment for corporate clients, including credit analysis, financial statement analysis, and cash flow modeling
- Developed customized financial solutions to meet clients' needs and objectives, resulting in increased revenue and client retention
- Collaborated with cross-functional teams to identify new business opportunities and optimize existing processes

Customer Service Representative | DEF Bank
- Provided exceptional customer service to bank clients, addressing inquiries, resolving issues, and promoting banking products and services
- Processed transactions accurately and efficiently, including deposits, withdrawals, and account transfers
- Educated customers on various banking products and services, helping them make informed financial decisions

Internship | GHI Investments
- Assisted portfolio managers with investment research and analysis, including industry and company-specific research, financial modeling, and performance analysis
- Prepared investment presentations and reports for clients, highlighting investment opportunities and performance metrics
- Conducted market research and analysis to identify trends and opportunities in the financial markets

Certifications:
- Certified Financial Planner (CFP)
- Series 7 and Series 63 Securities Licenses

Languages:
- English (Native)
- Spanish (Proficient)

"""
predicted_category = predict_category(resume_file)
print("Predicted Category:", predicted_category)

Predicted Category: BANKING


In [21]:
resume_file = """
                Kshitij Agarwal
kshitijagarwal2808@gmail.com
github.com/kshitijagar
linkedin.com/in/kshitij-agar28
Roll No.: PES1UG21CS292
B.Tech (CSE) — PES University, Bangalore
Education
Degree/Certificate
B.Tech (Computer Science)
Senior Secondary
Secondary
Institute/Board
PES University
CBSE Board
CBSE Board
CGPA/Percentage
8.86 (Current)
94%
95.2%
Year
2021-Present
2021
2019
Skills
Programming: Python, C, Java, JavaScript, React.js, Node.js, React Native, Solidity Tools: Docker, GitHub,
Hadoop, Kaggle, Visual Studio Code Databases: MySQL, MongoDB, Firebase Operating Systems: Windows,
Linux
Key Courses
Mathematics: Linear Algebra, Statistics in Data Science, Data Analytics
Product Management: Software Engineering
Computer Science: Data Structures and Algorithms, Operating Systems, Computer Networking, Automata and Formal
Languages, Big Data, Information Security, Compiler Design, Blockchain
Experience
Summer Student - Ciena Corp
Jun 2024 - Aug 2024
• Collaborated with senior engineers to identify, troubleshoot, and resolve bugs in automation scripts for Packet Network
QA.
• Gained insights into networking protocols and operational mechanics of switches and routers.
• Contributed to automating logical port stats test cases, reducing test execution time by 93% (from 70 minutes to 5
minutes).
Projects
Blockchain Voting dApp
Mar 2024 - Apr 2024
• Developed a decentralized voting application using Solidity smart contracts and React.js frontend.
• Implemented secure voting mechanisms and blockchain integration for transparent and tamper-proof elections.
• Tech Stack: Solidity, React.js, Ethereum, Web3.js
MediTrack
Feb 2024 GitHub Link
• Developed a mobile app using Figma, React Native, and Firebase to track local disease spread.
• Integrated multiple APIs to provide real-time disease data.
• Tech Stack: Figma, React Native, Firebase
YKraft - Yet Another Kraft
Oct 2023 - Nov 2023 GitHub Link
• Developed an HTTP implementation of Kraft with essential functionalities.
• Enhanced system with metadata and log storage for efficient recovery from server failures.
• Tech Stack: Python, HTTP protocols
Sports Department Management System
Sept 2023 - Nov 2023 GitHub Link
• Backend: Developed a Sports Department Management System for the university using MySQL and Django providing
an intuitive interface.
• Integrated frontend using HTML and CSS to develop a Sports Department Management System.
• Tech Stack: MySQL, Django, HTML, CSS
Student Elective Manager
May 2023 - July 2023 Project Link
• Played a key role in making the app framework and front-end for improved user-friendliness.
• Contributed to the development of the project’s Firebase database and hosting the website on the web.
• Tech Stack: React.js, Firebase
SecureCipher: Data Encryption and Decryption Tool
Jan 2022 - Feb 2022 GitHub Link
• The project leverages a combination of Python libraries, advanced cryptographic techniques, and user interface (UI)
tools to create an intuitive and user-friendly Python application.
• Tech Stack: Python, Cryptography libraries, UI tools
Positions of Responsibility
Head of Events and Operations - IEEE CS Club, PES University
• Organized a guest lecture attended by over 400 participants.
• Led organization of a 24-hour hackathon with participants from vario
                """

predicted_category = predict_category(resume_file)
print("Predicted Category:", predicted_category)

Predicted Category: INFORMATION-TECHNOLOGY


In [22]:
import pickle
pickle.dump(rf_classifier,open('models/rf_classifier_categorization.pkl','wb'))
pickle.dump(tfidf_vectorizer,open('models/tfidf_vectorizer_categorization.pkl','wb'))