In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Loading dataset:
df = pd.read_csv("UpdatedResumeDataSet.csv")
print(df.shape)
df.head()

(962, 2)


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [3]:
df.isnull().sum()

Category    0
Resume      0
dtype: int64

In [4]:
df["Category"].value_counts()

Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: count, dtype: int64

In [5]:
df["Category"].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

# Encoding:

In [6]:
labels = {
    'Data Science':0, 'HR':1, 'Advocate':2, 'Arts':3, 'Web Designing':4,
       'Mechanical Engineer':5, 'Sales':6, 'Health and fitness':7,
       'Civil Engineer':8, 'Java Developer':9, 'Business Analyst':10,
       'SAP Developer':11, 'Automation Testing':12, 'Electrical Engineering':13,
       'Operations Manager':14, 'Python Developer':15, 'DevOps Engineer':16,
       'Network Security Engineer':17, 'PMO':18, 'Database':19, 'Hadoop':20,
       'ETL Developer':21, 'DotNet Developer':22, 'Blockchain':23, 'Testing':24
}

In [7]:
df['category_num'] = df.Category.map(labels)

In [8]:
df.sample(5)

Unnamed: 0,Category,Resume,category_num
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,24
147,Web Designing,"Technical Skills Web Technologies: Angular JS,...",4
693,PMO,Skills Exceptional communication and networkin...,18
411,Business Analyst,TECHNOLOGICAL SKILLS â¦ Knowledge of Computer...,10
732,Database,Technical Expertise Operating Systems Microsof...,19


# cleaning Text 

In [15]:
import sklearn

In [17]:
import spacy 
nlp = spacy.load("en_core_web_lg")

In [18]:
def cleanText(text):
    doc = nlp(text)

    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

In [19]:
df["clean_text"] = df["Resume"].apply(cleanText)

In [20]:
df.head()

Unnamed: 0,Category,Resume,category_num,clean_text
0,Data Science,Skills * Programming Languages: Python (pandas...,0,skill programming language Python panda numpy ...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,0,education detail \r\n 2013 2017 B.E UIT RGP...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",0,area Interest Deep Learning Control System Des...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,0,skill â¢ r â¢ Python â¢ SAP hana â¢ tablea...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",0,education Details \r\n MCA YMCAUST Farid...


# Vectorization
 i am using TFIDF Vectorizer here

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [109]:
tfidf.fit(df["clean_text"])

In [110]:
textVector = tfidf.transform(df["clean_text"])

# Splitting train test data

In [111]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(textVector,df.category_num,test_size=0.2,random_state=42)

# Model Building:

In [112]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train,y_train)

In [113]:
# evaluation

In [114]:
y_pred = model.predict(x_test)

In [115]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         6
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         8
           6       1.00      1.00      1.00         8
           7       1.00      1.00      1.00         7
           8       1.00      1.00      1.00         9
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         7
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         6
          14       0.92      1.00      0.96        12
          15       1.00      1.00      1.00        10
          16       1.00      0.93      0.96        14
          17       1.00    

In [107]:
# our model is 99% accurate

# Exporting Model

In [123]:
import pickle

pickle.dump(tfidf,open('tfidf.pkl','wb'))
pickle.dump(model,open('model.pkl','wb'))

# Testing

In [125]:
myresume = '''
    Summary: Highly motivated and experienced data scientist with a strong background in machine learning, data analysis, and visualization. Proficient in programming languages such as Python, R, and SQL, with expertise in data visualization tools like Tableau and PowerBI. Proven track record of delivering high-impact projects that drive business results.

Education:

Bachelor’s/Master’s Degree in Computer Science, Statistics, or related field
Technical Skills:

Programming languages: Python, R, SQL
Data visualization tools: Tableau, PowerBI
Machine learning algorithms: scikit-learn, TensorFlow
Data analysis tools: pandas, NumPy
Operating Systems: Windows, Linux
Cloud platforms: AWS, Google Cloud
Work Experience:

Senior Data Scientist, XYZ Corporation (2020-Present)
Led a team of junior data scientists to develop and deploy machine learning models that improved customer retention by 25%
Designed and implemented data visualization dashboards using Tableau to communicate insights to stakeholders
Collaborated with cross-functional teams to develop data-driven solutions that drove business results
Data Scientist, ABC Startups (2018-2020)
Developed and deployed predictive models using scikit-learn and TensorFlow to improve sales forecasting by 15%
Created data visualizations using PowerBI to communicate insights to stakeholders
Worked with data engineers to design and implement data pipelines using AWS and Google Cloud
Projects:

Customer Retention Analysis: Developed a machine learning model using scikit-learn to predict customer churn and implemented a retention strategy that improved customer retention by 25%
Sales Forecasting: Developed a predictive model using TensorFlow to improve sales forecasting by 15% and implemented a sales strategy that increased revenue by 10%
Data Visualization Dashboard: Designed and implemented a data visualization dashboard using Tableau to communicate insights to stakeholders and improve business decision-making
Achievements:

Improved customer retention by 25% through data-driven insights and machine learning models
Increased sales revenue by 10% through data-driven insights and predictive modeling
Developed and deployed data visualization dashboards using Tableau to communicate insights to stakeholders
Certifications:

Certified Data Scientist (CDS)
Certified Analytics Professional (CAP)
'''

In [128]:
import pickle

#load the trained model:
model = pickle.load(open('model.pkl','rb'))
tfidf = pickle.load(open('tfidf.pkl','rb'))

# clean the input text:
cleaned_resume = cleanText(myresume)

#transform the cleaned text using vectorizer
input_features = tfidf.transform([cleaned_resume])

#classify the resume:
prediction_id = model.predict(input_features)[0]

print(prediction_id)

0


In [129]:
key_list = list(labels.keys())
val_list = list(labels.values())
def category_name(prediction_id):
    category_name = val_list.index(prediction_id)
    print(key_list[category_name])

In [130]:
category_name(prediction_id)

Data Science
