### Import Pandas to load the dataset and check the data

In [5]:
import pandas as pd

# Load your dataset (replace 'your_dataset.csv' with the actual path)
df = pd.read_csv("/content/Resume.csv", on_bad_lines='skip', engine='python')



In [3]:
df.head()


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


Define the variables X and Y to define the input and the output


In [6]:
X = df['Resume_str']  # The resume text column
y = df['Category']  # The job role column

## By importing train and test split from sklearn lib we will split the whole dataset into training and testing dataset


In [7]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer #to convert the data into numerical data

tfidf = TfidfVectorizer(stop_words='english', max_features=5000) #removal of stopwords

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train) # sparse matrix is made

# Transform the test data (using the same TF-IDF model)
X_test_tfidf = tfidf.transform(X_test) #each row is resume and the columns are the text


### Logistic Regression model to calculate the accuracy and comparitively pick the efficient model to use

In [10]:
from sklearn.linear_model import LogisticRegression #using the logistic model for textual classification

# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train_tfidf, y_train) #fitt the model


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the job roles for the test data
y_pred = model.predict(X_test_tfidf)

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Print classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.6789473684210526
                        precision    recall  f1-score   support

            ACCOUNTANT       0.83      0.91      0.87        22
              ADVOCATE       0.52      0.64      0.57        25
           AGRICULTURE       1.00      0.40      0.57        15
               APPAREL       0.92      0.50      0.65        24
            AUTOMOBILE       0.00      0.00      0.00        10
                   BPO       0.00      0.00      0.00         6
  BUSINESS-DEVELOPMENT       0.59      0.68      0.63        25
                  CHEF       0.90      0.83      0.86        23
            CONSULTANT       0.53      0.32      0.40        25
              DESIGNER       0.82      0.70      0.76        20
         DIGITAL-MEDIA       0.72      0.81      0.76        16
           ENGINEERING       0.63      0.77      0.69        22
               FINANCE       0.82      0.88      0.85        26
               FITNESS       0.70      0.58      0.64        12
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
new_resume = ["trainer."] #law, nursing, java, python, trainer

# Transform the new resume using the trained TF-IDF vectorizer
new_resume_tfidf = tfidf.transform(new_resume)

# Make a predictionthan
predicted_role = model.predict(new_resume_tfidf)

# Output the predicted job role
print(f"Predicted role: {predicted_role[0]}")


Predicted role: FITNESS


### Using naive bayes to compare the accuracy


In [98]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Train the model with the transformed training data
nb_model.fit(X_train_tfidf, y_train)

# Predict the job roles for the test data
y_pred_nb = nb_model.predict(X_test_tfidf)

# Print accuracy
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb)}")

# Print classification report
print(classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.5653923541247485
                        precision    recall  f1-score   support

            ACCOUNTANT       0.79      0.90      0.84        29
              ADVOCATE       0.57      0.40      0.47        30
           AGRICULTURE       1.00      0.12      0.22         8
               APPAREL       0.86      0.30      0.44        20
                  ARTS       0.00      0.00      0.00        18
            AUTOMOBILE       0.00      0.00      0.00         6
              AVIATION       0.67      0.76      0.71        21
               BANKING       0.68      0.57      0.62        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.50      0.67      0.57        27
                  CHEF       0.81      0.71      0.76        24
          CONSTRUCTION       0.88      0.65      0.75        34
            CONSULTANT       1.00      0.05      0.10        20
              DESIGNER       0.78      0.74      0.76        1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## using random forest classification to know the accuracy


In [99]:
# using random forest classification to know the accuracy
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}")


Random Forest Accuracy: 0.6780684104627767


In [101]:
# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 67.81%
                        precision    recall  f1-score   support

            ACCOUNTANT       0.73      0.93      0.82        29
              ADVOCATE       0.86      0.80      0.83        30
           AGRICULTURE       0.75      0.38      0.50         8
               APPAREL       0.65      0.55      0.59        20
                  ARTS       0.40      0.22      0.29        18
            AUTOMOBILE       0.00      0.00      0.00         6
              AVIATION       0.79      0.90      0.84        21
               BANKING       0.72      0.57      0.63        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.76      0.48      0.59        27
                  CHEF       0.86      0.75      0.80        24
          CONSTRUCTION       0.90      0.82      0.86        34
            CONSULTANT       0.67      0.40      0.50        20
              DESIGNER       0.73      1.00      0.84        19
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [102]:
import joblib

# Save the Random Forest model
joblib.dump(rf_model, 'random_forest_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [107]:
# Load the saved model and vectorizer
rf_model = joblib.load('/content/random_forest_model.pkl')
tfidf = joblib.load('/content/tfidf_vectorizer.pkl')

# Example resume text for prediction
new_resume = ["Nursing."]

# Preprocess the resume text using the saved TF-IDF vectorizer
new_resume_tfidf = tfidf.transform(new_resume)

# Predict the job role
predicted_role = rf_model.predict(new_resume_tfidf)
print(f"Predicted Job Role: {predicted_role}")


Predicted Job Role: ['BUSINESS-DEVELOPMENT']


In [108]:
import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))  # Check class distribution


{'ACCOUNTANT': 89, 'ADVOCATE': 88, 'AGRICULTURE': 55, 'APPAREL': 77, 'ARTS': 85, 'AUTOMOBILE': 30, 'AVIATION': 96, 'BANKING': 92, 'BPO': 20, 'BUSINESS-DEVELOPMENT': 93, 'CHEF': 94, 'CONSTRUCTION': 78, 'CONSULTANT': 95, 'DESIGNER': 88, 'DIGITAL-MEDIA': 71, 'ENGINEERING': 97, 'FINANCE': 99, 'FITNESS': 98, 'HEALTHCARE': 95, 'HR': 92, 'INFORMATION-TECHNOLOGY': 94, 'PUBLIC-RELATIONS': 94, 'SALES': 87, 'TEACHER': 80}


In [109]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')


In [111]:
# Train the Random Forest model on the TF-IDF transformed training data
rf_model.fit(X_train_tfidf, y_train)


In [112]:
# Transform new resume using the same TF-IDF vectorizer
new_resume = [" machine learning and Python."]
new_resume_tfidf = tfidf.transform(new_resume)


In [8]:
# Example of transforming new data using the same TF-IDF vectorizer
new_resume = [" Python."]
new_resume_tfidf = tfidf.transform(new_resume)

# Make prediction using the trained model
predicted_role = rf_model.predict(new_resume_tfidf)
print(f"Predicted Job Role: {predicted_role}")


NameError: name 'tfidf' is not defined

In [100]:
#using SVM to find out the optimal sol
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')  # Using linear kernel for text data
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm)}")


SVM Accuracy: 0.6317907444668008
