# 1. Import libraries

In [1]:
!pip install matplotlib



In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Import dataset

In [31]:
df = pd.read_csv('csv-jobs_dataset_with_features/jobs_dataset_with_features.csv')

In [32]:
df.shape

(1615940, 2)

In [35]:
df['Role'].nunique()

376

In [37]:
df['Role'].unique()

array(['Social Media Manager', 'Frontend Web Developer',
       'Quality Control Manager', 'Wireless Network Engineer',
       'Conference Manager', 'Quality Assurance Analyst',
       'Classroom Teacher', 'User Interface Designer',
       'Interaction Designer', 'Wedding Consultant',
       'Performance Testing Specialist', 'Family Law Attorney',
       'Mechanical Design Engineer', 'Network Security Analyst',
       'Sales Account Manager', 'Product Brand Manager',
       'School Social Worker', 'Content Creator',
       'Deliverability Analyst', 'HR Coordinator', 'Legal Secretary',
       'Family Nurse Practitioner', 'Account Strategist',
       'Backend Developer', 'Supply Chain Coordinator',
       'B2B Sales Consultant', 'Structural Engineer',
       'Security Operations Center (SOC) Analyst', 'Front-End Developer',
       'Tax Planner', 'Event Coordinator', 'Clinical Psychologist',
       'Electrical Engineer', 'Lighting Designer',
       'Business Intelligence Analyst', 'Conten

# 3. Balance Dataset

In [39]:
df['Role'].value_counts()

Role
Interaction Designer            20580
Network Administrator           17470
User Interface Designer         14036
Social Media Manager            13945
User Experience Designer        13935
                                ...  
Inventory Control Specialist     3342
Budget Analyst                   3335
Clinical Nurse Manager           3324
Social Science Researcher        3321
Paid Advertising Specialist      3306
Name: count, Length: 376, dtype: int64

In [43]:
# Dropping classes with less than 6000 instances
min_count = 6000
role_counts = df['Role'].value_counts()
dropped_classes = role_counts[role_counts < min_count].index
filtered_df = df[~df['Role'].isin(dropped_classes)].reset_index(drop=True)

# Checking the updated role counts
filtered_df['Role'].value_counts()

Role
Interaction Designer          20580
Network Administrator         17470
User Interface Designer       14036
Social Media Manager          13945
User Experience Designer      13935
                              ...  
Benefits Coordinator           6839
Research Analyst               6830
Administrative Coordinator     6803
IT Support Specialist          6799
UI/UX Designer                 6743
Name: count, Length: 61, dtype: int64

# 4. Train-Test Split

In [52]:
df = filtered_df.sample(n=100000)

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Splitting the data into features (X) and target (y)
X = df['Features']
y = df['Role']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Encoding (TF-IDF vectorization)

In [56]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#  6. Train Random Forest Classifier


In [58]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_tfidf)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [60]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

                                precision    recall  f1-score   support

             Account Executive       1.00      1.00      1.00       293
    Administrative Coordinator       1.00      1.00      1.00       286
             Automation Tester       1.00      1.00      1.00       253
             Backend Developer       1.00      1.00      1.00       436
          Benefits Coordinator       1.00      1.00      1.00       268
 Business Intelligence Analyst       1.00      1.00      1.00       274
   Client Relationship Manager       1.00      1.00      1.00       258
               Content Creator       1.00      1.00      1.00       253
            Content Strategist       1.00      1.00      1.00       286
      Customer Success Manager       1.00      1.00      1.00       411
   Customer Support Specialist       1.00      1.00      1.00       274
                  Data Analyst       1.00      1.00      1.00       390
         Data Entry Specialist       1.00      1.00      1.00  

# 7. Predictive System


In [72]:
import re  # Import Python's regular expressions module

def cleanResume(txt):
    # Remove URLs starting with http (e.g., http://example.com)
    cleanText = re.sub(r'http\S+\s', ' ', txt)

    # Remove common retweet and copy-paste tags (often seen in social media text)
    cleanText = re.sub(r'RT|cc', ' ', cleanText)

    # Remove hashtags and the word following them (e.g., #Python)
    cleanText = re.sub(r'#\S+\s', ' ', cleanText)

    # Remove mentions like @username
    cleanText = re.sub(r'@\S+', ' ', cleanText)

    # Remove punctuation and special characters using escaped characters
    cleanText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)

    # Remove non-ASCII characters (e.g., emojis, foreign scripts)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)

    # Replace multiple spaces, tabs, and newlines with a single space
    cleanText = re.sub(r'\s+', ' ', cleanText)

    # Return the cleaned text
    return cleanText



# Prediction and Job Field Name
def job_recommendation(resume_text):
    resume_text= cleanResume(resume_text)
    resume_tfidf = tfidf_vectorizer.transform([resume_text])
    predicted_category_1 = rf_classifier.predict(resume_tfidf)[0]
    predicted_category_2 = rf_classifier.predict(resume_tfidf)[1]
    return predicted_category_1, predicted_category_2

In [74]:
# Example Usage
resume_file = """Objective:
A creative and detail-oriented Designer with a passion for visual communication and brand identity seeking opportunities to leverage design skills in a dynamic and collaborative environment.

Education:
- Bachelor of Fine Arts in Graphic Design, XYZ College, GPA: 3.7/4.0
- Diploma in Web Design, ABC Institute, GPA: 3.9/4.0

Skills:
- Proficient in Adobe Creative Suite (Photoshop, Illustrator, InDesign)
- Strong understanding of typography, layout, and color theory
- Experience in both print and digital design
- Ability to conceptualize and execute design projects from concept to completion
- Excellent attention to detail and time management skills

Experience:
Graphic Designer | XYZ Design Studio
- Created visually appealing graphics for various marketing materials, including brochures, flyers, and social media posts
- Collaborated with clients to understand their design needs and deliver creative solutions that align with their brand identity
- Worked closely with the marketing team to ensure consistency in brand messaging across all platforms

Freelance Designer
- Designed logos, branding materials, and website layouts for small businesses and startups
- Managed multiple projects simultaneously while meeting tight deadlines and maintaining quality standards
- Established and maintained strong client relationships through clear communication and exceptional service

Projects:
- Rebranding Campaign for XYZ Company: Led a team to redesign the company's logo, website, and marketing collateral, resulting in a 30% increase in brand recognition
- Packaging Design for ABC Product Launch: Developed eye-catching packaging designs for a new product line, contributing to a successful launch and positive customer feedback

Certifications:
- Adobe Certified Expert (ACE) in Adobe Illustrator
- Responsive Web Design Certification from Udemy

Languages:
- English (Native)
- Spanish (Intermediate)
"""

predicted_job_category = job_recommendation(resume_file)
print("Predicted Job Category:", predicted_job_category)

IndexError: index 1 is out of bounds for axis 0 with size 1

# 8. Save RandomForestClassifier Model


In [66]:
import pickle
pickle.dump(rf_classifier,open('rf_classifier_job_recommendation.pkl','wb'))
pickle.dump(tfidf_vectorizer,open('tfidf_vectorizer_job_recommendation.pkl','wb'))