In [1]:
%pip install spacy scikit-learn pandas



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Step 1: Create synthetic dataset with balanced classes
data = {
    'resume_text': [
        # Software Developer
        "Developed full-stack web apps using React and Django.",
        "Built scalable backend services with Node.js and MongoDB.",
        "Experienced in Python, Flask, and REST APIs.",
        "Java developer for enterprise applications.",
        "Deployed microservices using Docker and Kubernetes.",
        "Worked on C++ game development projects.",
        "Created CI/CD pipelines with Jenkins.",
        "Contributed to open-source JavaScript libraries.",
        "Developed Android apps using Kotlin.",
        "Expert in debugging and optimizing code performance.",

        # Data Scientist
        "Analyzed big data using Python and Spark.",
        "Built machine learning models using Scikit-learn.",
        "Expert in data visualization using Matplotlib and Seaborn.",
        "Experience with deep learning and TensorFlow.",
        "Worked on NLP projects using spaCy and NLTK.",
        "Built recommendation systems using collaborative filtering.",
        "Cleaned and processed raw datasets using pandas.",
        "Trained neural networks for image classification.",
        "Built dashboards using Power BI and Tableau.",
        "Used SQL for data analysis and reporting.",

        # Marketing
        "Digital marketing strategist with SEO expertise.",
        "Ran Facebook Ads and Google Ad campaigns.",
        "Created engaging content for social media platforms.",
        "Managed influencer marketing partnerships.",
        "Analyzed marketing KPIs and metrics.",
        "Experienced in email marketing and Mailchimp.",
        "Led product launches and branding efforts.",
        "Created landing pages using WordPress.",
        "Specialized in organic growth and content marketing.",
        "Certified in Google Analytics and Ads."
    ],
    'job_category': [
        "Software Developer"]*10 + ["Data Scientist"]*10 + ["Marketing"]*10
}

df = pd.DataFrame(data)

# Step 2: Preprocess the text
def preprocess_text(text):
    doc = nlp(text)
    tokens = [
        token.lemma_.lower() for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    return " ".join(tokens)

df['clean_text'] = df['resume_text'].apply(preprocess_text)

# Step 3: Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = df['job_category']

# Step 4: Train-test split (with stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 5: Train classifier (you can switch between LogisticRegression or Naive Bayes)
model = LogisticRegression(max_iter=1000)
# model = MultinomialNB()

model.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Step 7: Output
print("Predicted Categories:", y_pred)
print("Actual Categories:", y_test.values)
print("Accuracy Score:", accuracy)


Predicted Categories: ['Software Developer' 'Software Developer' 'Data Scientist'
 'Software Developer' 'Marketing' 'Software Developer']
Actual Categories: ['Software Developer' 'Marketing' 'Software Developer' 'Data Scientist'
 'Marketing' 'Data Scientist']
Accuracy Score: 0.3333333333333333
