<a href="https://colab.research.google.com/github/prithikah23/java-projects/blob/master/Job_description_parsing_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ngrams
from collections import Counter

# NLTK resources used below: lookup path for nltk.data.find() -> package id
# for nltk.download().
# NOTE(review): newer NLTK releases may look for differently named packages
# (e.g. 'punkt_tab'); verify these ids against the installed NLTK version.
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
    'corpora/stopwords': 'stopwords',
}

# Penn Treebank noun tags: NN/NNS are common nouns, NNP/NNPS are proper nouns.
_NOUN_TAGS = frozenset({'NN', 'NNS', 'NNP', 'NNPS'})


def _ensure_nltk_resources():
    """Download each required NLTK resource only if it is not installed yet."""
    for resource_path, package_id in _NLTK_RESOURCES.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_id)


def _has_alnum(token):
    """Return True if the token contains at least one letter or digit."""
    return any(ch.isalnum() for ch in token)


def extract_skill_keywords(job_description: str) -> Counter:
    """Count candidate skill keywords in a job description.

    The text is lowercased, tokenized and stripped of English stop words.
    Three kinds of candidates are then counted together:

    * single words that the POS tagger labels as nouns,
    * bigrams of adjacent remaining words,
    * trigrams of adjacent remaining words.

    Tokens made up solely of punctuation (``:``, ``-``, ``(`` ...) are left
    out of the n-grams, so entries such as ``'skills :'`` are not reported.
    N-grams are built after the filtering, so they may join words that were
    separated by a stop word or punctuation mark in the original text.

    Args:
        job_description: Raw job description text.

    Returns:
        A ``Counter`` mapping each keyword or n-gram (words joined by a single
        space) to the number of times it occurs.
    """
    # Checked on every call, but only missing resources trigger a download.
    _ensure_nltk_resources()

    # Lowercase first so that counting is case-insensitive.
    tokens = word_tokenize(job_description.lower())

    stop_words = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop_words]

    # Tag the stop-word-free sequence (punctuation still included) and keep
    # only noun-tagged tokens as single-word keywords.
    single_words = [
        word for word, tag in pos_tag(words)
        if tag in _NOUN_TAGS and _has_alnum(word)
    ]

    # Build multi-word candidates from real words only; punctuation tokens
    # would otherwise produce meaningless n-grams like ': -' or 'bs ('.
    phrase_words = [word for word in words if _has_alnum(word)]
    bigrams = [' '.join(bigram) for bigram in ngrams(phrase_words, 2)]
    trigrams = [' '.join(trigram) for trigram in ngrams(phrase_words, 3)]

    # Optionally, the result can be fed to a word cloud or plotted with
    # matplotlib to visualize the most common skills.
    return Counter(single_words + bigrams + trigrams)

# Example usage: run the extractor on a sample job posting and print the result.
# The sample text is passed in verbatim, so its bullet markers and punctuation
# are part of what the tokenizer sees.
job_description_text = """

Required skills:
- BS (or higher, e.g., MS, or PhD) in Computer Science or related engineering field involving coding
- Experienced implementing and scaling machine learning models in production environments (including recommendation engines and/or computer vision systems)
- Strong understanding of machine learning theory
- Hands on experience with Statistics
- Capable of quickly implementing prototypes of cutting-edge research papers
- Proficient in Python (i.e. Pandas, Numpy, scikit-learn, etc), R, TensorFlow, amongst other data science related tools and libraries
- Analytical mind and strong business acumen

If you're passionate about data science and is hungry to learn, please apply!"""
# The result is a collections.Counter mapping each keyword / n-gram to its count.
skill_keywords = extract_skill_keywords(job_description_text)
# A Counter's printed form lists entries from most to least frequent.
print(skill_keywords)


Counter({'science': 3, 'computer': 2, 'machine': 2, 'data': 2, 'science related': 2, 'machine learning': 2, 'data science': 2, 'skills': 1, 'bs': 1, 'ms': 1, 'phd': 1, 'engineering': 1, 'field': 1, 'learning': 1, 'models': 1, 'production': 1, 'environments': 1, 'recommendation': 1, 'engines': 1, 'vision': 1, 'systems': 1, 'hands': 1, 'statistics': 1, 'prototypes': 1, 'research': 1, 'papers': 1, 'proficient': 1, 'python': 1, 'pandas': 1, 'r': 1, 'tensorflow': 1, 'amongst': 1, 'tools': 1, 'mind': 1, 'business': 1, 'acumen': 1, 'hungry': 1, 'learn': 1, 'required skills': 1, 'skills :': 1, ': -': 1, '- bs': 1, 'bs (': 1, '( higher': 1, 'higher ,': 1, ', e.g.': 1, 'e.g. ,': 1, ', ms': 1, 'ms ,': 1, ', phd': 1, 'phd )': 1, ') computer': 1, 'computer science': 1, 'related engineering': 1, 'engineering field': 1, 'field involving': 1, 'involving coding': 1, 'coding -': 1, '- experienced': 1, 'experienced implementing': 1, 'implementing scaling': 1, 'scaling machine': 1, 'learning models': 1, '

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
