<a href="https://colab.research.google.com/github/sheemapatel/nlp--/blob/main/8_8_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import spacy
nltk.download('punkt_tab')

# Simulated Resume Data
# Three hand-written resumes stand in for a real corpus. They deliberately
# contain noise (an embedded newline, bullet glyphs, digits, '$', '+') so the
# later cleaning steps have something to remove.
data = {
    'resume_text': [
        # Data-science resume: contains "\n" and bullet characters.
        "Data Scientist. Core Skills: Python, Machine Learning, SQL, PyTorch. 5 years exp.\nProjects: Built a • predictive model using • scikit-learn and TensorFlow.",
        # Software-engineering resume: contains "C++" and "8+".
        "Senior Software Engineer. Languages: Java, C++, Python. Worked on REST APIs. 8+ yrs experience. Certifications: AWS, Kubernetes.",
        # Marketing resume: contains digits, parentheses and a currency symbol.
        "Marketing Specialist. Experienced in Digital Marketing, SEO, Google Analytics 4 (GA4). Handled $10k+ budgets. Strong communication skills.",
    ],
}
df = pd.DataFrame(data)

# Q1. Load sample resumes and display first 3 rows. Check for noisy characters.
# Section header and question title, emitted one per line.
print(
    "## Section A: Load & Explore",
    "Q1. First 3 rows of simulated resume data:\n",
    sep="\n",
)

# Render the first three rows as a markdown table (index column omitted).
print(df.head(3).to_markdown(index=False))

# Separator followed by the written observation about the noise.
print(
    "\n---",
    "Initial inspection shows noisy characters like '\\n' (newline), '•' (bullet point), and '$', which need cleaning.",
    sep="\n",
)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...


## Section A: Load & Explore
Q1. First 3 rows of simulated resume data:

| resume_text                                                                                                                                |
|:-------------------------------------------------------------------------------------------------------------------------------------------|
| Data Scientist. Core Skills: Python, Machine Learning, SQL, PyTorch. 5 years exp.                                                          |
| Projects: Built a • predictive model using • scikit-learn and TensorFlow.                                                                  |
| Senior Software Engineer. Languages: Java, C++, Python. Worked on REST APIs. 8+ yrs experience. Certifications: AWS, Kubernetes.           |
| Marketing Specialist. Experienced in Digital Marketing, SEO, Google Analytics 4 (GA4). Handled $10k+ budgets. Strong communication skills. |

---
Initial inspection shows noisy characters like '\n' (newline), '•' (bullet point), and '$', which need cleaning.

[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Initialize NLTK tools
# Fetch the tokenizer data and the stop-word corpus without progress output.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource, quiet=True)

# English stop words as a set, so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# One shared Porter stemmer instance, reused for every token.
stemmer = PorterStemmer()

def nltk_pipeline(text):
    """Clean, tokenize, stop-word-filter and stem a single resume string.

    Steps:
      1. Replace every run of characters that are not ASCII letters or
         whitespace with a single space, then lowercase the result.
      2. Tokenize with ``nltk.word_tokenize``.
      3. Drop tokens present in the module-level ``stop_words`` set and stem
         the remaining ones with the module-level ``stemmer``.

    Args:
        text: Raw resume text. Any non-string value (for example ``None`` or a
            NaN from a missing DataFrame cell) is treated as empty input.

    Returns:
        list[str]: Stemmed tokens, in the order they appear in ``text``.
    """
    # A missing cell would otherwise make re.sub raise TypeError.
    if not isinstance(text, str):
        return []

    # Substitute a space instead of deleting the character: deleting fuses
    # words that are joined only by punctuation ("scikit-learn" would become
    # "scikitlearn", "Python,SQL" would become "pythonsql").
    cleaned = re.sub(r'[^a-zA-Z\s]+', ' ', text).lower()

    # Tokenize, then filter stop words and stem in a single pass.
    return [
        stemmer.stem(word)
        for word in nltk.word_tokenize(cleaned)
        if word not in stop_words
    ]

# Run the NLTK pipeline on every resume; each cell becomes a list of stems.
df['nltk_tokens'] = df['resume_text'].apply(nltk_pipeline)

# Combine all tokens for frequency analysis
all_nltk_tokens = []
for resume_tokens in df['nltk_tokens']:
    all_nltk_tokens.extend(resume_tokens)

# Q2. Extract top 10 frequent stemmed words
token_counts = Counter(all_nltk_tokens)
top_10_nltk = token_counts.most_common(10)

# Section header and question title, emitted one per line.
print(
    "\n## Section B: NLTK Preprocessing",
    "Q2. Top 10 Frequent Stemmed Words:\n",
    sep="\n",
)

# Show the (stem, count) pairs as a two-column markdown table.
top_stems_table = pd.DataFrame(top_10_nltk, columns=['Stemmed Word', 'Frequency'])
print(top_stems_table.to_markdown(index=False))


## Section B: NLTK Preprocessing
Q2. Top 10 Frequent Stemmed Words:

| Stemmed Word   |   Frequency |
|:---------------|------------:|
| skill          |           2 |
| python         |           2 |
| market         |           2 |
| data           |           1 |
| scientist      |           1 |
| core           |           1 |
| machin         |           1 |
| learn          |           1 |
| sql            |           1 |
| pytorch        |           1 |


In [None]:
# Load spaCy model
# Try to load the small English pipeline; if loading raises OSError, download
# the model once and load it again.
model_name = "en_core_web_sm"
try:
    nlp = spacy.load(model_name)
except OSError:
    print("Downloading spaCy model 'en_core_web_sm'...")
    spacy.cli.download(model_name)
    nlp = spacy.load(model_name)

def spacy_pipeline(text):
    """Return lowercase lemmas of the content nouns and verbs in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only when all of the following hold:
      * it is alphabetic (``is_alpha``), which drops symbols and digits;
      * it is not a stop word (``is_stop`` is false);
      * its coarse part-of-speech tag is ``NOUN`` or ``VERB``.

    Args:
        text: Raw resume text.

    Returns:
        list[str]: Lowercased lemmas of the kept tokens, in document order.
    """
    wanted_pos = ("NOUN", "VERB")
    return [
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.is_alpha and not tok.is_stop and tok.pos_ in wanted_pos
    ]

# Run the spaCy pipeline on every resume; each cell becomes a list of lemmas.
df['spacy_lemmas'] = df['resume_text'].apply(spacy_pipeline)

# Combine all lemmas for frequency analysis
all_spacy_lemmas = []
for resume_lemmas in df['spacy_lemmas']:
    all_spacy_lemmas.extend(resume_lemmas)

# Q3. Extract top 10 frequent lemmas
lemma_counts = Counter(all_spacy_lemmas)
top_10_spacy = lemma_counts.most_common(10)

# Section header and question title, emitted one per line.
print(
    "\n## Section C: spaCy Pipeline",
    "Q3. Top 10 Frequent Lemmas (Nouns/Verbs only):\n",
    sep="\n",
)

# Show the (lemma, count) pairs as a two-column markdown table.
top_lemmas_table = pd.DataFrame(top_10_spacy, columns=['Lemma (Noun/Verb)', 'Frequency'])
print(top_lemmas_table.to_markdown(index=False))


## Section C: spaCy Pipeline
Q3. Top 10 Frequent Lemmas (Nouns/Verbs only):

| Lemma (Noun/Verb)   |   Frequency |
|:--------------------|------------:|
| experience          |           2 |
| year                |           1 |
| project             |           1 |
| build               |           1 |
| model               |           1 |
| scikit              |           1 |
| learn               |           1 |
| language            |           1 |
| work                |           1 |
| rest                |           1 |
