<a href="https://colab.research.google.com/github/poojitha1502/nlp--/blob/main/NLP_Lab_08_08_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Three short sample resumes. The first contains an embedded newline and the
# second a bullet character, so the noise check below has something to find.
data = {'Resume_Text': [
    "I am a software engineer with 5+ years of experience. My skills include Python, Java, and C++. I have worked on projects involving machine learning and web development.\n I am an excellent team player.",
    "Data Scientist with expertise in Python, R, and SQL. I have experience in statistical analysis, data visualization, and building predictive models. • Published a paper on a new machine learning algorithm.",
    "A marketing specialist with a background in digital marketing, social media management, and content creation. Skills: SEO, SEM, Adobe Photoshop. I have a proven track record of increasing brand visibility."
]}
resumes_df = pd.DataFrame(data)

print("First 3 rows of the sample resumes:")
print(resumes_df.head(3))
print("\nChecking for noisy characters...")

# Map a printable label to the result of each noise check.
# The newline label is an escaped backslash so the report shows the two
# characters "\n" instead of emitting a line break. Keeping the backslash out
# of the f-string replacement field also keeps this cell valid on
# Python versions older than 3.12.
noise_checks = {
    '\\n': '\n' in resumes_df['Resume_Text'].iloc[0],
    '•': '•' in resumes_df['Resume_Text'].iloc[1],
}
for label, found in noise_checks.items():
    print(f"{label} present: {found}")

First 3 rows of the sample resumes:
                                         Resume_Text
0  I am a software engineer with 5+ years of expe...
1  Data Scientist with expertise in Python, R, an...
2  A marketing specialist with a background in di...

Checking for noisy characters...

 present: True
• present: True


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources this notebook relies on (tokenizer data and the
# stop-word corpus), in the same order as before.
for resource_name in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource_name)


def preprocess_nltk(text):
    """Clean, tokenize, stop-word-filter and stem a single resume text.

    Steps:
      1. Remove every character that is not an ASCII letter or whitespace.
      2. Lower-case the text and tokenize it with ``word_tokenize``.
      3. Drop English stop words and single-character tokens.
      4. Stem the remaining tokens with ``PorterStemmer``.

    Args:
        text: The raw resume text. Non-string values (for example ``None`` or
            a NaN from a missing DataFrame cell) are treated as empty input.

    Returns:
        list[str]: The stemmed tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so passing ``re.I | re.A`` positionally silently
    # capped the number of replacements instead of setting any flags.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and len(token) > 1
    ]

# Run the NLTK pipeline on every resume and keep the result as a new column.
resumes_df['nltk_tokens'] = resumes_df['Resume_Text'].apply(preprocess_nltk)

# Flatten the per-resume token lists into a single corpus-wide list.
all_nltk_tokens = []
for resume_tokens in resumes_df['nltk_tokens']:
    all_nltk_tokens.extend(resume_tokens)

# Count the stems across all resumes and keep the ten most frequent.
fdist_nltk = nltk.FreqDist(all_nltk_tokens)
top_10_nltk = fdist_nltk.most_common(10)

print(
    "\n--- NLTK Preprocessing Results ---",
    "Processed Tokens for first resume:",
    resumes_df['nltk_tokens'].iloc[0],
    sep="\n",
)
print("\nTop 10 frequent stemmed words:", top_10_nltk, sep="\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



--- NLTK Preprocessing Results ---
Processed Tokens for first resume:
['softwar', 'engin', 'year', 'experi', 'skill', 'includ', 'python', 'java', 'work', 'project', 'involv', 'machin', 'learn', 'web', 'develop', 'excel', 'team', 'player']

Top 10 frequent stemmed words:
[('experi', 2), ('skill', 2), ('python', 2), ('machin', 2), ('learn', 2), ('data', 2), ('market', 2), ('softwar', 1), ('engin', 1), ('year', 1)]


In [None]:
import spacy

# Name of the spaCy pipeline package used throughout this cell.
_SPACY_MODEL = "en_core_web_sm"

# Try to load the pipeline; if loading raises OSError, download the package
# once and then load it again.
try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    print("Downloading spaCy model 'en_core_web_sm'...")
    from spacy.cli import download
    download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)

def preprocess_spacy(text, pos_tags=('NOUN', 'VERB')):
    """Return the lower-cased lemmas of selected parts of speech in ``text``.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    only if it is purely alphabetic (``token.is_alpha``) and its coarse
    part-of-speech tag (``token.pos_``) is one of ``pos_tags``.

    Args:
        text: The raw resume text. Non-string values (for example ``None`` or
            a NaN from a missing DataFrame cell) are treated as empty input.
        pos_tags: Iterable of part-of-speech tag strings to keep. The default
            keeps nouns and verbs, matching the previous hard-coded behaviour.

    Returns:
        list[str]: The lower-cased lemmas, in document order.
    """
    if not isinstance(text, str):
        return []

    # Build the lookup set once per call rather than scanning a list per token.
    wanted_pos = frozenset(pos_tags)
    doc = nlp(text)
    return [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and token.pos_ in wanted_pos
    ]

# Run the spaCy pipeline on every resume and keep the result as a new column.
resumes_df['spacy_lemmas'] = resumes_df['Resume_Text'].apply(preprocess_spacy)

# Flatten the per-resume lemma lists into a single corpus-wide list.
all_spacy_lemmas = []
for resume_lemmas in resumes_df['spacy_lemmas']:
    all_spacy_lemmas.extend(resume_lemmas)

# Count the lemmas across all resumes and keep the ten most frequent.
fdist_spacy = nltk.FreqDist(all_spacy_lemmas)
top_10_spacy = fdist_spacy.most_common(10)

print(
    "\n--- spaCy Preprocessing Results ---",
    "Processed Lemmas for first resume:",
    resumes_df['spacy_lemmas'].iloc[0],
    sep="\n",
)
print("\nTop 10 frequent lemmas:", top_10_spacy, sep="\n")


--- spaCy Preprocessing Results ---
Processed Lemmas for first resume:
['software', 'engineer', 'year', 'experience', 'skill', 'include', 'work', 'project', 'involve', 'machine', 'learning', 'web', 'development', 'team', 'player']

Top 10 frequent lemmas:
[('experience', 2), ('skill', 2), ('machine', 2), ('have', 2), ('marketing', 2), ('software', 1), ('engineer', 1), ('year', 1), ('include', 1), ('work', 1)]
