Import Libraries 

In [2]:
import pandas as pd
import spacy
import re


Load SpaCy Model

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")




Load the Raw Resume Dataset

In [4]:
# Load your raw dataset (assumes the original CSV file)
df = pd.read_csv("../data/resumes_dataset.csv")

# Drop empty rows
df = df.dropna()
df = df[df['Resume'].str.strip() != '']
df.reset_index(drop=True, inplace=True)

df.head()


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


Define Resume Cleaning Function Using SpaCy

This function takes raw resume text and returns a clean, normalized version by:

Lowercasing

Removing special characters and numbers

Tokenizing the text

Removing stopwords

Lemmatizing each word



In [5]:
def clean_resume_spacy(text):
    # Step 1: lowercase and remove unwanted characters
    text = text.lower()
    text = re.sub(r'\n|\r|\t', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Step 2: process text with spaCy
    doc = nlp(text)

    # Step 3: remove stopwords and short tokens, lemmatize
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and len(token.text) > 2
    ]

    return ' '.join(tokens)


Apply the Cleaning Function

In [6]:
df['Cleaned_Resume'] = df['Resume'].apply(clean_resume_spacy)


View Cleaned Sample Output

In [7]:
df[['Resume', 'Cleaned_Resume']].head()


Unnamed: 0,Resume,Cleaned_Resume
0,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...
1,Education Details \r\nMay 2013 to May 2017 B.E...,education detail uit rgpv data sci...
2,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...
3,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill python sap hana tabl...
4,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad har...


Save the Cleaned Dataset

In [8]:
df.to_csv("../data/resumes_cleaned.csv", index=False)
print("✅ Cleaned resumes saved!")


✅ Cleaned resumes saved!
