In [1]:
import pandas as pd

In [2]:
# Load the resume dataset from the working directory. df_raw is kept as the
# untouched read; df is an independent copy used by the cells below.
df_raw = pd.read_csv("resume_data.csv")
df = df_raw.copy()

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(9544, 35)

In [4]:
# Per-column non-null counts and dtypes, to see which fields are sparsely filled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9544 entries, 0 to 9543
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   address                              784 non-null    object 
 1   career_objective                     4740 non-null   object 
 2   skills                               9488 non-null   object 
 3   educational_institution_name         9460 non-null   object 
 4   degree_names                         9460 non-null   object 
 5   passing_years                        9460 non-null   object 
 6   educational_results                  9460 non-null   object 
 7   result_types                         9460 non-null   object 
 8   major_field_of_studies               9460 non-null   object 
 9   professional_company_names           9460 non-null   object 
 10  company_urls                         9460 non-null   object 
 11  start_dates                   

In [5]:
# Create a working copy: every cleaning step below writes to df_work, so df and
# df_raw keep the data exactly as loaded.
df_work = df.copy()
print(df_work.shape)

(9544, 35)


In [6]:
# List the column labels as loaded, to spot naming problems before cleaning
# (the output shows a '\ufeff' byte-order mark in front of job_position_name).
print(df_work.columns.tolist())

['address', 'career_objective', 'skills', 'educational_institution_name', 'degree_names', 'passing_years', 'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 'company_urls', 'start_dates', 'end_dates', 'related_skils_in_job', 'positions', 'locations', 'responsibilities', 'extra_curricular_activity_types', 'extra_curricular_organization_names', 'extra_curricular_organization_links', 'role_positions', 'languages', 'proficiency_levels', 'certification_providers', 'certification_skills', 'online_links', 'issue_dates', 'expiry_dates', '\ufeffjob_position_name', 'educationaL_requirements', 'experiencere_requirement', 'age_requirement', 'responsibilities.1', 'skills_required', 'matched_score']


In [7]:
# Strip the stray byte-order mark (U+FEFF) from every column label instead of
# renaming a single hard-coded column, so any other BOM-prefixed header is fixed
# too. Rebinding (rather than inplace=True) keeps the cell safe to re-run.
df_work = df_work.rename(
    columns=lambda label: label.replace('\ufeff', '') if isinstance(label, str) else label
)

In [8]:
# Re-list the column labels to confirm the BOM-prefixed name was replaced by
# 'job_position_name'.
print(df_work.columns.tolist())

['address', 'career_objective', 'skills', 'educational_institution_name', 'degree_names', 'passing_years', 'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 'company_urls', 'start_dates', 'end_dates', 'related_skils_in_job', 'positions', 'locations', 'responsibilities', 'extra_curricular_activity_types', 'extra_curricular_organization_names', 'extra_curricular_organization_links', 'role_positions', 'languages', 'proficiency_levels', 'certification_providers', 'certification_skills', 'online_links', 'issue_dates', 'expiry_dates', 'job_position_name', 'educationaL_requirements', 'experiencere_requirement', 'age_requirement', 'responsibilities.1', 'skills_required', 'matched_score']


In [9]:
import ast

def safe_parse_list(x):
    """Convert a stringified list into a flat Python list of clean strings.

    Parameters
    ----------
    x : object
        A cell value: a real list, a string such as ``"['a', 'b']"``, a plain
        string, or anything else (e.g. NaN / None).

    Returns
    -------
    list
        * ``x`` itself when it is already a list (returned unchanged).
        * For a string starting with ``[``: the parsed items in order, with
          nested lists/tuples flattened, strings stripped of surrounding
          whitespace and quote characters, ``None`` entries skipped and any
          other scalar converted with ``str()``. ``[]`` when the string is not
          a valid Python literal.
        * ``[stripped_text]`` for any other non-blank string.
        * ``[]`` for everything else.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []

    text = x.strip()
    if not text:
        return []
    if not text.startswith('['):
        return [text]

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are treated
        # as "unparseable"; anything else is a real bug and should surface.
        return []

    def _flatten(value):
        """Yield cleaned string items from an arbitrarily nested literal."""
        if isinstance(value, (list, tuple)):
            for item in value:
                yield from _flatten(item)
        elif isinstance(value, str):
            yield value.strip().strip("'\"")
        elif value is not None:
            # A single None / number used to make .strip() raise and wipe out
            # the whole list; keep the usable items instead.
            yield str(value)

    return list(_flatten(parsed))

# Columns whose cells look like stringified lists
list_cols = [
    'skills',
    'skills_required',
    'related_skils_in_job',
    'educational_institution_name',
    'degree_names',
    'major_field_of_studies',
    'responsibilities',
]

# Parse each listed column in place; names absent from the frame are skipped
for col in list_cols:
    if col not in df_work.columns:
        continue
    df_work[col] = df_work[col].apply(safe_parse_list)

print("Parsed list-like columns successfully.")

Parsed list-like columns successfully.


In [10]:
# Since we need keywords, merge skills, skills_required and related_skils_in_job
def merge_all_skills(row):
    """Return the row's skills as one lower-cased, de-duplicated list.

    Reads the 'skills', 'skills_required' and 'related_skils_in_job' entries of
    ``row`` (in that order), ignoring entries that are missing or not lists and
    list items that are not strings. Each skill is lower-cased and stripped;
    blanks are dropped and only the first occurrence of each skill is kept.
    """
    source_columns = ('skills', 'skills_required', 'related_skils_in_job')
    raw_skills = (
        entry
        for name in source_columns
        if name in row and isinstance(row[name], list)
        for entry in row[name]
        if isinstance(entry, str)
    )
    normalized = (entry.lower().strip() for entry in raw_skills)
    # dict keys keep insertion order, giving an order-preserving de-duplication
    return list(dict.fromkeys(skill for skill in normalized if skill))

# Build one merged skill list per resume; axis=1 hands each row to merge_all_skills.
df_work['skills_combined'] = df_work.apply(merge_all_skills, axis=1)

In [11]:
# Preview the merged skills next to the job title for the first ten rows.
df_work[['job_position_name', 'skills_combined']].head(10)

Unnamed: 0,job_position_name,skills_combined
0,Senior Software Engineer,"[big data, hadoop, hive, python, mapreduce, sp..."
1,Machine Learning (ML) Engineer,"[data analysis, data analytics, business analy..."
2,"Executive/ Senior Executive- Trade Marketing, ...","[software development, machine learning, deep ..."
3,Business Development Executive,"[accounts payables, accounts receivables, acco..."
4,Senior iOS Engineer,"[analytical reasoning, compliance testing know..."
5,AI Engineer,"[microsoft applications, network security, net..."
6,Senior iOS Engineer,"[machine learning, linear regression, ridge re..."
7,Senior iOS Engineer,"[maintenance, corrective maintenance, document..."
8,Mechanical Engineer,"[python, machine learning, mysql, data mining,..."
9,Business Development Executive,"[django, python, relational databases, restapi..."


In [12]:
# combining the most useful text fields into one column to make it easier to clean and process later
def join_list(val):
    """Flatten a cell to text: lists are space-joined, strings pass through,
    anything else becomes an empty string."""
    if isinstance(val, str):
        return val
    return ' '.join(val) if isinstance(val, list) else ''

# Text pieces that make up the combined resume text, one Series per source column
objective_text = df_work['career_objective'].fillna('')
skills_text = df_work['skills_combined'].apply(join_list)
field_text = df_work['major_field_of_studies'].apply(join_list)
degree_text = df_work['degree_names'].apply(join_list)
duties_text = df_work['responsibilities'].apply(join_list)

# Concatenate with single spaces, then collapse whitespace runs and trim the ends
raw_combined = (
    objective_text + ' ' + skills_text + ' ' + field_text + ' ' + degree_text + ' ' + duties_text
)
df_work['combined_text'] = raw_combined.str.replace(r'\s+', ' ', regex=True).str.strip()

In [13]:
# Preview the combined free text next to the job title.
df_work[['job_position_name', 'combined_text']].head(5)

Unnamed: 0,job_position_name,combined_text
0,Senior Software Engineer,Big data analytics working and database wareho...
1,Machine Learning (ML) Engineer,Fresher looking to join as a data analyst and ...
2,"Executive/ Senior Executive- Trade Marketing, ...",software development machine learning deep lea...
3,Business Development Executive,To obtain a position in a fast-paced business ...
4,Senior iOS Engineer,Professional accountant with an outstanding wo...


In [14]:
import re

# Defining a function to clean resume text
def clean_text(text):
    """Normalise free text to lower-case words made only of ASCII letters.

    Steps, in order: newlines/tabs become spaces, URLs are removed, every
    character that is not an ASCII letter or a space becomes a space,
    whitespace runs are collapsed, and the result is lower-cased and trimmed.

    Parameters
    ----------
    text : object
        The text to clean. Non-string values (e.g. NaN) yield ``""``.

    Returns
    -------
    str
        The cleaned text. Digits and symbols are dropped, so e.g. "C++"
        becomes "c".
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[\n\r\t]+', ' ', text)          # remove newlines/tabs
    # Only real URLs are removed: a scheme ("http://", "https://") or a
    # "www." prefix is required, in any letter case. The previous pattern
    # (http\S+|www\S+) also deleted plain keywords such as "https" or
    # "httpclient".
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)         # keep only letters and spaces
    text = re.sub(r'\s+', ' ', text)                # collapse multiple spaces
    return text.lower().strip()                     # normalize to lowercase

# Apply cleaning to create new column (combined_text itself is left unchanged)
df_work['cleaned_text'] = df_work['combined_text'].apply(clean_text)

# Quick check: job title next to the cleaned text for the first five rows
df_work[['job_position_name', 'cleaned_text']].head(5)

Unnamed: 0,job_position_name,cleaned_text
0,Senior Software Engineer,big data analytics working and database wareho...
1,Machine Learning (ML) Engineer,fresher looking to join as a data analyst and ...
2,"Executive/ Senior Executive- Trade Marketing, ...",software development machine learning deep lea...
3,Business Development Executive,to obtain a position in a fast paced business ...
4,Senior iOS Engineer,professional accountant with an outstanding wo...


In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download the necessary NLTK resources.
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so both are requested: 'punkt' for older
# installs, 'punkt_tab' for current ones. NOTE(review): on an NLTK version that
# does not know 'punkt_tab' the call is expected to report an error and return
# False rather than raise - confirm on the oldest version you support.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rochanehurst/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rochanehurst/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Create stopword set for filtering; tokenize_text tests every token against
# this English stopword set.
stop_words = set(stopwords.words('english'))

In [17]:
# Define a tokenizer
def tokenize_text(text):
    """Tokenize ``text`` with NLTK and drop stopwords and punctuation tokens.

    Non-string input yields ``''``. Otherwise the surviving tokens are returned
    joined by single spaces (a string, not a list). Stopword matching is
    case-insensitive; the punctuation check is a substring test against
    ``string.punctuation``.
    """
    if not isinstance(text, str):
        return ''
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue  # common English filler word
        if token in string.punctuation:
            continue  # punctuation token
        kept.append(token)
    return ' '.join(kept)

In [18]:
# Apply tokenization
df_work['tokens'] = df_work['cleaned_text'].apply(tokenize_text)

# Drop any rows that have empty tokens (if any).
# .copy() makes the filtered result an independent frame, so the column
# assignments made on df_work in later cells do not trigger pandas'
# SettingWithCopyWarning. The original index labels are kept on purpose.
df_work = df_work[df_work['tokens'].str.strip() != ''].copy()

# Quick check
df_work[['job_position_name', 'tokens']].head(10)

Unnamed: 0,job_position_name,tokens
0,Senior Software Engineer,big data analytics working database warehouse ...
1,Machine Learning (ML) Engineer,fresher looking join data analyst junior data ...
2,"Executive/ Senior Executive- Trade Marketing, ...",software development machine learning deep lea...
3,Business Development Executive,obtain position fast paced business office env...
4,Senior iOS Engineer,professional accountant outstanding work ethic...
5,AI Engineer,secure specialist desktop support network admi...
6,Senior iOS Engineer,machine learning linear regression ridge regre...
7,Senior iOS Engineer,maintenance corrective maintenance documentati...
8,Mechanical Engineer,certified data analyst degree electronics engi...
9,Business Development Executive,django python relational databases restapi git...


In [19]:
# Check dataset integrity: the row count shows whether the empty-token filter
# removed anything, and the column list should now end with the derived
# columns (skills_combined, combined_text, cleaned_text, tokens).
print("Dataset shape:", df_work.shape)
print("\nColumns:", list(df_work.columns))


Dataset shape: (9544, 39)

Columns: ['address', 'career_objective', 'skills', 'educational_institution_name', 'degree_names', 'passing_years', 'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 'company_urls', 'start_dates', 'end_dates', 'related_skils_in_job', 'positions', 'locations', 'responsibilities', 'extra_curricular_activity_types', 'extra_curricular_organization_names', 'extra_curricular_organization_links', 'role_positions', 'languages', 'proficiency_levels', 'certification_providers', 'certification_skills', 'online_links', 'issue_dates', 'expiry_dates', 'job_position_name', 'educationaL_requirements', 'experiencere_requirement', 'age_requirement', 'responsibilities.1', 'skills_required', 'matched_score', 'skills_combined', 'combined_text', 'cleaned_text', 'tokens']


In [20]:
# Check for missing values. Columns passed through safe_parse_list report 0
# because that function turns missing cells into empty lists, not NaN.
print("\nMissing values per column:\n")
print(df_work.isnull().sum())

# Preview random samples to confirm text content (random_state fixes the sample)
df_work[['job_position_name', 'tokens']].sample(5, random_state=42)


Missing values per column:

address                                8760
career_objective                       4804
skills                                    0
educational_institution_name              0
degree_names                              0
passing_years                            84
educational_results                      84
result_types                             84
major_field_of_studies                    0
professional_company_names               84
company_urls                             84
start_dates                              84
end_dates                                84
related_skils_in_job                      0
positions                                84
locations                                84
responsibilities                          0
extra_curricular_activity_types        6118
extra_curricular_organization_names    6118
extra_curricular_organization_links    6118
role_positions                         6118
languages                              8844
pro

Unnamed: 0,job_position_name,tokens
5470,Data Engineer,assembly autocad automation budgets budget cnc...
8982,HR Officer,client engagement communication analytical bus...
4704,Mechanical Engineer,data analytics natural language processing pro...
1954,Civil Engineer,microsoft excel project visio adobe engineerin...
3146,AI Engineer,post graduate knowledge industry level machine...


In [21]:
# Check duplicates: count rows whose (job_position_name, tokens) pair repeats
# an earlier row. Only the two listed columns are compared.
duplicate_count = df_work.duplicated(subset=['job_position_name', 'tokens']).sum()
print(f"{duplicate_count} potential duplicate rows.")

0 potential duplicate rows.


In [22]:
# Remove extra spaces and short filler words (if any slipped through):
# whitespace is normalised first, then only words longer than two characters are kept
normalized_tokens = df_work['tokens'].str.replace(r'\s+', ' ', regex=True).str.strip()
df_work['tokens'] = normalized_tokens.apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2)
)

In [24]:
# Combine tokens, skills, and degree info into one text column.
# This helps the model understand both what the person wrote
# in their resume (like experience and projects) and the specific
# skills or degrees they listed.
# If we only used tokens, it might miss some important keywords
# like "python" or "machine learning" that appear in the skills column.
# Putting everything together gives the model a fuller picture
# of what each resume is really about.
#
# join_list (defined above) replaces the two inline lambdas that duplicated it,
# and the whitespace is normalised the same way as for combined_text so an
# empty skills/degree list no longer leaves doubled or trailing spaces.
df_work['model_text'] = (
    df_work['tokens'] + ' ' +
    df_work['skills_combined'].apply(join_list) + ' ' +
    df_work['degree_names'].apply(join_list)
).str.replace(r'\s+', ' ', regex=True).str.strip()

In [27]:
# Quick check to make sure model_text looks correct; the same random_state as
# the earlier tokens preview selects the same rows for comparison.
df_work[['job_position_name', 'model_text']].sample(5, random_state=42)

Unnamed: 0,job_position_name,model_text
5470,Data Engineer,assembly autocad automation budgets budget cnc...
8982,HR Officer,client engagement communication analytical bus...
4704,Mechanical Engineer,data analytics natural language processing pro...
1954,Civil Engineer,microsoft excel project visio adobe engineerin...
3146,AI Engineer,post graduate knowledge industry level machine...


In [31]:
# The dataset now has all (hopefully!) the important information cleaned and ready
# for modeling: skills, degrees, job titles, and processed text.
# We’re saving it so we can use it later for TF-IDF and model training.
# The file is written to the current working directory without the index.
# NOTE(review): list-valued columns (e.g. skills_combined) are presumably stored
# as their text representation in the CSV and will need re-parsing after
# read_csv - confirm in the consuming notebook.

output_path = "Cleaned_Resume_Dataset.csv"
df_work.to_csv(output_path, index=False, encoding='utf-8')

print(f"Cleaned dataset saved successfully as: {output_path}")
print("Total rows:", len(df_work))

Cleaned dataset saved successfully as: Cleaned_Resume_Dataset.csv
Total rows: 9544


In [32]:
# Save a smaller "model-ready" version of the dataset

# This version keeps only the columns that will be useful later:
# - job_position_name = what role the resume belongs to (target label)
# - skills_combined = for reference or to build skill-based recommendations
# - model_text = the final cleaned and combined text we’ll feed into the model
# We’re dropping the rest to make the file lighter and easier to work with.

model_columns = ['job_position_name', 'skills_combined', 'model_text']
# Column selection only: df_work itself keeps all of its columns.
df_model = df_work[model_columns]

# Written next to the full cleaned file, again without the index column.
output_path_model = "Cleaned_Resume_ModelReady.csv"
df_model.to_csv(output_path_model, index=False, encoding='utf-8')
