# **NLP Project: Dataset Preprocessing**

*Master in Machine Learning for Health, 2023~2024*

*Authors: Daniel Corrales, Jaime Fernández & Rafael Rodríguez*

---

In [None]:
import os

import numpy as np
import pandas as pd
import spacy

In [None]:
# Mount Google Drive so the project folder is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load Colab's data_table extension for a richer, interactive DataFrame display.
%load_ext google.colab.data_table

In [None]:
# Download the small English spaCy pipeline that is loaded below as `nlp`.
!python -m spacy download en_core_web_sm

In [None]:
# Shared spaCy pipeline; remove_salary and process_salary both call it.
nlp = spacy.load('en_core_web_sm')

The raw scraped data has to be further preprocessed:
1. Clean DataFrame: remove duplicate rows and rows containing NaN values.
2. Remove salary information from description column.
3. Convert the pay to salary per year and create a new column `avg_pay`, which is the target variable to be predicted by the NLP model.
4. Remove the `pay` column.
5. Combine all info into the same text with format:

  `Job title: <title>. Company name: <company>. Location: <location>. Job type: <job type>. <description>`

### Data Loading

In [None]:
# Folder on the mounted Drive that holds the raw scraped CSV files.
path = '/content/drive/MyDrive/NLP/Project/'

# The "Save DataFrame" cell writes 'jobs_processed.csv' into this same folder.
# Skip it here: otherwise a second run would concatenate the processed output
# with the raw data, the mismatched columns would be filled with NaN, and
# dropna() would then discard the rows.
processed_filename = 'jobs_processed.csv'

# sorted() makes the concatenation order deterministic (os.listdir order is
# arbitrary), so drop_duplicates() keeps the same rows on every run.
csv_files = sorted(
    filename for filename in os.listdir(path)
    if filename.endswith('.csv') and filename != processed_filename
)
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No raw .csv files found in {path}")

dfs = [pd.read_csv(os.path.join(path, filename)) for filename in csv_files]

df = pd.concat(dfs, ignore_index=True)
print(f"Total numer of jobs before cleaning: {df.shape[0]}")

(13, 6)


In [None]:
df.head(1)

Unnamed: 0,title,company_name,location,pay,job_type,description
0,Silicon Engineer: Opportunities for University...,Microsoft,"Raleigh, NC","$76,400 - $151,800 por año",Full-time,"Come build community, explore your passions an..."


### 1. Clean DataFrame

In [None]:
# Drop exact duplicate rows first, then every row that still has a missing value.
df = df.drop_duplicates().dropna()
print(f"Total numer of jobs after cleaning: {len(df)}")

### 2. Remove Salary Information from Description

In [None]:
def remove_salary(text):
  """Return `text` with every sentence that contains a "$" removed.

  The text is split into sentences by the module-level spaCy pipeline `nlp`;
  the surviving sentences are re-joined with single spaces.
  """
  kept = []
  for sentence in nlp(text).sents:
    sentence_text = sentence.text
    if "$" in sentence_text:
      continue  # sentence mentions a dollar amount: treat it as salary info
    kept.append(sentence_text)

  return " ".join(kept)

In [None]:
# Apply the function to the 'description' column
df['description'] = df['description'].apply(remove_salary)
df.head(1)

Unnamed: 0,title,company_name,location,pay,job_type,description
0,Silicon Engineer: Opportunities for University...,Microsoft,"Raleigh, NC","$76,400 - $151,800 por año",Full-time,"Come build community, explore your passions an..."


### 3. Convert Salary to Salary/year and Create Target Column

In [None]:
def process_salary(text):
  """Turn a raw pay string into a single average yearly salary.

  The string is lower-cased and tokenized with the module-level spaCy
  pipeline `nlp`. Every numeric token (thousands separators "," removed) is
  collected; the smallest and largest amounts are scaled to a yearly figure
  according to the Spanish pay-period keyword found among the tokens, and
  their mean is returned.

  Args:
    text: pay description, e.g. "$76,400 - $151,800 por año".

  Returns:
    Mean of the annualised minimum and maximum amounts (numpy float).

  Raises:
    ValueError: if `text` contains no numeric amount.
  """
  hours_week = 40 # Assume typical working schedule
  weeks_year = 52
  days_week = 5

  tokens = [token.text for token in nlp(text.lower())]

  amounts = []
  for token_text in tokens:
    cleaned = token_text.replace(',', '')
    # Allow at most one decimal point so float() cannot fail on e.g. "1.2.3".
    if cleaned.replace('.', '', 1).isdigit():
      amounts.append(float(cleaned))

  if not amounts:
    raise ValueError(f"No numeric amount found in pay string: {text!r}")

  # Local names chosen so the builtins min/max are not shadowed.
  low, high = min(amounts), max(amounts)

  # (keywords, multiplier to reach a yearly amount), checked in this order.
  # No match means the amount is taken to be yearly already.
  period_factors = (
    (('hora',), hours_week * weeks_year),
    (('mes',), 12),
    (('dia', 'día'), days_week * weeks_year),
    (('semana',), weeks_year),  # weekly pay was previously treated as yearly
  )
  factor = 1
  for keywords, per_year in period_factors:
    if any(keyword in tokens for keyword in keywords):
      factor = per_year
      break

  return np.mean([low * factor, high * factor])

In [None]:
# process_salary returns one number per row, so a plain map is all that is
# needed (zip(*...) would try to unpack each float and raise a TypeError).
df['avg_pay'] = df['pay'].map(process_salary)
df.head(1)

Unnamed: 0,title,company_name,location,pay,job_type,description,min_pay,max_pay
0,Silicon Engineer: Opportunities for University...,Microsoft,"Raleigh, NC","$76,400 - $151,800 por año",Full-time,"Come build community, explore your passions an...",76400.0,151800.0


### 4. Remove Pay Column

In [None]:
# Discard the raw pay text column and preview the first row.
df = df.drop('pay', axis='columns')
df.head(1)

Unnamed: 0,title,company_name,location,job_type,description,min_pay,max_pay
0,Silicon Engineer: Opportunities for University...,Microsoft,"Raleigh, NC",Full-time,"Come build community, explore your passions an...",76400.0,151800.0


### 5. Join Information in One Text

In [None]:
# Merge every field of a row into one text. Adjacent f-strings inside
# parentheses are joined with no extra whitespace, unlike a backslash
# continuation inside a string literal, which keeps the next line's
# indentation as part of the text.
df['full_info'] = df.apply(
    lambda row: (
        f"Job title: {row['title']}. Company name: {row['company_name']}. "
        f"Location: {row['location']}. Job type: {row['job_type']}. {row['description']}"
    ),
    axis=1,
)
df.head(1)

Unnamed: 0,title,company_name,location,job_type,description,min_pay,max_pay,full_info
0,Silicon Engineer: Opportunities for University...,Microsoft,"Raleigh, NC",Full-time,"Come build community, explore your passions an...",76400.0,151800.0,Job title: Silicon Engineer: Opportunities for...


### Save DataFrame

In [None]:
# Write the processed dataset into the project folder, without the index column.
df.to_csv(f"{path}jobs_processed.csv", index=False)