In [1]:
import re
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [2]:
# Given an input string, this function replaces every single-letter word with an empty string,
# effectively removing those words from the resulting string.

def mask_single_letters(text):
    """Drop every stand-alone single ASCII letter from ``text``.

    A letter is stand-alone when it has a word boundary on both sides.
    Only the letter is removed; any surrounding whitespace stays in place.
    """
    lone_letter = re.compile(r'\b([a-zA-Z]{1})\b')
    return lone_letter.sub("", text)

In [3]:
# Given an input string, this function masks escape sequences that appear as literal text
# (a backslash followed by x<nn>, a, b, t or n), e.g. left over from a printed bytes/repr dump.

def mask_escape_sequences(text):
    """Mask escape sequences that appear as literal text in ``text``.

    The input is expected to contain the escapes spelled out as characters
    (a real backslash followed by a letter), not the control characters
    themselves.

    Args:
        text: String to filter.

    Returns:
        A copy of ``text`` where each ``\\x<hh>`` hex escape is replaced by a
        single space, runs of ``\\a`` / ``\\b`` are removed, and runs of
        ``\\t`` / ``\\n`` are each collapsed to a single space.
    """
    # Hex escapes are a backslash, 'x' and exactly two hex digits (e.g. \xe2,
    # \x9d). Matching the digits as a hex pair avoids leaving half of an escape
    # such as the trailing 'd' of \x9d behind.
    filtered_string = re.sub(r'\\x[0-9a-fA-F]{2}', " ", text)

    # The non-capturing group repeats the whole two-character escape. Repeating
    # only the letter would also swallow the first letters of a following word
    # (e.g. "\nnext" would lose its leading 'n').
    filtered_string = re.sub(r'(?:\\a)+', "", filtered_string)
    filtered_string = re.sub(r'(?:\\b)+', "", filtered_string)
    filtered_string = re.sub(r'(?:\\t)+', " ", filtered_string)
    filtered_string = re.sub(r'(?:\\n)+', " ", filtered_string)

    return filtered_string

In [4]:
# Mask all numbers
def mask_numbers(text):
    """Return ``text`` with every run of ASCII digits removed."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub("", text)

In [5]:
# Build a string containing only the words that consist entirely of the letters a-z / A-Z, separated by single spaces
def mask_non_alphabet_words(text):
    """Keep only the purely alphabetic words of ``text``.

    A word qualifies when it is a run of ASCII letters with a word boundary on
    both sides. The qualifying words are returned joined by single spaces.
    """
    alphabetic_words = re.finditer(r'\b([a-zA-Z]+)\b', text, re.M)
    return ' '.join(match.group(1) for match in alphabetic_words)

In [6]:
# Mask email addresses
def mask_email(text):
    """Return ``text`` with every email-address-shaped token removed.

    Only the address itself is deleted; the whitespace around it is kept.
    """
    email_pattern = re.compile(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')
    return email_pattern.sub("", text)

In [7]:
def stem(text):
    """Stem each whitespace-separated word of ``text`` with ``LancasterStemmer``.

    Returns the stemmed words joined by single spaces.
    """
    lancaster = LancasterStemmer()
    return ' '.join(lancaster.stem(token) for token in text.split())

In [8]:
def lemmatize(text):
    """Lemmatize each whitespace-separated word of ``text`` as a verb.

    Every word is passed to ``WordNetLemmatizer.lemmatize`` with ``pos='v'``;
    the results are returned joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token, pos='v') for token in text.split())

In [9]:
# Provided a string, this method creates and returns a set containing all the unique words in the passed string.

def unique_words_str(text):
    """Return the set of distinct whitespace-separated words in ``text``."""
    return set(text.split())

In [10]:
# Provided a list of strings, this method creates and returns a set containing all the unique words in the passed list.

def unique_words_list(data):
    """Return the set of distinct items found in the iterable ``data``."""
    return set(data)

In [11]:
def mask_stopwords(text, words_to_remove, case_sensitive=True):
    """Remove unwanted words from ``text``.

    Args:
        text: String to filter; it is split on whitespace.
        words_to_remove: Iterable of words to drop.
        case_sensitive: When truthy (the default) a word is dropped only on an
            exact match. When falsy, both the word and the entries of
            ``words_to_remove`` are lowercased before comparing.

    Returns:
        The surviving words, in their original casing and order, joined by
        single spaces.
    """
    if case_sensitive:
        blocked = set(words_to_remove)
        kept = [word for word in text.split() if word not in blocked]
    else:
        # Lowercase both sides of the comparison; comparing against the raw
        # list would miss entries such as "The" when the text has "the".
        blocked = {item.lower() for item in words_to_remove}
        kept = [word for word in text.split() if word.lower() not in blocked]

    return ' '.join(kept)

In [12]:
def clean(text):
    """Run the full text-cleaning pipeline on a single document.

    Steps, in order: mask literal escape sequences, email addresses and
    numbers; keep only purely alphabetic words; drop single letters; remove
    English stopwords (ignoring case); lemmatize the remaining words.

    Args:
        text: Raw document text. Non-string values (e.g. the NaN pandas passes
            for a missing cell) are treated as an empty document.

    Returns:
        The cleaned text as a space-separated string; ``''`` when the input is
        not a string.
    """
    # The regex helpers only accept str; a missing cell would raise TypeError.
    if not isinstance(text, str):
        return ''

    filtered_text = mask_escape_sequences(text)
    filtered_text = mask_email(filtered_text)
    filtered_text = mask_numbers(filtered_text)
    filtered_text = mask_non_alphabet_words(filtered_text)
    filtered_text = mask_single_letters(filtered_text)
    # The NLTK English list is lowercase, so match case-insensitively to also
    # drop capitalised stopwords such as "The" or "And".
    filtered_text = mask_stopwords(filtered_text, stopwords.words('english'),
                                   case_sensitive=False)
    # stem() is available as a more aggressive alternative to lemmatize().
    filtered_text = lemmatize(filtered_text)

    return filtered_text

In [13]:
import pandas as pd
# Load the raw resumes from resume_data.csv, using the 'ID' column as the index.
dataset = pd.read_csv("resume_data.csv", index_col=['ID'])

In [14]:
# Replace each entry of the 'Resume' column with the result of clean().
# NOTE: this overwrites the column in place, so re-running the cell cleans already-cleaned text.
dataset['Resume'] = dataset['Resume'].map(clean)

In [15]:
# Keep only the rows whose cleaned 'Resume' is not the empty string.
dataset = dataset[dataset['Resume'] != '']

In [16]:
# Write the cleaned dataset (including its index) to resume_cleaned.csv.
dataset.to_csv('resume_cleaned.csv')