# 1. Imports

In [1]:
import re

#import nltk
#Uncomment for first run to download datasets.
#nltk.download()

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from io import StringIO

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.decomposition import TruncatedSVD

from sklearn import metrics

# 2. Utilty and Common Functions (Please document sufficiently)

In [2]:
# Given an input string, this method performs the folowing operations:
# 1. Mask string fragments like \x<nn>
# 2. Mask string fragments for escape sequences like \a \b \t \n
# 3. Mask all numbers
# 4. Create a list of all the words containing letters from a-z and A-Z
# 5. Return the final filtered list of words.

def clean(text):
    # Mask string fragments like \x<nn>
    filtered_string = re.sub(r'\\x[a-z]{0,2}[0-9]{0,2}', "", text)
    
    # Mask string fragments for escape sequences like \a \b \t \n
    filtered_string = re.sub(r'\\a+', "", filtered_string)
    filtered_string = re.sub(r'\\b+', "", filtered_string)
    filtered_string = re.sub(r'\\t+', " ", filtered_string)
    filtered_string = re.sub(r'\\n+', " ", filtered_string)
    
    # Mask all numbers
    filtered_string = re.sub(r'[0-9]+', "", filtered_string)
    
    # Remove single letter words.
    filtered_string = mask_single_letters(filtered_string)
    
    # Create a list of all the words containing letters from a-z and A-Z
    filtered_list = re.findall(r"\b([a-zA-Z]+)\b", filtered_string, re.M)
    
    # Return the final filtered List
    return filtered_list

In [3]:
# Give an input string this method replaces all the single letter words with empty string,
# basically removing them from the resultant string.

def mask_single_letters(text):
    filtered_string = re.sub(r'\b([a-zA-Z]{1})\b', "", text)
    return filtered_string

In [4]:
# Provided a string, this method creates and returns a set containing all the unique words
# in the passed string.

def unique_words_str(text):
    result = set([])
    for word in text.split():
        result.add(word)
    
    return result

In [5]:
# Provided a list of strings, this method creates and returns a set containing all the unique words
# considering all the strings in the passed list.

def unique_words_list(data):
    result = set([])
    for text in data:
        result.update(unique_words_str(text))
    
    return result

# 3. Read Dataset and Clean

In [6]:
df = pd.read_csv("resume_data.csv")

In [7]:
df.sample(10)

Unnamed: 0,ID,Category,Resume
505,506,Health & Fitness,"b'Francis Preve\n1111 Any Street, New York, NY..."
203,204,Information Technology,"b""SOFTWARE ENGINEER Resume Sample www.timesres..."
569,570,Agricultural,b'Habitat Restoration Specialist Resume Templa...
59,60,Designing,"b'Brian Brown\n\n999 Main Street\nAny Town, NY..."
223,224,Information Technology,b'Gary White\nData analyst\nAREAS OF EXPERTISE...
764,765,Automobile,b'Abhilasha Sinha\n\nB.Tech (Hons.) Automobile...
91,92,Designing,b'David Cole\n\nweb developer\n843-388-2405 \x...
161,162,Managment,b'MR\n99 Example Street\n\nN A M E\n+60 12\n\n...
956,957,Engineering,b'ALEX SMITH\n128 Chemical Drive\nThunder Bay ...
406,407,Advocate,b'NAME:\n\nJohn Russell Hicks\n\nBIRTHPLACE:\n...


> ### Drop all attributes except Resume

In [8]:
df = df['Resume']

In [9]:
print("Dataset Size: %d" %(df.shape[0]))

Dataset Size: 1219


In [10]:
df.sample(5)

450    b'Jenny Smith\nJsmith1234@utk.edu\nCurrent Add...
476    b'R o b yn Y o u n g\n15 Applegarth St. \xef\x...
467    b"School Business Manager Resume\n\nGrover Qui...
919    b'Miles Peters\n1 Main Street, New Cityland, C...
504    b'Office Assistant Resume\nSeattle, WA 98117\n...
Name: Resume, dtype: object

> ### Pick one sample record

In [11]:
sample = df.iloc[10]
print(sample)

b'RESUME WORLD INC.\n1200 Markham Road, Suite 108, Toronto, Ontario M1H 3C3\nTel: (416) 438.3606 / E-mail: info@resumworld.ca\nCAREER PROFILE\nA Human Resources & Payroll Manager with over 15 years of progressive experience augmented by a\nstrong post-secondary background in Human Resources, Accounting and Business Administration.\nExperienced in Payroll Administration, Benefits/Compensation Administration, Talent/Performance\nManagement, Recruitment/Selection, Training/Development, HR Policy and Organizational Design,\nContract Negotiation, Job Costing Analysis, People Management, Progress Improvement, and\nStrategic Planning. Sound knowledge of Labour Relations, Occupational Health & Safety, Pay Equity\nand other related labour laws. Proven ability to function as a Strategic HR Business Partner and develop\nand implement successful human resources management strategies to support corporate mandate.\nCreative and innovative thinker with effective human resources management and goal se

> ### Cleanup

In [12]:
cleaned = clean(sample)
print(cleaned)

['RESUME', 'WORLD', 'INC', 'Markham', 'Road', 'Suite', 'Toronto', 'Ontario', 'MH', 'Tel', 'mail', 'info', 'resumworld', 'ca', 'CAREER', 'PROFILE', 'Human', 'Resources', 'Payroll', 'Manager', 'with', 'over', 'years', 'of', 'progressive', 'experience', 'augmented', 'by', 'strong', 'post', 'secondary', 'background', 'in', 'Human', 'Resources', 'Accounting', 'and', 'Business', 'Administration', 'Experienced', 'in', 'Payroll', 'Administration', 'Benefits', 'Compensation', 'Administration', 'Talent', 'Performance', 'Management', 'Recruitment', 'Selection', 'Training', 'Development', 'HR', 'Policy', 'and', 'Organizational', 'Design', 'Contract', 'Negotiation', 'Job', 'Costing', 'Analysis', 'People', 'Management', 'Progress', 'Improvement', 'and', 'Strategic', 'Planning', 'Sound', 'knowledge', 'of', 'Labour', 'Relations', 'Occupational', 'Health', 'Safety', 'Pay', 'Equity', 'and', 'other', 'related', 'labour', 'laws', 'Proven', 'ability', 'to', 'function', 'as', 'Strategic', 'HR', 'Business', 

In [13]:
print('Original Record Length: %d, Unique Count: %d' %(len(sample), len(unique_words_str(sample))))

Original Record Length: 8202, Unique Count: 564


In [14]:
print('Cleaned Record Length: %d, Unique Count: %d' %(len(cleaned), len(unique_words_list(cleaned))))

Cleaned Record Length: 956, Unique Count: 517


In [15]:
print(unique_words_list(cleaned))

{'decision', 'MH', 'savings', 'strategic', 'existing', 'Handle', 'Prepared', 'advance', 'Systems', 'competency', 'Hamilton', 'values', 'preparation', 'the', 'claims', 'expectations', 'accounting', 'profiles', 'Implemented', 'DICKSON', 'setting', 'objective', 'monitor', 'enabling', 'progressive', 'performance', 'PROFILE', 'Present', 'Analyst', 'develop', 'ResumeWorld', 'issues', 'boards', 'IT', 'linking', 'warehouse', 'DataTrak', 'laws', 'Payroll', 'over', 'working', 'necessary', 'Assessed', 'program', 'divisional', 'maintained', 'manage', 'Minimized', 'Labour', 'strong', 'goal', 'considerations', 'ca', 'Health', 'implement', 'risk', 'track', 'divisions', 'presentation', 'integrating', 'Strategic', 'CONSUMER', 'first', 'from', 'Vice', 'Administered', 'effective', 'return', 'required', 'retaining', 'provincial', 'ways', 'Banks', 'with', 'advancement', 'coordinating', 'procedures', 'quarterly', 'YORK', 'Compensation', 'related', 'thinker', 'planning', 'controlled', 'efficiently', 'functio

# 4. Analysis

> ### Clean the entire dataset

In [16]:
cleaned_resume = []
for resume in df:
    cleaned = clean(resume)
    cleaned_resume.append(' '.join(cleaned))

> ### Perform TFIDF

In [17]:
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             min_df=5,
                             norm='l2',
                             encoding='latin-1',
                             ngram_range=(1, 2),
                             stop_words='english')

> ### Extract Feature Matrix

In [18]:
features = vectorizer.fit_transform(cleaned_resume)

> ### Print a sample

In [19]:
print(features[0])

  (0, 10746)	0.0286111721772
  (0, 18887)	0.0483312488461
  (0, 14904)	0.0433324537172
  (0, 2247)	0.0323347833593
  (0, 2630)	0.0476715695886
  (0, 8137)	0.070093658146
  (0, 10003)	0.0475717716842
  (0, 8870)	0.0807149836419
  (0, 3666)	0.0211256938275
  (0, 1175)	0.0932243817773
  (0, 10110)	0.0912037689199
  (0, 14357)	0.0640906522981
  (0, 9514)	0.0896078117067
  (0, 17140)	0.0730860282014
  (0, 18256)	0.0530250610552
  (0, 15643)	0.0353863890601
  (0, 14550)	0.056437508345
  (0, 952)	0.0258923520882
  (0, 8153)	0.0360088866709
  (0, 9472)	0.0921872171762
  (0, 7684)	0.0227536222392
  (0, 18111)	0.0476715695886
  (0, 2029)	0.113878072103
  (0, 540)	0.0337460058339
  (0, 7730)	0.0244520937915
  :	:
  (0, 11834)	0.056437508345
  (0, 6783)	0.0495272992679
  (0, 6788)	0.051872120313
  (0, 1260)	0.0510203072207
  (0, 11464)	0.0482483679682
  (0, 12562)	0.0528137560001
  (0, 2028)	0.0510203072207
  (0, 14851)	0.056437508345
  (0, 11)	0.0538664151393
  (0, 4589)	0.0495272992679
  (0, 134

> ### Linear Semantic Analysis

In [20]:
lsa = TruncatedSVD(n_components=10, n_iter=100)
lsa.fit(features)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=100,
       random_state=None, tol=0.0)

> ### Print the discovered Concepts with top 10 associated words.

In [21]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
management
skills
experience
business
new
university
work
sales
development
team
 
Concept 1:
dayjob
dayjob com
template
info dayjob
coventry
cv
cv template
www dayjob
info
personal
 
Concept 2:
dayjob
dayjob com
students
student
school
use
cv
personal
teaching
academic
 
Concept 3:
web
design
developer
java
software
sql
html
programming
user
project
 
Concept 4:
key responsibilities
major tasks
tasks included
responsibilities major
responsible generating
qualification
resume template
excellent
good
analyzing monthly
 
Concept 5:
association
key responsibilities
major tasks
tasks included
responsibilities major
responsible generating
health
international
conference
law
 
Concept 6:
accounting
accountant
date
birth
india
date birth
financial
knowledge
bank
law
 
Concept 7:
key responsibilities
major tasks
tasks included
responsibilities major
responsible generating
analyzing monthly
generating analyzing
generating
analyzing
known fluent
 
Concept 8:
monash
monash edu
monash u