# Jared Ross

In [79]:
resume_text = "all_text1.txt"
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

def punct_space(token):
    "helper that elimates puncations and whitespace"
    return token.is_punct or token.is_space

def line_review(filename):
    "read reviews from the file and un-escapes orignal line break"
    with codecs.open(filename,encoding='utf_8') as f:
        for res in f:
            yield res.replace('\\n','\n')
 
def lemmatized_sentence_corpus(filename):
    "use spacy to parse, lemmatize and yield sentences"
    for parsed_res in nlp.pipe(line_review(filename),batch_size=10000,n_threads=3):
        for sent in parsed_res.sents:
            yield u' '.join([token.lemma_ for token in sent if not punct_space(token)])
            


In [80]:
unigram_filepath = 'unigram_sentences.txt'
normal_res = LineSentence(unigram_filepath)


In [81]:
from gensim.models import Word2Vec
word2vec_file = "vector_models"
res2vec = Word2Vec.load(word2vec_file)
res2vec.init_sims()

In [82]:
import pandas as pd

ordered = [(term,voc.index,voc.count)for term, voc in res2vec.wv.vocab.items()]
ordered_terms,term_indices,term_count = zip(*ordered)
word_vectors = pd.DataFrame(res2vec.wv.syn0[term_indices,:],index=ordered_terms)

word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
stakeholder,0.030171,-0.57894,-0.140312,-0.431368,0.13906,-0.184949,0.281638,-0.486964,-0.215997,0.317459,...,0.216009,-0.15512,0.385886,-0.430409,-0.447029,0.493696,0.024946,-0.05762,0.346376,-0.118032
umd,0.636582,0.145355,0.028948,-0.685784,0.183876,-0.192557,-0.1164,-0.037009,0.828778,0.78402,...,0.847879,0.130493,0.739471,0.38652,0.63024,-0.047898,0.055178,-0.832729,-1.177598,0.128166
gm,-0.527539,-0.081446,0.112848,-0.786913,0.202775,0.063773,0.429179,-0.203711,0.415994,0.222439,...,-0.277524,0.281285,0.228909,-0.335671,-0.228273,0.15785,0.122891,-0.223887,-0.111441,0.039465
distributed,-0.159051,0.488896,0.247771,-0.085302,-0.310324,0.278545,0.522985,0.363179,0.358134,0.166417,...,0.059012,0.595668,0.023368,-0.71738,-0.033351,0.241998,0.066923,0.613473,-0.285315,-0.787055
troubleshot,0.435049,-0.452891,0.243573,-0.324668,0.169153,-0.280413,0.20772,-0.788195,0.035686,-0.061155,...,0.072823,0.45767,0.753623,0.076704,0.286444,-0.241608,-0.254652,0.138938,-0.313926,-0.412263


In [83]:
res2vec.n_similarity(['python'],['java'])


0.27475053289729667

In [84]:
def get_related_terms(token,topn=10):
    "get the top n most similar terms"
    for word,similar in res2vec.wv.most_similar(positive = [token],topn=topn):
        print(word,':',round(similar,3))

In [85]:
get_related_terms('python')

3-sectored : 0.438
verilog : 0.421
bash : 0.402
languag : 0.387
ocaml : 0.374
languages : 0.369
html5 : 0.368
palisade : 0.366
stattools : 0.365
precisiontree : 0.363


In [86]:
get_related_terms('java')

j2me : 0.5
j2ee : 0.453
jsp : 0.432
j2se : 0.431
swings : 0.426
taglibs : 0.425
applets : 0.41
1.4 : 0.408
jdk1.6 : 0.395
html : 0.394


In [87]:
from sklearn.manifold import TSNE
import _pickle as pickle
import spacy

tsne_input = word_vectors.drop(spacy.en.English.Defaults.stop_words,errors='ignore')
tsne_input.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
stakeholder,0.030171,-0.57894,-0.140312,-0.431368,0.13906,-0.184949,0.281638,-0.486964,-0.215997,0.317459,...,0.216009,-0.15512,0.385886,-0.430409,-0.447029,0.493696,0.024946,-0.05762,0.346376,-0.118032
umd,0.636582,0.145355,0.028948,-0.685784,0.183876,-0.192557,-0.1164,-0.037009,0.828778,0.78402,...,0.847879,0.130493,0.739471,0.38652,0.63024,-0.047898,0.055178,-0.832729,-1.177598,0.128166
gm,-0.527539,-0.081446,0.112848,-0.786913,0.202775,0.063773,0.429179,-0.203711,0.415994,0.222439,...,-0.277524,0.281285,0.228909,-0.335671,-0.228273,0.15785,0.122891,-0.223887,-0.111441,0.039465
distributed,-0.159051,0.488896,0.247771,-0.085302,-0.310324,0.278545,0.522985,0.363179,0.358134,0.166417,...,0.059012,0.595668,0.023368,-0.71738,-0.033351,0.241998,0.066923,0.613473,-0.285315,-0.787055
troubleshot,0.435049,-0.452891,0.243573,-0.324668,0.169153,-0.280413,0.20772,-0.788195,0.035686,-0.061155,...,0.072823,0.45767,0.753623,0.076704,0.286444,-0.241608,-0.254652,0.138938,-0.313926,-0.412263


In [88]:
tsne_file = 'tsne_models'
tsne_vectors_file = 'tsne_vectors.npy'
tsne = TSNE()
tnse_vectors = tsne.fit_transform(tsne_input.values)
with open(tsne_file, 'wb') as f:
    pickle.dump(tsne,f)
    
pd.np.save(tsne_vectors_file,tnse_vectors)

with open(tsne_file,'rb') as f:
    tnse = pickle.load(f)
tnse_vectors = pd.np.load(tsne_vectors_file)

tnse_vectors = pd.DataFrame(tnse_vectors,index = pd.Index(tsne_input.index),columns=['x','y'])
tnse_vectors['word'] = tnse_vectors.index
print(tnse_vectors.head())

                     x         y         word
stakeholder  11.757952  0.286746  stakeholder
umd         -11.097395  4.400128          umd
gm           -6.868803 -4.331574           gm
distributed  -9.868099  0.031935  distributed
troubleshot  10.516786  4.142271  troubleshot


In [89]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [90]:
plot_data = ColumnDataSource(tnse_vectors)
# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, resize, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x', u'y', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = True
tsne_plot.yaxis.visible = True
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);

In [91]:
from sklearn.cluster import KMeans


word_vector = res2vec.wv.syn0
print(int(word_vector.shape[0]/50))
num_clusters = int(word_vector.shape[0]/50)

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vector )

111


In [92]:
word_centroid_map = dict(zip( res2vec.wv.index2word, idx ))
for cluster in range(0,num_clusters):
    #
    # Print the cluster number  
    print ("\nCluster:",cluster)
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for key,item in word_centroid_map.items():
        if( item == cluster ):
            words.append(key)
    print(words)


Cluster: 0
['publishing']

Cluster: 1
['convert', 'into', 'translate', 'break', 'understanding', 'transform', 'consistent']

Cluster: 2
['spss', 'exam', 'score', 'dubai', 'professionals', '100%', 'above', 'methods', 'logistics', 'social']

Cluster: 3
['creation']

Cluster: 4
['creativity', 'leading', 'dedicated', 'succeed', 'very', 'contracting', 'term', 'leadership', 'mission', 'proposition', 'qualifications', 'valuable', 'spot', 'working', 'effectively', 'remain', 'craft', 'advance', 'commit', 'advised', 'dedicate', 'choi', 'initiative', 'involvement', 'verbal', 'gain', 'while', 'flexible', 'audience', 'rapidly', 'extra', 'practical', 'often', 'tackle', 'oversight', 'seek', 'passion', 'innovation', 'exceptional', 'competent', 'stable', 'cooperative', 'goal', 'bring', 'clinician', 'freddie', 'adoption', 'success', 'analytical', 'accomplish', 'continue', 'workplace', 'ceo', 'concept', 'acuman', 'way', 'fashion', 'influence', 'practice', 'entrepreneurial', 'face', 'individual', 'respec

['ucm', 'discoverer', 'reuters', 'aqualogic', 'navigator', '8i/9i/10', '9i/10g/11', 'oracle', 'apex', 'deploying', 'composite', 'sqlserver', 'r3', 'gate', 'portals', 'portal', 'vordel', 'sandbox', '11', 'mediator', 'adapter', 'databases', 'installed', 'documentum', 'middleware', 'ofsll', 'metadata', '8i', 'oc4j', '11g/', 'coherence', '10g/11', 'hyperion', 'osb', 'owsm', 'pentaho', 'manger', 'sit', 'bpa', 'adaptive', 'rules', 'viewer', 'integrations', 'bpel', 'suit', 'bam', 'adf', '9i/10', '10', 'oas', 'grid', 'soa', 'cfr', 'b2b', 'fusion', 'ohs', '11.5.10', 'plsql', 'webcenter', 'r2', 'dr', 'jdeveloper', 'primavera', 'oracle11', 'db', 'scom', 'oem', 'publisher', 'products', 'guard', 'oracle10', 'stack', 'bpm', 'timesten', 'mds', 'spatial', 'suite', 'gateway', 'obiee', 'dashboards', 'p6', '10gas', 'label', 'rcu', 'soa11', 'gis', 'builder', 'utilities', 'apps', 'g', 'cot', 'em', 'forms', 'bus', 'fmw']

Cluster: 9
['bottleneck', 'timeout', 'tuning', 'adjust', 'count', 'heap', 'memory', 's

['shadow', 'minimum', 'bsc', 'arduino', 'lte', 'metro', '1c', 'pcs', '∙', 'tilt', 'wave', 'gsm', 'radio', 'simulate', 'wcdma', 'channel', 'carrier', 'mock', 'umts', 'cellular', 'd.c.', 'drop', 'coverage', '\xad', 'antenna', 'surround', 'cdma', 'preliminary', 'suburb', 'downlink', 'stations', 'station', 'poseidon', 'alarm', 'ofdm', 'interference', 'microwave', 'cell', 'path', 'downtown', 'ericsson', 'simulation', 'wimax', '3-sectored', 'rf', 'block', 'd.c', 'nearby']

Cluster: 35
['fast', 'pace']

Cluster: 36
['format', 'edi', 'hl7', 'claim', 'dental', '835', 'mandate', 'x12', 'ansi', '270/271', 'institutional', 'crosswalk', 'p', 'reimbursement', '4010', 'icd', '834', '837', '276/277', 'hipaa', 'eligibility', 'translation', '5010']

Cluster: 37
['cybersecurity', 'directive', 'trusted', 'fips', 'remediate', 'prevention', 'customs', 'authority', 'unauthorized', 'ciso', 'privacy', 'poa&m', 'isso', 'border', 'pta', 'nist', 'guideline', 'artifact', 'disa', 'following', 'poa&ms', 'st&e', 'ssp

['lockheed', 'martin', 'pae']

Cluster: 66
['dns']

Cluster: 67
['ltd', 'pvt']

Cluster: 68
['schemas', 'odi', 'powerexchange', 'join', 'query', 'idms', 'cdm', 'queries', 'interactive', 'exp', 'ddl', 'edw', 'iq', 'schema', 'store', 'c#.net', 'package', 'stored', 'cognos', 'pump', 'ab', 'sql', 'star', 'ssrs', '2.6', 'dw', 'datastage', 'interim', 'templates', 'transformation', 'dictionary', 'target', 'extraction', 'dml', 'rdbms', 'bss', 'stag', 'mapping', 'mart', 'oltp', 'table', 'flat', 'warehousing', 'logical', 'fms', 'pro*c', 'classic', 'procedures', 'powerful', 'amp', 'fetch', 'microstrategy', 'netezza', 'cube', 'unload', 'trigger', 'addm', 'warehouse', 'cleanse', 'macro', 'aginity', 'view', 'power', 'powermart', 'mask', 'informatica', 'bo', 'structured', 'objects', 'sftp', 'mulesoft', 'tsql', 'spy', 'olap', 'triggers', 'packages', 'export', 'teradata', 'embarcadero', 'loading', 'udb', 'pl', 'tm1', 'ods', 'powercenter', 'omni', 'ase', 'nfp', 'db2', 'index', 'sybase', 'clothing', 'com

['mobile', 'chrome', 'cross', 'x', 'mozilla', 'browser', 'mac', 'explorer', 'samsung', 'smoke', 'appium', 'sencha', 'ie', 'usability', 'negative', 'safari', 'localization', 'firefox', 'android', '508', 'interruption', 'gui', 'black', 'injection', 'acceptance', 'edge', 'compatibility', 'vr', 'box', 'ios', 'exploratory', 'compatible', 'regression', 'opera', 'fault', 'ivr', 'boundary', 'msie', '10/8/7', 'sanity']

Cluster: 97
['shoot', 'shooting', 'trouble', 'ticket']

Cluster: 98
['outbound', 'inbound']

Cluster: 99
['topics', 'factory', 'bridges', 'adaptors', 'connections', 'pool', 'aq', 'configured', 'wldf', 'generic', 'jndi', 'connection', 'threads', 'destination', 'authenticator', 'pools', 'topic', 'eis', 'bridge', 'jta', 'alias', 'factories', 'stub', 'administered', 'snmp', 'adapters', 'modules', 'queue', 'concepts', 'partition', 'sources', 'queues', 'jms', '9.2/10', 'foreign', 'pools/', 'xa']

Cluster: 100
['lambda', 'scala', 's3', 'elastic', 'rds', 'node.js']

Cluster: 101
['umd',

In [93]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(list(word2vec.values())[0])

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [94]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np

w2v = dict(zip(res2vec.wv.index2word, res2vec.wv.syn0))

etree_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])
svm_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("SVM", SVC())])


In [95]:
X=[['python','ocaml'],
  ['economics','sociology'],
  ['css','javascript']]
y=['scripting','majors','web']
etree_w2v.fit(X,y)
test_X=[['perl'],['psychology'],['asp.net']]
print(etree_w2v.predict(test_X))


['web' 'web' 'scripting']


# Eric Zhou

In [96]:
#all imports
import re
import pandas
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from fuzzywuzzy.process import dedupe
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from IPython.display import HTML



In [97]:
#import path & variables


#import and tokenize resume
resume_path = r"Yiyang (Eric) Zhou Resume 2017 Fall.txt"
#resume_path = r"/Users/yiyangzhou/Desktop/Zeng_Leah_Resume.txt"
resume_file = open(resume_path).read()
resume_file2 = open(resume_path).read()
resume_file2 = resume_file2.lower()

#import a list of majors 
major_df = pandas.read_excel('majors.xlsx')
major_df.columns
major_file = major_df['Majors'].values
major_lower = [item.lower() for item in major_file]
tokenizer = RegexpTokenizer(r'\w+')
resume_token = tokenizer.tokenize(resume_file) 
resume_token2 = tokenizer.tokenize(resume_file2)
updated_majors1 = []
indexes_majors1 = []
updated_majors2 = []
indexes_majors2 = []
updated_majors3 = []
indexes_majors3 = []
updated_majors4 = []
indexes_majors4 = []
majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4

#import a list of universities
university_df1 = pandas.read_excel('China_University.xlsx')
university_df2 = pandas.read_excel('India_University.xlsx')
university_df3 = pandas.read_excel('US_University.xlsx')
university_file1 = university_df1['Universities'].values
university_file2 = university_df2['Universities'].values
university_file3 = university_df3['Universities'].values
university_lower1 = [item.lower() for item in university_file1]
university_lower2 = [item.lower() for item in university_file2]
university_lower3 = [item.lower() for item in university_file3]
university_combined = university_lower1 + university_lower2 + university_lower3


In [98]:
#extract name 

def extract_first_name(resume):
    name = resume.split('\n', 1)[0]
    first_name = name.split(' ', 1)[0]
    return (first_name)
    print (first_name)
    
def extract_last_name(resume):
    name = resume.split('\n', 1)[0]
    last_name = name.split(' ', 1)[-1]
    return (last_name)
    print (last_name)
    
def extract_name(resume):
    name = extract_first_name(resume_file) + extract_last_name(resume_file)
    print (name)
    
extract_name(resume_file)

Yiyang(Eric)Zhou


In [99]:
#extract phone number

def check_phone_number1(resume):
    resume2 = "".join(c for c in resume if c not in ('!','.','-','(',')',' ','+',))
    result = re.findall(r"\d{10}", resume2)                   
    result = ''.join(result)
    return (result)

def check_phone_number2(resume):
    resume2 = "".join(c for c in resume if c not in ('!','.','-','(',')',' ','+',))
    result = re.findall(r"\d{11}", resume2)                   
    result = ''.join(result)
    result = result[1:11]
    return (result)

def extract_phone_number(resume):
    try:
        return check_phone_number1(resume) 
        print (check_phone_number1(resume))
    except:
        return check_phone_number2(resume)
        print (check_phone_number2(resume))

extract_phone_number(resume_file)

'6125162045'

In [100]:
#scraping tools

r = requests.get('http://www.4icu.org/us/')
soup = BeautifulSoup(r.content, "lxml")
letters = soup.find_all(class_="lead")

university = {}
for element in letters:
    university[element.get_text()] = {}
    
list_keys = [ k for k in university.keys() ]

us_university_list = pd.DataFrame(list_keys)
print (us_university_list.head())

                                     0
0               Hampden-Sydney College
1                   Crossroads College
2               Mount Mercy University
3  Kutztown University of Pennsylvania
4                      Spelman College


In [166]:
#Extract University

def get_bigrams(input):
    n = 2
    result = []
    bigrams = ngrams(input, n)
    for grams in bigrams:
        x = "%s %s" % grams
        result.append(x)
    return (result)    
    print (result)


test1 = get_bigrams(resume_token)
#print (test1)

def get_threegrams(input):
    n = 3
    result = []
    threegrams = ngrams(input, n)
    for grams in threegrams:
        x = "%s %s %s" % grams
        result.append(x)
    return (result)    
    print (result)

test2 = get_threegrams(resume_token)
#print (test2)

def get_fourgrams(input):
    n = 4
    result = []
    fourgrams = ngrams(input, n)
    for grams in fourgrams:
        x = "%s %s %s %s" % grams
        result.append(x)
    return (result)    
    print (result)

test3 = get_fourgrams(resume_token)
#print (test3)

def get_fivegrams(input):
    n = 5
    result = []
    fivegrams = ngrams(input, n)
    for grams in fivegrams:
        x = "%s %s %s %s %s" % grams
        result.append(x)
    return (result)    
    print (result)

test4 = get_fivegrams(resume_token)
#print (test4)

def get_sixgrams(input):
    n = 6
    result = []
    sixgrams = ngrams(input, n)
    for grams in sixgrams:
        x = "%s %s %s %s %s %s" % grams
        result.append(x)
    return (result)    
    print (result)


def get_university(a,b):
    resume_university=[]
    for x in a:
        if x in b:
            resume_university.append(x)
    return (resume_university)
    print (resume_university)

def extract_university(resume_token_lower,university_combined):
    unigram_university = get_university(resume_token_lower, university_combined)  
    bigram_university = get_university(get_bigrams(resume_token_lower), university_combined)
    threegram_university = get_university(get_threegrams(resume_token_lower), university_combined)
    fourgram_university = get_university(get_fourgrams(resume_token_lower), university_combined)
    fivegram_university = get_university(get_fivegrams(resume_token_lower), university_combined)
    sixgram_university = get_university(get_sixgrams(resume_token_lower), university_combined)
    combined_university_extraction = set(bigram_university + threegram_university + fourgram_university + fivegram_university + sixgram_university)
    return combined_university_extraction

combined_university_extraction = extract_university(resume_token2,university_combined)

combined_university_extraction

{'university of maryland', 'university of minnesota'}

In [167]:
#extract GPA:
def extract_GPA(resume):
    result = re.search(r'(GPA|gpa): ?\d.\d{1,}',resume)
    if result:
        result = result.group(0)
    return (result)
    print (result)

extract_GPA(resume_file)

'gpa: 3.4'

In [168]:
#extract majors:

def extract_majors(a,b):
    majors=[]
    for x in a:
        if x in b:
            majors.append(x)
    return (majors)
    print (majors)
    
unigram_major = extract_majors(resume_token2, major_lower)  
bigram_major = extract_majors(get_bigrams(resume_token2), major_lower)
threegram_major = extract_majors(get_threegrams(resume_token2), major_lower)
combined_majors_list = unigram_major + bigram_major + threegram_major

major_distinct = []
for i in combined_majors_list:
    if i not in major_distinct:
        major_distinct.append(i)
        
#print (combined_majors_list)
print (major_distinct)



['education', 'business', 'information', 'management', 'accounting', 'finance', 'insurance', 'web', 'development', 'home', 'specialist', 'history', 'analyst', 'marketing', 'english', 'information system', 'accounting and finance']


In [169]:
#get major indexes 
dict = {'Name': 5}
for i, element in enumerate(major_distinct):
   x = resume_file2.find(element)
   dict[element] = x
   

del dict['Name']
print(dict)

{'analyst': 2318, 'english': 3327, 'accounting and finance': 416, 'history': 1245, 'specialist': 1080, 'web': 800, 'development': 896, 'information': 220, 'finance': 431, 'accounting': 416, 'business': 144, 'insurance': 447, 'education': 108, 'information system': 220, 'home': 1051, 'marketing': 2799, 'management': 326}


In [170]:
#get indexes for specific terms

#bachelor
regular_expression = re.compile(r"/BA|BS|Bachelor of Science|Bachelor of Arts|BBA |B/A|Bachelor of Business Administration/", re.IGNORECASE)
bach_major_result = re.search(regular_expression, resume_file)
if bach_major_result:
   bach_major_result = bach_major_result.group()
print (bach_major_result)

if bach_major_result is not None:
    bach_major_index = resume_file.find(bach_major_result)
    print(bach_major_index)
    
#minor
regular_expression_two = re.compile(r"minor|Minor", re.IGNORECASE)
minor_result = re.search(regular_expression_two, resume_file)
if minor_result:
   minor_result = minor_result.group()
print (minor_result)

if minor_result is not None:
    minor_index = resume_file.find(minor_result)
    print(minor_index)

#master
regular_expression_three = re.compile(r"Master|master", re.IGNORECASE)
master_major_result = re.search(regular_expression_three, resume_file)
if master_major_result:
   master_major_result = master_major_result.group()
print (master_major_result)

if master_major_result is not None:
    master_major_index = resume_file.find(master_major_result)
    print(master_major_index)
    
#university
regular_expression_four = re.compile(r"university", re.IGNORECASE)
university_major_result = re.search(regular_expression_four, resume_file)
if university_major_result:
   university_major_result = university_major_result.group()
print (university_major_result)

if university_major_result is not None:
    university_major_index = resume_file.find(university_major_result)
    print(university_major_index)

bachelor of science
173
minor
244
master
267
university
115


In [171]:
upper_bound1 = bach_major_index+100
for k, v in dict.items():
   if (bach_major_index < v < upper_bound1):
       updated_majors1.append(k)
       indexes_majors1.append(v)
print(updated_majors1)
print(indexes_majors1)

upper_bound2 = master_major_index+100
for k, v in dict.items():
   if (master_major_index < v < upper_bound2):
       updated_majors2.append(k)
       indexes_majors2.append(v)
print(updated_majors2)
print(indexes_majors2)

upper_bound3 = university_major_index+100
for k, v in dict.items():
   if (university_major_index < v < upper_bound3):
       updated_majors3.append(k)
       indexes_majors3.append(v)
print(updated_majors3)
print(indexes_majors3)

upper_bound4 = minor_index+100
for k, v in dict.items():
   if (minor_index < v < upper_bound4):
       updated_majors4.append(k)
       indexes_majors4.append(v)
print(updated_majors4)
print(indexes_majors4)

['accounting and management', 'accounting', 'management', 'information', 'information system']
[194, 194, 209, 220, 220]
['information', 'information systems', 'management']
[308, 308, 326]
['accounting and management', 'accounting', 'management', 'business']
[194, 194, 209, 144]
['communications', 'management']
[251, 326]


In [172]:
majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4
majors_minors_final_list = list(dedupe(majors_minors_all))
print (majors_minors_final_list)

['information systems', 'accounting and management', 'communications', 'business']


# Rodrigo Pimenta

# Web Scraping

In [173]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

r = requests.get('https://www.mymajors.com/college-majors/')

soup = BeautifulSoup(r.content, "lxml")

In [174]:
print (soup)

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="MyMajors.com" name="author"/>
<link href="../../../v6/css/?type=css&amp;files=bootstrap.min.css,style-edits.css,jquery.zflickrfeed.css,webfont/climacons-font.css,niceforms.css" media="all" rel="stylesheet" type="text/css"/>
<link href="../../../v6/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
<!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
<script src="https://oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
<![endif]-->
<style>.autocomplete-suggestions { border: 1px solid #999; background: #FFF; overflow: auto;}
.autocomplete-suggestion { padding: 2px 5px; white-space: nowrap; overflow: hidden; }
.autocomplete-selected { backgrou

In [175]:
from IPython.display import HTML
type (soup)

letters1 = soup.find_all(class_="leaf")
letters2 = soup.find_all(class_="expanded top")


print (letters1, letters2)

[<li class="leaf"><a href="http://www.unity.edu/Academic/Majors/WildlifeCareAndEducation/WildlifeCareAndEducation.aspx" target="_blank">Captive Wildlife Care and Education( Unity College in Maine )</a> </li>, <li class="leaf"><a href="http://www.greenville.edu/academics/majors_minors/major_minordetail.dot?id=83732">Contemporary Christian Music ( Greenville College)</a></li>, <li class="leaf"><a href="http://thunder1.cudenver.edu/clas/ISMmajor.html">Individually Structured Major  ( University of Colorado Denver)</a></li>, <li class="leaf"><a href="http://www.greenville.edu/academics/majors_minors/major_minordetail.dot?id=102695">Recording Industry ( Greenville College)</a></li>, <li class="leaf"><a href="http://www.unity.edu/Academic/Majors/SustainableDesignAndTechnology.aspx" target="_blank">Sustainable Design and Technology ( Unity College in Maine )</a></li>, <li class="leaf"><a href="../college-majors/Animal-Training">Animal Training</a></li>, <li class="leaf"><a href="../college-ma

[<li class="expanded top"><a><strong>Featured Majors</strong></a>
<ul class="menu">
<li><a href="aviation-flight-training">Aviation / Flight Training (UND Aerospace) </a></li>
<li class="leaf"><a href="http://www.unity.edu/Academic/Majors/WildlifeCareAndEducation/WildlifeCareAndEducation.aspx" target="_blank">Captive Wildlife Care and Education( Unity College in Maine )</a> </li>
<li class="leaf"><a href="http://www.greenville.edu/academics/majors_minors/major_minordetail.dot?id=83732">Contemporary Christian Music ( Greenville College)</a></li>
<li class="leaf"><a href="http://thunder1.cudenver.edu/clas/ISMmajor.html">Individually Structured Major  ( University of Colorado Denver)</a></li>
<li class="leaf"><a href="http://www.greenville.edu/academics/majors_minors/major_minordetail.dot?id=102695">Recording Industry ( Greenville College)</a></li>
<li class="leaf"><a href="http://www.unity.edu/Academic/Majors/SustainableDesignAndTechnology.aspx" target="_blank">Sustainable Design and Tec

In [176]:
major1 = {}
for element in letters1:
    major1[element.get_text()] = {}
    
major2 = {}
for element in letters2:
    major2[element.get_text()] = {}   
    
print (major1)
print (major2)

{'Clinical, Hospital, and Managed Care Pharmacy': {}, 'Financial Planning and Services': {}, 'Inorganic Chemistry': {}, 'Public Health': {}, 'Auctioneering': {}, 'Molecular Physiology': {}, 'Physics': {}, 'Fashion Merchandising': {}, 'Latin Teacher Education': {}, 'Metallurgical Technology/Technician': {}, 'Theology/Theological Studies': {}, 'Plant Molecular Biology': {}, 'Sustainable Design and Technology ( Unity College in Maine )': {}, 'Archives/Archival Administration': {}, 'Ayurvedic Medicine/Ayurveda': {}, 'Uralic Languages, Literatures, and Linguistics': {}, 'Structural Engineering': {}, 'Quality Control Technology/Technician': {}, 'Geography Teacher Education': {}, 'Conservation Biology': {}, 'Meeting and Event Planning': {}, 'Speech Teacher Education': {}, 'Somatic Bodywork': {}, 'Marketing Research': {}, 'Finance': {}, 'Radar Communications and Systems Technology': {}, 'Space Systems Operations': {}, 'Pathology/Experimental Pathology': {}, 'Marine Biology and Biological Ocean

In [177]:
type(major1)
type(major2)

dict

In [178]:
major1.keys()
major2.keys()

dict_keys(['Business Administration, Management and Operations\n\nBusiness Administration and Management\nCustomer Service Management\nE-Commerce/Electronic Commerce\nLogistics, Materials, and Supply Chain Management\nNon-Profit/Public/Organizational Management\nOffice Management and Supervision\nOperations Management and Supervision\nOrganizational Leadership\nProject Management\nPurchasing, Procurement/Acquisitions and Contracts Management\nResearch and Development Management\nRetail Management\nTransportation/Mobility Management\n\n', 'Human Development, Family Studies, and Related Services\n\nAdult Development and Aging\nChild Care and Support Services Management\nChild Care Provider/Assistant\nChild Development\nDevelopmental Services Worker\nFamily and Community Services\nFamily Systems\nHuman Development and Family Studies\n\n', 'Ground Transportation\n\nConstruction/Heavy Equipment/Earthmoving Equipment Operation\nFlagging and Traffic Control\nMobil Crane Operation/Operator\nRa

In [179]:
list_keys1 = [ k for k in major1.keys() ]
list_keys2 = [ k for k in major2.keys() ]

print(list_keys1)
print(list_keys2)

['Clinical, Hospital, and Managed Care Pharmacy', 'Financial Planning and Services', 'Inorganic Chemistry', 'Public Health', 'Auctioneering', 'Molecular Physiology', 'Physics', 'Fashion Merchandising', 'Latin Teacher Education', 'Metallurgical Technology/Technician', 'Theology/Theological Studies', 'Plant Molecular Biology', 'Sustainable Design and Technology ( Unity College in Maine )', 'Archives/Archival Administration', 'Ayurvedic Medicine/Ayurveda', 'Uralic Languages, Literatures, and Linguistics', 'Structural Engineering', 'Quality Control Technology/Technician', 'Geography Teacher Education', 'Conservation Biology', 'Meeting and Event Planning', 'Speech Teacher Education', 'Somatic Bodywork', 'Marketing Research', 'Finance', 'Radar Communications and Systems Technology', 'Space Systems Operations', 'Pathology/Experimental Pathology', 'Marine Biology and Biological Oceanography', 'General Studies', 'Fire Science/Fire-fighting', 'Youth Services/Administration', 'Fiber, Textile and 

In [180]:
list_combine = list_keys1 + list_keys2

print (list_combine)

['Clinical, Hospital, and Managed Care Pharmacy', 'Financial Planning and Services', 'Inorganic Chemistry', 'Public Health', 'Auctioneering', 'Molecular Physiology', 'Physics', 'Fashion Merchandising', 'Latin Teacher Education', 'Metallurgical Technology/Technician', 'Theology/Theological Studies', 'Plant Molecular Biology', 'Sustainable Design and Technology ( Unity College in Maine )', 'Archives/Archival Administration', 'Ayurvedic Medicine/Ayurveda', 'Uralic Languages, Literatures, and Linguistics', 'Structural Engineering', 'Quality Control Technology/Technician', 'Geography Teacher Education', 'Conservation Biology', 'Meeting and Event Planning', 'Speech Teacher Education', 'Somatic Bodywork', 'Marketing Research', 'Finance', 'Radar Communications and Systems Technology', 'Space Systems Operations', 'Pathology/Experimental Pathology', 'Marine Biology and Biological Oceanography', 'General Studies', 'Fire Science/Fire-fighting', 'Youth Services/Administration', 'Fiber, Textile and 

In [181]:
df = pd.DataFrame(list_combine)
#print (df)

In [182]:
writer = pd.ExcelWriter('majors.xlsx')
df.to_excel(writer, index = False)
writer.save()

In [183]:
#updated excel with split "/"
new_list=[[]]

for i, element in enumerate(list_combine):
   #print (type(element))
   list_combine[i] = element.split('/')
      
for i, element in enumerate(list_combine):
   if (len(list_combine[i])== 2):
       #Given list of 2 strings
       #Get first string in list
       first_str = list_combine[i][0]
       #Get second string in list
       second_str =list_combine[i][1]
       #Cut last word of first string
       updated_str = first_str.rsplit(' ', 1)[0]
       #Combine updated_str with second string
       appended_str = updated_str + ' ' + second_str
       #Add current element from list_combine to new_list
       new_list.append(list_combine[i])
       #Make appended_str a list
       temp_list = [appended_str]
       #Add [appended_str] to new_list
       new_list.append(temp_list)
   else:
       #If the current element is not a list of two strings, just add it to the new_list.
       #No need to add new element we get above.
       new_list.append(list_combine[i])
#print(new_list)


#Flatten new_list
flattened = [val for sublist in new_list for val in sublist]
print(flattened)


['Clinical, Hospital, and Managed Care Pharmacy', 'Financial Planning and Services', 'Inorganic Chemistry', 'Public Health', 'Auctioneering', 'Molecular Physiology', 'Physics', 'Fashion Merchandising', 'Latin Teacher Education', 'Metallurgical Technology', 'Technician', 'Metallurgical Technician', 'Theology', 'Theological Studies', 'Theology Theological Studies', 'Plant Molecular Biology', 'Sustainable Design and Technology ( Unity College in Maine )', 'Archives', 'Archival Administration', 'Archives Archival Administration', 'Ayurvedic Medicine', 'Ayurveda', 'Ayurvedic Ayurveda', 'Uralic Languages, Literatures, and Linguistics', 'Structural Engineering', 'Quality Control Technology', 'Technician', 'Quality Control Technician', 'Geography Teacher Education', 'Conservation Biology', 'Meeting and Event Planning', 'Speech Teacher Education', 'Somatic Bodywork', 'Marketing Research', 'Finance', 'Radar Communications and Systems Technology', 'Space Systems Operations', 'Pathology', 'Experim

In [184]:
flattened = flattened + ['information systems'] + ['information system'] + ['ELECTRONIC TECHNOLOGY']

df2 = pd.DataFrame(flattened)
writer = pd.ExcelWriter('majors2.xlsx')
df2.to_excel(writer, index = False)
writer.save()

# Majors

In [185]:
import pandas

df = pandas.read_excel('majors4.xlsx')

df.columns
major_file = df['Majors'].values
#major_file = flattened
major_lower = [item.lower() for item in major_file]
print (major_lower)

#resume_path = r"/Users/yiyangzhou/Desktop/Zeng_Leah_Resume.txt"
resume_path = os.path.join('testing','example.txt')
#resume_path = r"/Users/yiyangzhou/Desktop/Wang_YilinResume.txt"
#resume_path = r"/Users/yiyangzhou/Desktop/Ryan Holder Resume.txt"
#resume_path = r"/Users/yiyangzhou/Desktop/Shashidhar Karnati.txt"
#resume_path = r"/Users/yiyangzhou/Desktop/all_text1.txt"
resume_file = open(resume_path).read()
resume_file = resume_file.lower()


['captive wildlife care and education( unity college in maine ) ', 'contemporary christian music ( greenville college)', 'individually structured major  ( university of colorado denver)', 'recording industry ( greenville college)', 'sustainable design and technology ( unity college in maine )', 'animal training', 'dog', 'pet', 'animal grooming', 'equestrian', 'equine studies', 'equestrian equine studies', 'taxidermy', 'taxidermist', 'taxidermy taxidermist', 'agricultural and food products processing', 'agribusiness', 'agricultural business operations', 'agribusiness agricultural business operations', 'agricultural business and management', 'agricultural business technology', 'agricultural economics', 'agricultural', 'farm supplies retailing and wholesaling', 'agricultural farm supplies retailing and wholesaling', 'farm', 'farm and ranch management', 'farm farm and ranch management', 'agricultural mechanics and equipment', 'machine technology', 'agricultural mechanics and machine techno

In [186]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams

tokenizer = RegexpTokenizer(r'\w+')
resume_token = tokenizer.tokenize(resume_file) 

def get_bigrams(input):
    n = 2
    result = []
    bigrams = ngrams(input, n)
    for grams in bigrams:
        x = "%s %s" % grams
        result.append(x)
    return (result)    
    print (result)


test1 = get_bigrams(resume_token)
#print (test1)

def get_threegrams(input):
    n = 3
    result = []
    threegrams = ngrams(input, n)
    for grams in threegrams:
        x = "%s %s %s" % grams
        result.append(x)
    return (result)    
    print (result)

test2 = get_threegrams(resume_token)
#print (test2)

def extract_majors(a,b):
    majors=[]
    for x in a:
        if x in b:
            majors.append(x)
    return (majors)
    print (majors)
    

unigram_major = extract_majors(resume_token, major_lower)  
bigram_major = extract_majors(get_bigrams(resume_token), major_lower)
threegram_major = extract_majors(get_threegrams(resume_token), major_lower)


#print (unigram_major)
#print (bigram_major)
#print (threegram_major)


combined_majors_list = unigram_major + bigram_major + threegram_major

major_distinct = []
for i in combined_majors_list:
    if i not in major_distinct:
        major_distinct.append(i)
major_distinct.remove('international')
#print (combined_majors_list)
print (major_distinct)

['farm', 'education', 'accounting', 'management', 'communications', 'information', 'web', 'business', 'service', 'cultural', 'art', 'writing', 'painting', 'history', 'information systems', 'international business', 'accounting and management']


In [187]:
dict = {'Name': 5}
for i, element in enumerate(major_distinct):
   x = resume_file.find(element)
   dict[element] = x
   

del dict['Name']
print(dict)

{'painting': 2473, 'international business': 2620, 'writing': 2446, 'web': 776, 'farm': 31, 'history': 2656, 'accounting and management': 194, 'accounting': 194, 'business': 1026, 'cultural': 1515, 'education': 104, 'information': 308, 'communications': 251, 'art': 2438, 'service': 1228, 'information systems': 308, 'management': 209}


In [188]:
import re
import string

regular_expression = re.compile(r"/BA|BS|Bachelor of Science|Bachelor of Arts|BBA |B/A|Bachelor of Business Administration/", re.IGNORECASE)
bach_major_result = re.search(regular_expression, resume_file)
if bach_major_result:
   bach_major_result = bach_major_result.group()
print (bach_major_result)

if bach_major_result is not None:
    bach_major_index = resume_file.find(bach_major_result)
    print(bach_major_index)

bachelor of science
173


In [189]:
import re
import string

regular_expression_two = re.compile(r"minor|Minor", re.IGNORECASE)
minor_result = re.search(regular_expression_two, resume_file)
if minor_result:
   minor_result = minor_result.group()
print (minor_result)

if minor_result is not None:
    minor_index = resume_file.find(minor_result)
    print(minor_index)

minor
244


In [190]:
import re
import string

regular_expression_three = re.compile(r"Master|master", re.IGNORECASE)
master_major_result = re.search(regular_expression_three, resume_file)
if master_major_result:
   master_major_result = master_major_result.group()
print (master_major_result)


if master_major_result is not None:
    master_major_index = resume_file.find(master_major_result)
    print(master_major_index)

master
267


In [191]:
import re
import string

regular_expression_four = re.compile(r"university", re.IGNORECASE)
university_major_result = re.search(regular_expression_four, resume_file)
if university_major_result:
   university_major_result = university_major_result.group()
print (university_major_result)

if university_major_result is not None:
    university_major_index = resume_file.find(university_major_result)
    print(university_major_index)

university
115


In [192]:
upper_bound = bach_major_index+50

updated_majors1 = []
indexes_majors1 = []
for k, v in dict.items():
   if (bach_major_index < v < upper_bound):
       updated_majors1.append(k)
       indexes_majors1.append(v)

print(updated_majors1)
print(indexes_majors1)

['accounting and management', 'accounting', 'management']
[194, 194, 209]


In [193]:
upper_bound = master_major_index+100

updated_majors2 = []
indexes_majors2 = []
for k, v in dict.items():
   if (master_major_index < v < upper_bound):
       updated_majors2.append(k)
       indexes_majors2.append(v)

print(updated_majors2)
print(indexes_majors2)

['information', 'information systems']
[308, 308]


In [194]:
upper_bound = university_major_index+100

updated_majors3 = []
indexes_majors3 = []
for k, v in dict.items():
   if (university_major_index < v < upper_bound):
       updated_majors3.append(k)
       indexes_majors3.append(v)

print(updated_majors3)
print(indexes_majors3)

['accounting and management', 'accounting', 'management']
[194, 194, 209]


In [195]:
upper_bound = minor_index+50

updated_majors4 = []
indexes_majors4 = []
for k, v in dict.items():
   if (minor_index < v < upper_bound):
       updated_majors4.append(k)
       indexes_majors4.append(v)

print(updated_majors4)
print(indexes_majors4)

['communications']
[251]


In [196]:
#getting distinct resume major
from fuzzywuzzy.process import dedupe

majors_minors_all = updated_majors1 + updated_majors2 + updated_majors3 + updated_majors4

majors_minors_final_list = dedupe(majors_minors_all)
print (majors_minors_final_list)

dict_keys(['information systems', 'accounting and management', 'communications'])


# Categorize Majors with Universities

In [197]:
universities_list = combined_university_extraction

In [198]:
#universities_list -> List of all extracted universities
#majors_minors_final_list -> List of all majors, masters, and minors
#universities_list = []
#majors_minors_final_list = []


universities_ind_list = [] #List of dictionaries
majors_dict = {'Name': 5} #Dictionary

#Create universities_ind_list with all universities matched to their index
for i, element in enumerate(universities_list):
    x = resume_file.find(element)
    temp = {element: x}
    universities_ind_list.append(temp)
    del temp
print(universities_ind_list)


##Order universities_ind_list by index, using bubble sort.
#alist = []
##my_range(start, end, step)
##for passnum in range(len(universities_ind_list)-1,0,-1):
#for passnum in range(len(universities_ind_list),0,-1):
    #for i in range(passnum):
        #alist[i] = list(universities_ind_list[i].values())[0]
        #alist[i+1] = list(universities_ind_list[i+1].values())[0]
        #if alist[i]>alist[i+1]:
            ##temp = alist[i]
            #temp = universities_ind_list[i]
            ##alist[i] = alist[i+1]
            #(universities_ind_list[i]) = (universities_ind_list[i+1])
            ##alist[i+1] = temp
            #(universities_ind_list[i+1]) = temp
#print(universities_ind_list)


#Create majors_dict with all majors matched to their index
for i, element in enumerate(majors_minors_final_list):
    x = resume_file.find(element)
    majors_dict[element] = x
del majors_dict['Name']
print(majors_dict)



[{'university of maryland': 115}, {'university of minnesota': -1}]
{'accounting and management': 194, 'communications': 251, 'information systems': 308}


In [199]:
#Create final_list
final_list = [{'Univ1': ['major1','major2']},{'Univ2': ['major1','major2']}]
print (final_list)
del final_list[0]
del final_list[0]
print (final_list)

[{'Univ1': ['major1', 'major2']}, {'Univ2': ['major1', 'major2']}]
[]


In [200]:
universities_ind_list_len = len(universities_ind_list)
#0 Universities
if (not universities_list) or (universities_ind_list_len == 0):
    print("Universities list is empty")

#1 University
if (universities_ind_list_len == 1):
    #Then associate all majors in 150 char range, with this University 
    first_index = list(universities_ind_list[0].values())[0]
    
    #Get list of majors within the range
    first_majors = []
    lower_bound = first_index
    upper_bound = first_index+150
    for k, v in majors_dict.items():
        if (lower_bound < v < upper_bound):
            first_majors.append(k)
            
    #Associate list with single university in final_list
    first_university = list(universities_ind_list[0].keys())[0]
    #temp = {'Univ1': ['major1','major2']}
    #del temp['Univ1']
    #temp[first_university] = first_majors
    temp = {first_university: first_majors}
    final_list.append(temp)
    print(final_list)



In [201]:
#2 Universities
if (universities_ind_list_len == 2):
    #Then associate all majors in between the two Universities with the first University.
    #And associate all majors in 150 char range of the second University with the second University.
    first_index = list(universities_ind_list[0].values())[0]
    second_index = list(universities_ind_list[1].values())[0]
    len_until_next = (second_index - first_index)
    
    first_majors = []
    second_majors = []
    
    #Get list of majors within the two Universities
    lower_bound = first_index
    upper_bound = first_index+len_until_next
    for k, v in majors_dict.items():
        if (lower_bound < v < upper_bound):
            first_majors.append(k)
    
    #Get list of majors within 150 char range of the second University.
    lower_bound = second_index
    upper_bound = second_index+150
    for k, v in majors_dict.items():
        if (lower_bound < v < upper_bound):
            second_majors.append(k)
            
    #Associate lists with Universities
    first_university = list(universities_ind_list[0].keys())[0]
    second_university = list(universities_ind_list[1].keys())[0]
    temp1 = {first_university: first_majors}
    temp2 = {second_university: second_majors}
    final_list.append(temp1)
    final_list.append(temp2)
    print(final_list)



[{'university of maryland': []}, {'university of minnesota': []}]


In [202]:
#3 or more Universities
#Iterate through universities_ind_list
if (universities_ind_list_len > 2):
    for i in range(0, (universities_ind_list_len-2)):
        first_index = list(universities_ind_list[i].values())[0]
        second_index = list(universities_ind_list[i+1].values())[0]
        len_until_next = (second_index - first_index)

        first_majors = []
        second_majors = []

        #Get list of majors within the two Universities
        lower_bound = first_index
        upper_bound = first_index+len_until_next
        for k, v in majors_dict.items():
            if (lower_bound < v < upper_bound):
                first_majors.append(k)

        if ((i+1) == (universities_ind_list_len-1)):
        #if (i == (universities_dict_len-2))
            #Get majors of last University on the list
            lower_bound = second_index
            upper_bound = second_index+150
            for k, v in majors_dict.items():
                if (lower_bound < v < upper_bound):
                    second_majors.append(k)
        #Else continue loop until end
        
        #Associate lists with Universities.
        first_university = list(universities_ind_list[i].keys())[0]
        temp1 = {first_university: first_majors}
        final_list.append(temp1)
        if (second_majors):
        #If second_majors list is not empty, add it to final_list with its University.
            second_university = list(universities_ind_list[i+1].keys())[0]
            temp2 = {second_university, second_majors}
            final_list.append(temp2)
    print(final_list)

In [203]:
print("Bachelors:", updated_majors1)
print("Masters:", updated_majors2)
print("Minors:", updated_majors4)
print("Universities: ",universities_list)
print("Majors and Universities Categorized: ",final_list)

Bachelors: ['accounting and management', 'accounting', 'management']
Masters: ['information', 'information systems']
Minors: ['communications']
Universities:  {'university of maryland', 'university of minnesota'}
Majors and Universities Categorized:  [{'university of maryland': []}, {'university of minnesota': []}]


# Henry Dao

# 1. Extract Address

In [204]:
import re
import usaddress

#Extract the address from resume file 
def extract_address (text):
    text = text.replace('\n', ' ')
    regex = re.compile(r"[0-9]+ .*[.,-]? .*[.,-]? ([A-Z]{2}|\w+)[.,-]? [0-9]{5}(-[0-9]{4})?")
    result = re.search(regex, text)
    if result:
        result = result.group()
    return result

#Parse the components
def parse_address(result):
    address = usaddress.tag(result)
    return address


# ========Testing========

In [205]:

text = []
text.append('''Hi, Mr. Sam D. Richards lives here, 44 West 22nd Street, New York, NY 12345-4567. 
Can you contact him now? If you need any help, call me on 123 456 7891''')
text.append(''' ABEBAW AYELE
6040 14th St NW Washington DC 20011
202-629-7212 	abex72@gmail.com''')
text.append('''Amanda Yu
    	                       9700 Skyhill Way· Rockville· MD 20850·301-502-8705·yubo0107@hotmail.com''')
text.append('''Miguel Lorenzo M. Aviles
1644 New Windsor Ct
Crofton, Maryland 21114
(703) 501-1932
maviles@umd.edu
''')
text.append('''Alexander Berger
3711 Campus Drive
College Park, MD 20742
240-338-2206
alexfberger@gmail.com
Objective
I am seeking an entry-level position where I can use my design and software skills to provide better and more intuitive application design for customers and for my team.
''')

In [206]:
#Extracting Address
address=[]
for i in range(len(text)):
    address.append(extract_address(text[i])) 
    print(address[i])

44 West 22nd Street, New York, NY 12345-4567
6040 14th St NW Washington DC 20011
9700 Skyhill Way· Rockville· MD 20850
1644 New Windsor Ct Crofton, Maryland 21114
3711 Campus Drive College Park, MD 20742


In [207]:
#Parse the component
address_components = []
for i in range(len(text)):
    address_components.append(parse_address(address[i]))
    print("Person {0}: {1}".format(i+1, parse_address(address[i])))

Person 1: (OrderedDict([('AddressNumber', '44'), ('StreetNamePreDirectional', 'West'), ('StreetName', '22nd'), ('StreetNamePostType', 'Street'), ('PlaceName', 'New York'), ('StateName', 'NY'), ('ZipCode', '12345-4567')]), 'Street Address')
Person 2: (OrderedDict([('AddressNumber', '6040'), ('StreetName', '14th'), ('StreetNamePostType', 'St'), ('StreetNamePostDirectional', 'NW'), ('PlaceName', 'Washington'), ('StateName', 'DC'), ('ZipCode', '20011')]), 'Street Address')
Person 3: (OrderedDict([('AddressNumber', '9700'), ('StreetName', 'Skyhill'), ('StreetNamePostType', 'Way·'), ('PlaceName', 'Rockville·'), ('StateName', 'MD'), ('ZipCode', '20850')]), 'Street Address')
Person 4: (OrderedDict([('AddressNumber', '1644'), ('StreetName', 'New Windsor'), ('StreetNamePostType', 'Ct'), ('PlaceName', 'Crofton'), ('StateName', 'Maryland'), ('ZipCode', '21114')]), 'Street Address')
Person 5: (OrderedDict([('AddressNumber', '3711'), ('StreetName', 'Campus'), ('StreetNamePostType', 'Drive'), ('Place

In [208]:
#Print the result
for i in range(len(address_components)):
    person = list(address_components[i][0].items())
    Address=''
    for j, item in enumerate(person):
        if person[j][0] == 'PlaceName':
            Placename = person[j][1]
        elif person[j][0] == 'StateName':
            State = person[j][1]
        elif person[j][0] == 'ZipCode':
            ZipCode = person[j][1]
        else:
            Address += person[j][1] + ' '

    print('\nPerson {}'.format(i+1))
    print("Address: {}".format(Address))
    print("City: {}".format(Placename))
    print("State: {}".format(State))
    print("ZipCode: {}".format(ZipCode))


Person 1
Address: 44 West 22nd Street 
City: New York
State: NY
ZipCode: 12345-4567

Person 2
Address: 6040 14th St NW 
City: Washington
State: DC
ZipCode: 20011

Person 3
Address: 9700 Skyhill Way· 
City: Rockville·
State: MD
ZipCode: 20850

Person 4
Address: 1644 New Windsor Ct 
City: Crofton
State: Maryland
ZipCode: 21114

Person 5
Address: 3711 Campus Drive 
City: College Park
State: MD
ZipCode: 20742


# Extract Skills 

In [209]:
import nltk
import pandas as pd
import os
import codecs

data = pd.read_excel("Skills.xlsx", header=0)
skill_list = list(data['Skill Names'])
skill_list = set(skill_list)
skill_list= [skill.lower() for skill in skill_list]
sorted(skill_list[0:10])

['deployment of cloud services',
 'fda 21 cfr part 11',
 'glimmerhmm',
 'hp load runner',
 'ibm db2',
 'programmers',
 'rescue diving',
 'riverbed modeler',
 'sqoop',
 'webdav']

In [210]:
import docx2txt
filename ='all_text1.txt'
trained_resume_path = os.path.join('Trained Resumes', filename)

In [211]:
#resume_text = docx2txt.process(test_resume_path)
resume_text = open(trained_resume_path, 'r', encoding='utf_8').read()

In [212]:
from nltk.corpus import stopwords

special_characters = ['!','#', '$', '%','&','*','-', '/', '=','?',
                      '^','.','_','`', '{', '|', '}','~', "'", ',', '(',')', ':', '•', '§' ]

In [213]:
# Processing text 

def resume_processing (resume_text):
    #tokenize sentences
    resume_sents = nltk.sent_tokenize(resume_text)

    #tokenize words
    resume_words = [nltk.word_tokenize(sent) for sent in resume_sents]
    
    #remove stopwords and special characters
    processed_resume=[]
    for sentence in resume_words:
        sent = [w.lower() for w in sentence 
                          if w.lower() not in stopwords.words('english') and w.lower() not in special_characters]
        processed_resume.append(sent)
    
    return processed_resume

In [214]:
unigram_resume = resume_processing(resume_text)
unigram_resume

[['jee',
  'kang',
  '3711',
  'campus',
  'drive',
  'apt',
  '139',
  'college',
  'park',
  'md',
  '4439447487',
  'jwk1151',
  '@',
  'gmail.com',
  'education',
  'university',
  'maryland',
  'college',
  'park',
  'dual',
  'degree',
  'cumulative',
  'gpa',
  '3.7',
  'anticipated',
  'fall',
  '2018',
  'b.s'],
 ['physiology', 'neurobiology', 'b.s'],
 ['computer',
  'science',
  'relevant',
  'courses',
  'object',
  'oriented',
  'programming',
  'ii',
  'introduction',
  'computer',
  'systems',
  'discrete',
  'structures',
  'innovators',
  'think'],
 ['languages',
  'technologies',
  'languages',
  'java',
  'c',
  'programs',
  'eclipse',
  'android',
  'studio',
  'matlab',
  'latex',
  'labchart',
  'projects',
  'projects',
  'found',
  'github',
  'github.com/jeekang',
  'turnt',
  'tower',
  '2016',
  'independently',
  'developed',
  'android',
  'app',
  'augment',
  'block',
  'tower',
  'game',
  'jenga'],
 ['app', 'features', 'attractive', 'user-friendly', 'in

In [215]:
from gensim.models import Phrases

#Create bigram model
bigram_model_path = 'bigram_model'

bigram_model = Phrases(unigram_resume)
bigram_model.save(bigram_model_path)

In [216]:
# Create bigram words
def create_bigram (unigram_resume):
    bigram_model = Phrases.load(bigram_model_path)
    bigram_resume = [bigram_model[sentence] for sentence in unigram_resume]
    return bigram_resume

In [217]:
bigram_resume = create_bigram(unigram_resume)



In [218]:
#Create trigram model 
trigram_model_path = 'trigram_model'

trigram_model = Phrases(bigram_resume)
trigram_model.save(trigram_model_path)

In [219]:
# Create trigram words
def create_trigram (bigram_resume):
    trigram_model = Phrases.load(trigram_model_path)
    trigram_resume = [trigram_model[sentence] for sentence in bigram_resume]
    return trigram_resume

In [220]:
trigram_resume = create_trigram(bigram_resume)
trigram_resume



[['jee',
  'kang',
  '3711',
  'campus',
  'drive',
  'apt',
  '139',
  'college_park_md',
  '4439447487',
  'jwk1151',
  '@_gmail.com_education_university',
  'maryland_college_park',
  'dual',
  'degree',
  'cumulative_gpa',
  '3.7',
  'anticipated',
  'fall',
  '2018',
  'b.s'],
 ['physiology', 'neurobiology', 'b.s'],
 ['computer_science',
  'relevant_courses',
  'object_oriented_programming',
  'ii',
  'introduction_computer_systems',
  'discrete_structures',
  'innovators',
  'think'],
 ['languages',
  'technologies',
  'languages_java',
  'c',
  'programs',
  'eclipse',
  'android_studio',
  'matlab',
  'latex',
  'labchart',
  'projects',
  'projects',
  'found',
  'github',
  'github.com/jeekang',
  'turnt',
  'tower',
  '2016',
  'independently',
  'developed',
  'android',
  'app',
  'augment',
  'block',
  'tower',
  'game',
  'jenga'],
 ['app', 'features', 'attractive', 'user-friendly', 'interface'],
 ['language',
  'java',
  'android',
  'java-based',
  'dna',
  'template'

In [221]:
import re

#Normalize bigram/trigram words 
def normalize_words (trigram_resume):
    for sentence in trigram_resume:
        for i, word in enumerate(sentence):   
            if len(re.findall(r'\w+\_\w+', word))!= 0:
                sentence[i] = re.sub('_', ' ', word)
    return trigram_resume

In [222]:
normalized_resume = normalize_words(trigram_resume)

In [223]:
#label skills in the resume
def labeled_word (sentence):
    labels=[]
    for word in sentence:
        if word in skill_list:
            labels.append((word, 'skill'))
        else:
            labels.append((word, 'not skill'))
    return labels

In [224]:
labeled_words=[labeled_word(sentence) for sentence in normalized_resume]
labeled_words

[[('jee', 'not skill'),
  ('kang', 'not skill'),
  ('3711', 'not skill'),
  ('campus', 'not skill'),
  ('drive', 'not skill'),
  ('apt', 'not skill'),
  ('139', 'not skill'),
  ('college park md', 'not skill'),
  ('4439447487', 'not skill'),
  ('jwk1151', 'not skill'),
  ('@ gmail.com education university', 'not skill'),
  ('maryland college park', 'not skill'),
  ('dual', 'not skill'),
  ('degree', 'not skill'),
  ('cumulative gpa', 'not skill'),
  ('3.7', 'not skill'),
  ('anticipated', 'not skill'),
  ('fall', 'not skill'),
  ('2018', 'not skill'),
  ('b.s', 'not skill')],
 [('physiology', 'not skill'),
  ('neurobiology', 'not skill'),
  ('b.s', 'not skill')],
 [('computer science', 'not skill'),
  ('relevant courses', 'not skill'),
  ('object oriented programming', 'skill'),
  ('ii', 'not skill'),
  ('introduction computer systems', 'not skill'),
  ('discrete structures', 'not skill'),
  ('innovators', 'not skill'),
  ('think', 'not skill')],
 [('languages', 'not skill'),
  ('techn

In [246]:
def similar_prob(word):
    count = 0
    terms = get_related_terms(word,25)
    for w in terms:
        if w in skill_list:
            count+=1
    return count/25

In [255]:
def in_skill_cluster(word):
    if word in skill_list:
        return True
    return False

In [256]:
def get_related_terms(token,topn):
    arr =[]
    for word,similar in res2vec.wv.most_similar(positive = [token],topn=topn):
        #print(word,':',round(similar,3))
        arr.append(word)
    return arr

In [257]:
#extract featurres of skills 
def extract_features (sentence, i):
    features={}
    #first feature: evaluate if that word is in skill list
    features["({})in_skill_list".format(sentence[i])]= (sentence[i] in skill_list)
    
    if sentence[i] in res2vec.wv.vocab:
        features["probality_of_similar_words_skills"] = similar_prob(sentence[i])
        features["in_skill_cluster"] = in_skill_cluster(sentence[i])
    
    #if the word is in begining of the sentence, return <Start> for prev_word
    if i==0 and len(sentence)-1 != 0:
        features["prev_word_in_skill_list"]= '<Start>'
        features["next_word_in_skill_list"]= (sentence[i+1] in skill_list)
    
    #if the word is in begining of the sentence, return <End> for next_word
    elif i == len(sentence)-1 and  i != 0:
        features["prev_word_in_skill_list"]= (sentence[i-1] in skill_list)
        features["next_word_in_skill_list"]= '<End>'
    
    #if the sentence has only 1 word, return False for both prev_word and next_word
    elif i==0 and len(sentence)-1 == 0:
        features["prev_word_in_skill_list"]= False
        features["next_word_in_skill_list"]= False
    else:
        features["prev_word_in_skill_list"]= (sentence[i-1] in skill_list)
        features["next_word_in_skill_list"]= (sentence[i+1] in skill_list)
    return features

In [258]:
%%time
featuresets=[]
for labeled_sent in labeled_words:
    unlabeled_sent = [word[0] for word in labeled_sent]
    for i, (w, label) in enumerate(labeled_sent):
        featuresets.append((extract_features(unlabeled_sent, i), label)) 

CPU times: user 5min 20s, sys: 604 ms, total: 5min 20s
Wall time: 8min 53s


In [259]:
#Save the features in a file
featuresets_file = 'features_file.txt'
file = open(featuresets_file, 'w', encoding='utf_8')
file.write('\n'.join('%s %s' % item for item in featuresets ))

42642857

In [260]:
size = int(len(featuresets)*0.1)
train_set = featuresets[size:]
test_set = featuresets[:size]

In [261]:
train_set
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [262]:
nltk.classify.accuracy(classifier, test_set)

0.9984589418067072

In [263]:
classifier.show_most_informative_features(15)

Most Informative Features
probality_of_similar_words_skills = 0.52            skill : not sk =     75.3 : 1.0
probality_of_similar_words_skills = 0.56            skill : not sk =     44.2 : 1.0
probality_of_similar_words_skills = 0.6             skill : not sk =     17.5 : 1.0
probality_of_similar_words_skills = 0.48            skill : not sk =     17.1 : 1.0
probality_of_similar_words_skills = 0.28            skill : not sk =      8.2 : 1.0
probality_of_similar_words_skills = 0.24            skill : not sk =      8.2 : 1.0
probality_of_similar_words_skills = 0.44            skill : not sk =      8.0 : 1.0
probality_of_similar_words_skills = 0.36            skill : not sk =      7.5 : 1.0
probality_of_similar_words_skills = 0.4             skill : not sk =      6.7 : 1.0
probality_of_similar_words_skills = 0.2             skill : not sk =      4.7 : 1.0
probality_of_similar_words_skills = 0.64            skill : not sk =      4.0 : 1.0
probality_of_similar_words_skills = 0.32          

# =========Testing=========

In [264]:
test_file =['sampleMechanical Engineering Resume.txt', 'desktop support engineer resume.txt','Henrydao -Resume.txt',
            'Electrical Engineering Student Resume.txt','Technical Consultant Resume.txt','Technical Manager Resume.txt',
           'Technical Support Resume.txt', 'Technical Writer Resume.txt', 'Yiyang (Eric) Zhou Resume 2017 Spring.txt']

def extract_skills(normalized_test_res, resume_number, filename):
    skills =[]
    for sent in normalized_test_res:
        for (i,_) in enumerate(sent):
            if classifier.classify(extract_features(sent, i))=='skill':
                skills.append(sent[i])
                extracted_skills = set(skills)
    print('\nResume {}:{} ({} skills)\n'.format(resume_number+1,filename, len(extracted_skills)), extracted_skills)
    
for i, filename in enumerate(test_file):
    test_resume_path= os.path.join('Test Resumes', filename)

    test_resume = open(test_resume_path, 'r').read()
    unigram_test_res = resume_processing(test_resume)
    bigram_test_res = create_bigram(unigram_test_res)
    trigram_test_res = create_trigram(bigram_test_res)
    normalized_test_res = normalize_words(trigram_test_res)
    extract_skills(normalized_test_res, i, filename)




Resume 1:sampleMechanical Engineering Resume.txt (16 skills)
 {'programming', 'ms excel', 'fortran', 'technology', 'graphics', 'unix', 'windows', 'design', 'testing', 'c++', 'engineering', 'autocad', 'spring', 'ms word', 'software', 'matlab'}

Resume 2:desktop support engineer resume.txt (18 skills)
 {'technical', 'software', 'email', 'verbal', 'technical support', 'escalations', 'data', 'ethernet', 'routing', 'voip', 'innovative', 'networking', 'installation', 'reporting', 'troubleshooting', 'tcp/ip', 'excel', 'networks'}

Resume 3:Henrydao -Resume.txt (35 skills)
 {'sdlc', 'scrum', 'mac os x', 'design', 'tableau', 'lucidchart', 'data analysis', 'marketing', 'social media', 'precisiontree', 'rational rose', 'entrepreneurship', 'arena', 'business analyst', 'sql', 'uml', 'software', 'advanced excel', 'minitab', 'training', 'palisade', 'technology', 'html css', 'data science', 'project', 'programming', 'stattool', 'data', 'solvertable', 'ms visio', 'r python', 'reporting', 'healthcare',

# 3. Extract Companies

In [265]:
import codecs
import os

filename = 'BrandonThomasResume.txt'
#Open file
def open_file(filename):
    resume = open(filename, 'r', errors='ignore').read()
    return resume

In [266]:
resume = open_file(filename)
print(resume)

﻿Brandon Thomas
5514 First St NW Apt# 203 Washington D.C., 20011
• Mobile: 202-520-1835 • Email: brandonthomas280@gmail.com


Objective: My goal is to become employed with a company in the technology industry where I can utilize my IT & Computer Science skills and gain to continue to learn new skills while enhancing the company’s productivity and reputation.

WORK EXPERIENCE:
Encentric Inc., Fairfax, VA								   	  (April 2016 – June 2016)
Help Desk Technician
• Disconnect work stations and bag them properly for moving. 
• Reconnect workstations at new work-space according to end-user specifications.

Levy Restaurant at Nationals Park, Washington, D.C  						      (April 2015 – Present)
Server											       
• Attend to customers during events, interact and answer questions.
• Set up, serve, and break down all event food tables.

U.S. Department of the Treasury- Internal Revenue Service (IRS), Washington, D.C.			 (June 2012- August 2012)
Student Intern-Management and Staff Assistan

In [267]:
#Import different put of experience headers
data = pd.read_excel("Work Experience.xlsx", header=0)
experience_list = list(data['Example'])
experience_list

['ADDITIONAL EXPERIENCE',
 'Additional Experience',
 'ARMY EXPERIENCE',
 'Army Experience',
 'CAREER RELATED EXPERIENCE',
 'Career Related Experience',
 'EMPLOYMENT HISTORY',
 'Employment History',
 'EXPERIENCE',
 'Experience',
 'FREELANCE',
 'Freelance',
 'FREELANCE EXPERIENCE',
 'Freelance Experience',
 'MILITARY BACKGROUND',
 'Military Background',
 'MILITARY EXPERIENCE',
 'Military Experience',
 'PROFESSIONAL BACKGROUND',
 'Professional Background',
 'PROFESSIONAL EXPERIENCE',
 'Professional Experience',
 'Project',
 'PROJECT',
 'Projects',
 'PROJECTS',
 'Related Experience',
 'RELATED EXPERIENCE',
 'WORK EXPERIENCE',
 'Work Experience',
 'Work History',
 'WORK HISTORY',
 'Professional History',
 'PROFESSIONAL HISTORY']

In [268]:
from fuzzywuzzy.process import dedupe

#Find the experience header
def find_exp_header (resume):
    exp_header_list=[]
    for word in experience_list:
        if resume.find(word) != -1:
            exp_header_list.append(word)
    
    #remove duplicates of experience header
    exp_header = list(dedupe(exp_header_list))
    return exp_header

In [269]:
exp_header = find_exp_header(resume)
exp_header

['WORK EXPERIENCE']

In [270]:
exp_header = (exp_header[0], resume.find(exp_header[0]))
exp_header

('WORK EXPERIENCE', 362)

In [271]:
import re
import itertools

#List of all sections in a typical resume
section_list =['EDUCATION', 'Education', 'Skills', 'SKILLS', 'VOLUNTEER EXPERIENCE', 'Volunteer Experience',
              'Technical Skills', 'TECHNICAL SKILS', 'SUMMARY', 'summary', 'Professional Summary', 'PROFESSIONAL SUMMARY',
              'DEMONSTRATED SKILLS', 'Demonstrated Skills', 'Additional Information', 'ADDITIONAL INFORMATION', 
               'Leadership Experience', 'LEADERSHIP EXPERIENCE', 'REFERENCES', 'References', 
               'Certificates & Trainings', 'CERTIFICATE & TRAININGS', 'TRAINING', 'Training', 'Certificate', 'CERTIFICATE', 
               'RELEVANT COURSES', 'LANGUAGES', 'Relevant Courses', 'Languages', 'LEADERSHIP AND VOLUNTEER EXPERIENCE',
               'Leadership and Volunteer Experience', 'LEADERSHIP & VOLUNTEER EXPERIENCE', 'Leadership & Volunteer Experience',
               'EDUCATION AND TRAINING', 'Education and Training', 'Key Projects', 'KEY PROJECTS', 'RELEVANT ACADEMIC PROJECTS', 
               'Relevant Academic Projects', 'ACADEMIC PROJECTS', 'Academic Projects', 'EXTRACURRICULAR ACTIVITIES', 
               'Extracurricular Activities'
              ]

In [272]:
#Find next section header
def find_next_section (resume):
    #Find all capitalized words
    next_section_upper = re.findall(r'([A-Z]{3,}( [A-Z]+)?( [A-Z]+)?( [A-Z]+)?)', 
                                   resume[(exp_header[1] + len(exp_header[0])+ 1):])
    next_section_upper = list((itertools.chain.from_iterable(next_section_upper)))
    
    #Find all words with the first letter capitalized
    next_section_lower = re.findall(r'([A-Z]{1}\w+( [A-Z]{1}\w+)?( [A-Z]{1}\w+)?( [A-Z]{1}\w+)?)',
                                    resume[(exp_header[1] + len(exp_header[0])+ 1):])
    next_section_lower = list((itertools.chain.from_iterable(next_section_lower)))
    
    #Combine into a list
    next_section_list = next_section_upper + next_section_lower
    
    #if one of the items matches items in section list, that item is the next section header
    next_section=()
    for item in next_section_list:
        if item in section_list and (resume[resume.find(item)+len(item)]=='\n' or resume[resume.find(item)-1]=='\n'):
            next_section = (item, resume.find(item))
            break
    return next_section

In [273]:
next_section = find_next_section(resume)
next_section

('EDUCATION', 1808)

In [274]:
def get_workexp_section(resume):
    if next_section:
        workexp_section = str(resume[(exp_header[1]+ len(exp_header[0])+ 1):next_section[1]])
    else:
        workexp_section = str(resume[(exp_header[1]+ len(exp_header[0])+ 1):])
    return workexp_section

In [275]:
workexp_section = get_workexp_section(resume)
workexp_section = workexp_section.split('\n')
workexp_section

['',
 'Encentric Inc., Fairfax, VA\t\t\t\t\t\t\t\t   \t  (April 2016 – June 2016)',
 'Help Desk Technician',
 '• Disconnect work stations and bag them properly for moving. ',
 '• Reconnect workstations at new work-space according to end-user specifications.',
 '',
 'Levy Restaurant at Nationals Park, Washington, D.C  \t\t\t\t\t\t      (April 2015 – Present)',
 'Server\t\t\t\t\t\t\t\t\t\t\t       ',
 '• Attend to customers during events, interact and answer questions.',
 '• Set up, serve, and break down all event food tables.',
 '',
 'U.S. Department of the Treasury- Internal Revenue Service (IRS), Washington, D.C.\t\t\t (June 2012- August 2012)',
 'Student Intern-Management and Staff Assistant',
 '• Improved the IRS website by providing recommendations on outdated formats and trends.',
 '• Recognized by team for my dedication for the duration of my internship.',
 '• Keep official records for meetings.',
 '• Sorted, organized, and disposed of documents.',
 '',
 'Paramount Baptist Church

In [276]:
#Remove the detail and get the experience information
def get_exp_info(work_exp):
    company_info=[]
    temp_str=''
    for i, sent in enumerate(work_exp):
        if sent != '':
            #Everything before the bullet will be put into one sentence, for one company
            if not sent.startswith(('•','', u'\uf095', '§', '§')): 
                temp_str += sent + ' '
            else:
                if not work_exp[i-1].startswith(('•','', u'\uf095', '§', '§')):
                    company_info.append(temp_str)
                    temp_str=''
    return company_info

In [277]:
company_info = get_exp_info(workexp_section)
for i, company in enumerate(company_info):
    company = company.replace('\t', '')
    print('\nCompany {}:'.format(i+1), company)


Company 1: Encentric Inc., Fairfax, VA     (April 2016 – June 2016) Help Desk Technician 

Company 2: Levy Restaurant at Nationals Park, Washington, D.C        (April 2015 – Present) Server        

Company 3: U.S. Department of the Treasury- Internal Revenue Service (IRS), Washington, D.C. (June 2012- August 2012) Student Intern-Management and Staff Assistant 

Company 4: Paramount Baptist Church, Washington, D.C.            (Summer 2008, 2009, 2010) Summer Camp Counselor 


In [278]:
import spacy
from nltk.corpus import stopwords
nlp = spacy.load('en')

def extract_exp_info(company_info, filename):
    count = 0
    print(filename)
    for i, sent in enumerate(company_info):
        sent = sent.replace('\t', '')
        parsed_sent = nlp(sent)
        print('\nCompany {}'.format(i+1))
        
        company=''
        location=''
        time=''
        role=''
        for i ,token in enumerate(parsed_sent):
            if token.ent_type_ =='ORG':
                company += ' ' + str(token)
            elif token.ent_type_ =='GPE':
                location += ' ' + str(token)
            elif token.ent_type_ =='DATE' or token.ent_type_ =='TIME':
                time += ' ' + str(token)
            elif token.ent_type_ =='':
                if str(token).isalpha() and str(token) not in stopwords.words('english'):
                    role += ' ' + str(token)
        
        print('Company: {}'.format(company))
        print('Location: {}'.format(location))
        print('Time: {}'.format(time))
        print('Role: {}'.format(role))

In [279]:
extract_exp_info(company_info, filename)

BrandonThomasResume.txt

Company 1
Company:  Encentric Inc. VA
Location:  Fairfax
Time:  April 2016 June 2016
Role:  Help Desk Technician

Company 2
Company:  Levy Restaurant
Location:  Washington D.C
Time:  April 2015
Role:  Nationals Present Server

Company 3
Company:  U.S. Department Internal Revenue Service IRS
Location:  Washington D.C.
Time:  June 2012- August 2012
Role:  Student Intern Management Staff Assistant

Company 4
Company:  Paramount Baptist Church Summer Camp
Location:  Washington D.C.
Time:  Summer 2008 , 2009 , 2010
Role:  Counselor


# John Park

## Labeling all the clusters and training the Naive Bayes model


In [280]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
all_classes = [0,1,2,3]
clf = GaussianNB()

ctemp = np.array([res2vec.wv['issue']])
#cluster 0 - skills
for key,item in word_centroid_map.items():
    if( item == 0 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 1 - skills
ctemp = np.array([res2vec.wv['rule']])
for key,item in word_centroid_map.items():
    if( item == 1 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 2 - skills
ctemp = np.array([res2vec.wv['excel']])
for key,item in word_centroid_map.items():
    if( item == 2 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 3 - Experience
ctemp = np.array([res2vec.wv['lead']])
for key,item in word_centroid_map.items():
    if( item == 3 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 4 - Misc
ctemp = np.array([res2vec.wv['peoplesoft']])
for key,item in word_centroid_map.items():
    if( item == 4 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),2)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 5 - Experience
ctemp = np.array([res2vec.wv['skill']])
for key,item in word_centroid_map.items():
    if( item == 5 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 6 - Skills
ctemp = np.array([res2vec.wv['server']])
for key,item in word_centroid_map.items():
    if( item == 6 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 7 - Experience
ctemp = np.array([res2vec.wv['the']])
for key,item in word_centroid_map.items():
    if( item == 7 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 8 - Experience
ctemp = np.array([res2vec.wv['and']])
for key,item in word_centroid_map.items():
    if( item == 8 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 9 - Experience
ctemp = np.array([res2vec.wv['bi']])
for key,item in word_centroid_map.items():
    if( item == 9 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 10 - Experience
ctemp = np.array([res2vec.wv['employee']])
for key,item in word_centroid_map.items():
    if( item == 10 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 11 - Experience
ctemp = np.array([res2vec.wv['time']])
for key,item in word_centroid_map.items():
    if( item == 11 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 12 - Experience
ctemp = np.array([res2vec.wv['professional']])
for key,item in word_centroid_map.items():
    if( item == 12 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 13 - Experience
ctemp = np.array([res2vec.wv['at']])
for key,item in word_centroid_map.items():
    if( item == 13 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 14 - Experience
ctemp = np.array([res2vec.wv['information']])
for key,item in word_centroid_map.items():
    if( item == 14 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 15 - Education
ctemp = np.array([res2vec.wv['university']])
for key,item in word_centroid_map.items():
    if( item == 15 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),3)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 16 - Misc
ctemp = np.array([res2vec.wv['present']])
for key,item in word_centroid_map.items():
    if( item == 16 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),2)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 17 - Skills
ctemp = np.array([res2vec.wv['java']])
for key,item in word_centroid_map.items():
    if( item == 17 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 18 - Skills
ctemp = np.array([res2vec.wv['java']])
for key,item in word_centroid_map.items():
    if( item == 18 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 19 - Experience
ctemp = np.array([res2vec.wv['employee']])
for key,item in word_centroid_map.items():
    if( item == 19 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 20 - Experience
ctemp = np.array([res2vec.wv['employee']])
for key,item in word_centroid_map.items():
    if( item == 20 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 21 - Skills
ctemp = np.array([res2vec.wv['java']])
for key,item in word_centroid_map.items():
    if( item == 21 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 22 - Skills
ctemp = np.array([res2vec.wv['java']])
for key,item in word_centroid_map.items():
    if( item == 22 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 23 - Experience
ctemp = np.array([res2vec.wv['employee']])
for key,item in word_centroid_map.items():
    if( item == 23 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),1)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 24 - Misc
ctemp = np.array([res2vec.wv['+']])
for key,item in word_centroid_map.items():
    if( item == 24 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),2)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 25 - Skills
ctemp = np.array([res2vec.wv['java']])
for key,item in word_centroid_map.items():
    if( item == 25 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),0)
clf.partial_fit(ctemp,samp,classes=all_classes)

#Cluster 26 - Education
ctemp = np.array([res2vec.wv['human']])
for key,item in word_centroid_map.items():
    if( item == 26 ):
        ctemp=np.vstack([ctemp,res2vec.wv[key]])
samp = np.full(len(ctemp),3)
clf.partial_fit(ctemp,samp,classes=all_classes)

GaussianNB(priors=None)

## Determine category of input word

In [281]:
testWord = 'university'
totest = np.array(res2vec.wv[testWord])
totest = totest.reshape(1,-1)
prediction=clf.predict(totest)
print(prediction)
if prediction[0] == 0:
    print(testWord + " is in SKILLS")
if prediction[0] == 1:
    print(testWord + " is in EXPERIENCE")
if prediction[0] == 2:
    print(testWord + " is in MISC")
if prediction[0] == 3:
    print(testWord + " is in EDUCATION")

[1]
university is in EXPERIENCE


# Organization Extraction

In [283]:
import sys
import codecs
from docx import *
from io import StringIO

location = os.path.join('Resumes','AmandaYuResume.docx')
document = Document(location)
s = ""
for paragraph in document.paragraphs:
    s += str(paragraph.text.encode(errors='ignore'))

# Ashish Aggarwal

In [294]:
# import all headers
import re
import os

In [295]:
# function to extract all URLs
# implemented using regex
def extract_URLs(parsedResume):
    parsedResume = parsedResume.replace('\n', ' ')
    regex = regex = re.compile('(?:(?:https?|ftp|file)://|www\.|ftp\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]', re.IGNORECASE)
    result = re.findall(regex, parsedResume)
    #if result:
        #result = result.group()
    return result

In [296]:
# function to extract LinkedIN Profile
# implemented using regex
def extract_linkedin(parsedResume):
    parsedResume = parsedResume.replace('\n', ' ')
    regex = re.compile(r"https://www.linkedin.com/in/([a-zA-Z]|[0-9]|[-])+/?")
    result = re.search(regex, parsedResume)
    if result:
        result = result.group()
    return result

In [297]:
# TESTING
# path where all resumes are located
test_resume_path = 'Test Resumes'
counter = 0

print("URLs in Test Resumes")
for filename in os.listdir(test_resume_path):
    # print(filename)
    if '.txt' in filename:
        counter = counter + 1
        resume_path= os.path.join('Test Resumes', filename)
        test_resume = open(resume_path, 'r').read()
    
        print("Resume ", (counter), ":")
        print("All URLs => ", extract_URLs(test_resume))
        print("LinkedIn Profiles => ", extract_linkedin(test_resume))

URLs in Test Resumes
Resume  1 :
All URLs =>  []
LinkedIn Profiles =>  None
Resume  2 :
All URLs =>  []
LinkedIn Profiles =>  None
Resume  3 :
All URLs =>  []
LinkedIn Profiles =>  None
Resume  4 :
All URLs =>  ['https://www.linkedin.com/in/jared-ross-15439b83/', 'https://github.com/JaredJRoss/']
LinkedIn Profiles =>  https://www.linkedin.com/in/jared-ross-15439b83/
Resume  5 :
All URLs =>  []
LinkedIn Profiles =>  None
Resume  6 :
All URLs =>  []
LinkedIn Profiles =>  None
Resume  7 :
All URLs =>  ['http://apireferencedocs.codebase.ebay.com', 'http://www.phoons.com/john/classes/coursesTaught.html']
LinkedIn Profiles =>  None
Resume  8 :
All URLs =>  []
LinkedIn Profiles =>  None
Resume  9 :
All URLs =>  []
LinkedIn Profiles =>  None


# Vaibhav Bhilare

In [298]:
import csv
import re

#Email Address
def check_email(string_to_search):
    regular_expression = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,3}", re.IGNORECASE)
    result = re.search(regular_expression, string_to_search)
    if result:
        result = result.group()
    return result
    #except:
     #   result=0
      #  return result

#LinkedIn Address
def check_linkedin(string_to_search):
    regular_expression1 = re.compile(r"https://"
                                    r"[A-Z]{2,3}"
                                    r".linkedin.com/in/"
                                    r"[-_a-z 0-9]{5,30}", re.IGNORECASE)
    result = re.search(regular_expression1, string_to_search)
    try:
        result = result.group()
        return result
    except:
        regular_expression1 = re.compile(r"[A-Z]{2,3}"
                                        r".linkedin.com/in/"
                                        r"[-_a-z 0-9]{5,30}", re.IGNORECASE)
        result = re.search(regular_expression1, string_to_search)
        try:
            result=result.group()
            return result
        except:
            regular_expression1 = re.compile(r"[A-Z]{2,3}"
                                        r".linkedin.com/"
                                        r"[-_a-z 0-9]{5,30}", re.IGNORECASE)
            result = re.search(regular_expression1, string_to_search)
            try:
                result=result.group()
                return result
            except:
                return None

#GitHub Address
def check_GitHub(string_to_search):
    regular_expression = re.compile(r"https://github.com/"
                                    r"[-_A-Z0-9]{5,30}", re.IGNORECASE)
    result = re.search(regular_expression, string_to_search)
    try:
        result = result.group()
        return result
    except:
        return None

#Contact Number
def check_phone_number(string_to_search):
    try:
        regular_expression = re.compile(r"\(?"  # open parenthesis
                                        r"(\d{3})?"  # area code
                                        r"\)?"  # close parenthesis
                                        r"[\s\.-]{0,2}?"  # area code, phone separator
                                        r"(\d{3})"  # 3 digit exchange
                                        r"[\s\.-]{0,2}"  # separator bbetween 3 digit exchange, 4 digit local
                                        r"(\d{4})",  # 4 digit local
                                        re.IGNORECASE)
        result = re.search(regular_expression, string_to_search)
        if result:
            result = result.groups()
            result = "-".join(result)
        return result
    except:
        return None

def main():
#    with open('Resume_Test.txt', 'r',encoding="utf8") as myfile:
    with open('BrandonThomasResume.txt', 'r') as myfile:
        data=myfile.read().replace('\n',' **** ')
    string="https://www.linkedin.com/in/vaibhav-bhilare"
    string_G="https://github.com/paragsal"
    #print(string)
    result=check_email(data)
    result_L=check_linkedin(data)
    result_P=check_phone_number(data)
    result_G=check_GitHub(data)
    print("Email Address:",result)
    print("Contact Number:",result_P)
    print("Linkedin Profile:",result_L)
    print("GitHub Profile:",result_G)
    #print(data)
main()

Email Address: brandonthomas280@gmail.com
Contact Number: 202-520-1835
Linkedin Profile: https://www.linkedin.com/in/jared-ross-15439b83
GitHub Profile: https://github.com/JaredJRoss
