/
linkedin__analyze_titles.py
59 lines (47 loc) · 1.59 KB
/
linkedin__analyze_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import sys
import nltk
import csv
from prettytable import PrettyTable
CSV_FILE = sys.argv[1]
transforms = [
('Sr.', 'Senior'),
('Sr', 'Senior'),
('Jr.', 'Junior'),
('Jr', 'Junior'),
('CEO', 'Chief Executive Officer'),
('COO', 'Chief Operating Officer'),
('CTO', 'Chief Technology Officer'),
('CFO', 'Chief Finance Officer'),
('VP', 'Vice President'),
]
csvReader = csv.DictReader(open(CSV_FILE), delimiter=',', quotechar='"')
contacts = [row for row in csvReader]
# Read in a list of titles and split apart
# any combined titles like "President/CEO"
# Other variations could be handled as well such
# as "President & CEO", "President and CEO", etc.
titles = []
for contact in contacts:
titles.extend([t.strip() for t in contact['Job Title'].split('/')
if contact['Job Title'].strip() != ''])
# Replace common/known abbreviations
for i in range(len(titles)):
for transform in transforms:
titles[i] = titles[i].replace(*transform)
# Print out a table of titles sorted by frequency
pt = PrettyTable(['Title', 'Freq'])
pt.align['Title'] = 'l'
titles_fdist = nltk.FreqDist(titles)
[pt.add_row([title, freq]) for (title, freq) in titles_fdist.items() if freq > 1]
print(pt)
# Print out a table of tokens sorted by frequency
tokens = []
for title in titles:
tokens.extend([t.strip(',') for t in title.split()])
pt = PrettyTable(['Token', 'Freq'])
pt.align['Token'] = 'l'
tokens_fdist = nltk.FreqDist(tokens)
[pt.add_row([token, freq]) for (token, freq) in tokens_fdist.items() if freq > 1
and len(token) > 2]
print(pt)