-
Notifications
You must be signed in to change notification settings - Fork 491
/
linkedin__cluster_contacts_by_title.py
98 lines (77 loc) · 3.23 KB
/
linkedin__cluster_contacts_by_title.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
import sys
import csv
from nltk.metrics.distance import masi_distance
CSV_FILE = sys.argv[1]
DISTANCE_THRESHOLD = 0.34
DISTANCE = masi_distance
def cluster_contacts_by_title(csv_file):
transforms = [
('Sr.', 'Senior'),
('Sr', 'Senior'),
('Jr.', 'Junior'),
('Jr', 'Junior'),
('CEO', 'Chief Executive Officer'),
('COO', 'Chief Operating Officer'),
('CTO', 'Chief Technology Officer'),
('CFO', 'Chief Finance Officer'),
('VP', 'Vice President'),
]
separators = ['/', 'and', '&']
csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
contacts = [row for row in csvReader]
# Normalize and/or replace known abbreviations
# and build up list of common titles
all_titles = []
for i in range(len(contacts)):
if contacts[i]['Job Title'] == '':
contacts[i]['Job Titles'] = ['']
continue
titles = [contacts[i]['Job Title']]
for title in titles:
for seperator in separators:
if title.find(seperator) >= 0:
titles.remove(title)
titles.extend([title.strip() for title in title.split(seperator)
if title.strip() != ''])
for transform in transforms:
titles = [title.replace(*transform) for title in titles]
contacts[i]['Job Titles'] = titles
all_titles.extend(titles)
all_titles = list(set(all_titles))
clusters = {}
for title1 in all_titles:
clusters[title1] = []
for title2 in all_titles:
if title2 in clusters[title1] or clusters.has_key(title2) and title1 \
in clusters[title2]:
continue
distance = DISTANCE(set(title1.split()), set(title2.split()))
if distance < DISTANCE_THRESHOLD:
clusters[title1].append(title2)
# Flatten out clusters
clusters = [clusters[title] for title in clusters if len(clusters[title]) > 1]
# Round up contacts who are in these clusters and group them together
clustered_contacts = {}
for cluster in clusters:
clustered_contacts[tuple(cluster)] = []
for contact in contacts:
for title in contact['Job Titles']:
if title in cluster:
clustered_contacts[tuple(cluster)].append('%s %s.'
% (contact['First Name'], contact['Last Name'][0]))
return clustered_contacts
if __name__ == '__main__':
clustered_contacts = cluster_contacts_by_title(CSV_FILE)
for titles in clustered_contacts:
common_titles_heading = 'Common Titles: ' + ', '.join(titles)
print common_titles_heading
descriptive_terms = set(titles[0].split())
for title in titles:
descriptive_terms.intersection_update(set(title.split()))
descriptive_terms_heading = 'Descriptive Terms: ' \
+ ', '.join(descriptive_terms)
print descriptive_terms_heading
print '-' * max(len(descriptive_terms_heading), len(common_titles_heading))
print '\n'.join(clustered_contacts[titles])
print