In [1]:
import spacy
import pickle
import random

In [7]:
train_data = pickle.load(open('train_data.pkl', 'rb'))

In [9]:
 train_data[0][1]

{'entities': [(1749, 1755, 'Companies worked at'),
  (1696, 1702, 'Companies worked at'),
  (1417, 1423, 'Companies worked at'),
  (1356, 1793, 'Skills'),
  (1209, 1215, 'Companies worked at'),
  (1136, 1248, 'Skills'),
  (928, 932, 'Graduation Year'),
  (858, 889, 'College Name'),
  (821, 856, 'Degree'),
  (787, 791, 'Graduation Year'),
  (744, 750, 'Companies worked at'),
  (722, 742, 'Designation'),
  (658, 664, 'Companies worked at'),
  (640, 656, 'Designation'),
  (574, 580, 'Companies worked at'),
  (555, 573, 'Designation'),
  (470, 493, 'Companies worked at'),
  (444, 469, 'Designation'),
  (308, 314, 'Companies worked at'),
  (234, 240, 'Companies worked at'),
  (175, 198, 'Companies worked at'),
  (93, 137, 'Email Address'),
  (39, 48, 'Location'),
  (13, 38, 'Designation'),
  (0, 12, 'Name')]}

## NER With SpaCy 

In [4]:
import warnings

In [5]:
nlp = spacy.blank('en')


def train_model(train_data):
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
        
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    #--------------------------
    
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
   
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        optimizer = nlp.begin_training()
            
        for itn in range(20):
            print('Starting iterations ', str(itn))
            random.shuffle(train_data)
            
            
            losses = {}
            index = 0
            for text, annotations in train_data:
                print(index)
                index = index + 1
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as e:
                    pass
#                     print(text)
                
            print(losses)


In [6]:
train_model(train_data)

Starting iterations  0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
{'ner': 13274.893127516101}
Starting iterations  1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
{'ner': 4024.485801670519}
Starting iterations  12
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
9

In [7]:
nlp.to_disk('nlp_model')

## Model Testing

In [6]:
nlp_model = spacy.load('nlp_model')

In [19]:
text = train_data[1][0]

In [20]:
doc = nlp_model(text)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Harini Komaravelli
DESIGNATION                   - Test Analyst
COMPANIES WORKED AT           - Oracle, Hyderabad
LOCATION                      - Hyderabad
DESIGNATION                   - QA Analyst
COMPANIES WORKED AT           - Oracle
COMPANIES WORKED AT           - Oracle, Hyderabad
COMPANIES WORKED AT           - Infosys Ltd
LOCATION                      - Hyderabad
LOCATION                      - Hyderabad
LOCATION                      - Hyderabad
COMPANIES WORKED AT           - Oracle, Hyderabad
DESIGNATION                   - QA Analyst
COMPANIES WORKED AT           - 6 years
COMPANIES WORKED AT           - Oracle
DEGREE                        - MCA
COLLEGE NAME                  - Osmania University
DEGREE                        - B.Sc. in Computer Science
SKILLS                        - Functional Testing, Blue Prism, Qtp


## CV Parsing from pdf data

In [4]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.18.14-cp38-cp38-win_amd64.whl (5.4 MB)
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.18.14


In [2]:
import sys, fitz

In [8]:
fname = 'Alice Clark CV.pdf'
doc = fitz.open(fname)
text = ""

for page in doc:
    text = text + str(page.getText())

In [9]:
tx = " ".join(text.split('\n'))

In [27]:
tx

'Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, br

In [10]:
doc = nlp_model(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COLLEGE NAME                  - Indian Institute of Technology
SKILLS                        - Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; 

In [11]:
import os

files = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.pdf')]

In [12]:
files

['Alice Clark CV.pdf',
 'Internship Resume.pdf',
 'Sample resume.pdf',
 'Smith Resume.pdf']

In [13]:
detail = {}
for f in files:
    fname = f
    doc = fitz.open(fname)
    text = ""
    Skills ="" 
    for page in doc:
        text = text + str(page.getText())
        tx = " ".join(text.split('\n'))
        
    doc = nlp_model(tx)
    for ent in doc.ents:
        if ent.label_.upper() == 'NAME':
            Name=ent.text
        if ent.label_.upper() == 'SKILLS':
            Skills=Skills+' '+ent.text
            
    detail[Name]=Skills
    

In [14]:
for d in detail:
    print(f'{d:{30}}- {detail[d]}')
    print()
    print('- ' * 60)

Alice Clark                   -  Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; peers  • Supervised junior developers throughout project lifecycle and provided technical assistance

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Yan Liu                       -  Programming Languages: C/C++, Pascal, PowerBuilder, Matlab, TeX/LaTeX.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Aman Deshmukh                 -  Progra

In [7]:
for d in detail:
    print(type(detail[d]))

<class 'str'>
<class 'str'>


In [15]:
for d in sorted(detail, key=lambda d: len(detail[d]), reverse=True):
    print(f'{d:{30}}- {detail[d]}')
    print()
    print('- ' * 60)

Alice Clark                   -  Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; peers  • Supervised junior developers throughout project lifecycle and provided technical assistance

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Aman Deshmukh                 -  Programming in C and C++ Data Structures Html, CSS, JavaScript(intermediate) Complete dedication and commitment towards my work.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 