# Sequence to Sequence model to extract summary,objective from resume (input: resume line by line)


## 207 resumes


In [100]:
from __future__ import print_function
import csv,os
import collections
from keras.models import Model,load_model
from keras.layers import Input,LSTM,Dense
from keras.callbacks import ModelCheckpoint
import numpy as np

batch_size=90
epochs=25
latent_dim=128
data_path="/home/santhosh/resumes_folder/custom_annotator/annotator-server/static/files/Data_Tracter_Resumes_in_TXT/csv"

## vectorizing the data

In [117]:
input_resumes=[]
output_summary=[]
input_tokens=collections.Counter()
output_tokens=set()
files=os.listdir(data_path)
count=0
line=0;
for file in files[:25]:
    with open(data_path+'/'+file,'r') as csv_file:
        reader=csv.reader(csv_file)
        count=0
        for Input_text,output in reader:
            Input_text=Input_text.strip().lower()
            Input_text=Input_text+' \n'
            if(output=='1'):
                output_text=Input_text+' \n'
            else:
                output_text=""
            
            # We use "<SOL>" as the "start sequence" character
            # for the targets, and "<EOL>" as "end sequence" character.
            output_text='<SOL> '+output_text+' <EOL>'
            input_resumes.append(Input_text)
            output_summary.append(output_text)
            for word in Input_text.split():
                if word not in input_tokens:
                    input_tokens[word]+=1
            for word in output_text.split():
                if word not in output_tokens:
                    output_tokens.add(word)
            if count==40:
                break
            count+=1


In [118]:
print(len(input_tokens))
num_encoder_tokens=min(len(input_tokens),1000)
input_tokens=[word for word,count in input_tokens.most_common(num_encoder_tokens-1)]
input_tokens=sorted(list(input_tokens))
output_tokens=sorted(list(output_tokens))
num_decoder_tokens=len(output_tokens)
max_encoder_seq_len=max([len(text.split()) for text in input_resumes])
max_decoder_seq_len=max([len(text.split()) for text in output_summary])

print('Number of samples:',len(input_resumes))
print('number of unique input token:',num_encoder_tokens)
print('number of unique output token:',num_decoder_tokens)
print('Max Sequence length for inputs:',max_encoder_seq_len)
print('Max Sequence length for outputs:',max_decoder_seq_len)


2611
Number of samples: 1018
number of unique input token: 1000
number of unique output token: 1215
Max Sequence length for inputs: 24
Max Sequence length for outputs: 24


## defining token2index

In [119]:
input_token2index=dict([(word,i) for i,word in enumerate(input_tokens)])
output_token2index=dict([(word,i) for i,word in enumerate(output_tokens)])
input_token2index['UNK']=num_encoder_tokens-1

## defing encoder_input,decoder_input and decoder_output

In [120]:
encoder_input_data=np.zeros((len(input_resumes),max_encoder_seq_len,num_encoder_tokens),dtype='float32')
decoder_input_data=np.zeros((len(input_resumes),max_decoder_seq_len,num_decoder_tokens),dtype='float32')
decoder_target_data=np.zeros((len(input_resumes),max_decoder_seq_len,num_decoder_tokens),dtype='float32')


## creating training dataset

In [121]:
for i,(input_text,target_text) in enumerate(zip(input_resumes,output_summary)):
    for t,word in enumerate(input_text.split()[:max_encoder_seq_len]):
        if word not in input_token2index:
            word="UNK"
        encoder_input_data[i,t,input_token2index[word]]=1
        
    for t,word in enumerate(target_text.split()):
        decoder_input_data[i,t,output_token2index[word]]=1
        # decoder_target_data is ahead of decoder_input_data by one timestep
        if t>0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i,t-1,output_token2index[word]]=1


## Define an input sequence and process it.

In [122]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]


## Set up the decoder, using encoder_states as initial state.

In [123]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

## Define the model that will turn
## `encoder_input_data` & `decoder_input_data` into `decoder_target_data`

In [124]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=["accuracy"])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, None, 1000)   0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, None, 1215)   0                                            
__________________________________________________________________________________________________
lstm_17 (LSTM)                  [(None, 128), (None, 578048      input_33[0][0]                   
__________________________________________________________________________________________________
lstm_18 (LSTM)                  [(None, None, 128),  688128      input_34[0][0]                   
                                                                 lstm_17[0][1]                    
          

## function to test model

In [125]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_word_index = dict(
    (i, word) for word, i in input_token2index.items())
reverse_target_word_index = dict(
    (i, word) for word, i in output_token2index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, output_token2index['<SOL>']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index[sampled_token_index]
        decoded_sentence += ' '+sampled_word

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '<EOL>' or
           len(decoded_sentence.split()) > max_decoder_seq_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]
    if('<EOL>' in decoded_sentence):
        decoded_sentence=" ".join(decoded_sentence.split()[:-1])
    return decoded_sentence


## Run training

In [None]:

iteration=0
"""
# load weights
print('loading the weights')
model=load_model('resume_level.h5')

# estimate accuracy on whole dataset using loaded weights
scores = model.evaluate([encoder_input_data, decoder_input_data], decoder_target_data,verbose=0)
print("%s: %.2f%%\n\n" % (model.metrics_names[1], scores[1]*100))
print("Testing Samples\n"+"-"*50)
for i in range(1):
    index=np.random.randint(len(input_resumes))
    encoded_input_sequence=encoder_input_data[index: index + 1]
    output_sequence=decode_sequence(encoded_input_sequence)
    print("-"*50)
    print(input_resumes[index])
    print(" "*50)
    print("*"*50+"\nOUTPUT"+" "*50)
    print(output_sequence)
    print("-"*50+"\n"+" "*50)
"""
iteration_file="/home/santhosh/resumes_folder/keras/extract_summary_and_objective/iteration_resume_line_level.txt"
try:
    file=open(iteration_file,'r')
    last_line=file.read().split('\n')[-2]
    print('file_data,',last_line)
    iteration=int(last_line.split(':')[1])
    #print(iteration)
    file.close()
    
except:
    print('no file exist')

# checkpoint
filepath="resume_line_level_checkpoints.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

while True:
    print('Iteration:',iteration+1)
    #training
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,callbacks=callbacks_list)
    #prepare sample_data to test 5 samples:
    print("-"*50)
    index=int(np.random.randint(len(input_resumes)/40*0.8))
    test_input=""
    test_output=""
    for i in range(40):
        encoded_input_sequence=(encoder_input_data[index]).reshape((1,max_encoder_seq_len,num_encoder_tokens))
        #print(encoded_input_sequence.shape,max_encoder_seq_len,num_encoder_tokens,max_encoder_seq_len*num_encoder_tokens)
        #print(encoded_input_sequence)
        #.reshape((1,max_encoder_seq_len,num_encoder_tokens))
        output_sequence=decode_sequence(encoded_input_sequence)
        test_input+=input_resumes[index]
        test_output+=output_sequence
        index+=1
    print("-"*50)
    print(test_input)
    print("---OUTPUT-----")
    print(test_output)
    print(" "*50+"-"*50)
        
        
        
    # Save model
    file=open(iteration_file,'a')
    file.write('iteration:'+str(iteration+1)+'\n')
    file.close()
    iteration+=1
    model.save('resume_line_level.h5')

file_data, iteration:6
Iteration: 7
Train on 814 samples, validate on 204 samples
Epoch 1/25


  '. They will not be included '


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
--------------------------------------------------
individuality is recognized by work, and to achieve professional satisfaction by meeting 
higher responsibilities and involving in competent work areas. 
profile summary 
• experienced in oops implementation with javascript and enyo for developing and delivering projects. 
• have good exposure and experience in web application framework development using html5, css3, 
and jquery. 
• experienced in developing projects using mvc techniques. 
• 6 months onsite experience for web application framework support. 
• 2 months onsite experience for reference application (file browser) development. 
• 6 months industrial

Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
--------------------------------------------------
anshu_resume_latest 
anshu agrawal phone no - 7829401605 
email id - anshu099.agrawal@gmail.com 
github id - anshu099 
it professional with over 2.6 years of work experience in web application framework 
development & currently working in lg soft india as a senior software engineer. 
objective: 
seeking a position to utilize my technical and leadership skills and abilities in the 
information technology industry, a job where growth prospects are unlimited and 
individuality is recognized by work, and to achieve professional satisfaction by meeting 
higher responsibilities and involving in competent work areas. 
profile summary 
• experienced in oops implementation with javascript and enyo for developing and delivering projects. 
• have good exposure and experience in web application framework development using html5, css3, 
and jquery. 
•

Iteration: 10
Train on 814 samples, validate on 204 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
--------------------------------------------------
• experienced in oops implementation with javascript and enyo for developing and delivering projects. 
• have good exposure and experience in web application framework development using html5, css3, 
and jquery. 
• experienced in developing projects using mvc techniques. 
• 6 months onsite experience for web application framework support. 
• 2 months onsite experience for reference application (file browser) development. 
• 6 months industrial training in advanced computing. 
• sound knowledge of porting web application in android phone and 

Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
--------------------------------------------------
• sound knowledge of porting web application in android phone and debugging using web inspector. 
• 3 months project experience in java (jsp & servlets) and oracle. 
• worked on technologies like less.js and node.js 
• efficient in debugging node application using node inspector. 
• experienced in creating web application using worker thread which increases performance. 
technical skill details 
programming 
languages 
c, c++ have relevant experience in programming and 
debugging applications on windows and linux 
platform 
java undergone 6 months training in jsp, servlets 
and core java 
scripting 
languages 
javascript have sound knowledge of javascript 
web designing html5, xml, css3, 
json 
expert with relevant exper

Train on 814 samples, validate on 204 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--------------------------------------------------
--------------------------------------------------
• experienced in developing projects using mvc techniques. 
• 6 months onsite experience for web application framework support. 
• 2 months onsite experience for reference application (file browser) development. 
• 6 months industrial training in advanced computing. 
• sound knowledge of porting web application in android phone and debugging using web inspector. 
• 3 months project experience in java (jsp & servlets) and oracle. 
• worked on technologies like less.js and node.js 
• efficient in debugging node application using node inspector. 
• experienc