# Sequence to Sequence model to extract summary,objective from resume (input: resume line by line)


## 207 resumes


In [21]:
from __future__ import print_function
import csv,os
import collections
from keras.models import load_model,Sequential
from keras.layers import Input,LSTM,Dense,Activation
from keras.callbacks import ModelCheckpoint
import numpy as np
batch_size=128
epochs=20
latent_dim=128
data_path="/home/santhosh/resumes_folder/custom_annotator/annotator-server/static/files/Data_Tracter_Resumes_in_TXT/csv"

## vectorizing the data

In [22]:
input_resumes=[]
output_summary=[]
input_tokens=collections.Counter()
output_tokens=set()
files=os.listdir(data_path)
files.sort()
count=0
for file in files[:50]:
    with open(data_path+'/'+file,'r') as csv_file:
        reader=csv.reader(csv_file)
        count=0
        for Input_text,output_text in reader:
            Input_text=Input_text.strip().lower()
            Input_text=Input_text
            
            input_resumes.append(Input_text)
            output_summary.append(output_text)
            
            for word in Input_text.split():
                if word not in input_tokens:
                    input_tokens[word]+=1
            for word in output_text.split():
                if word not in output_tokens:
                    output_tokens.add(word)
            if count==50:
                break
            count+=1


In [23]:
print(len(input_tokens))
num_encoder_tokens=min(len(input_tokens),4000)
input_tokens=[word for word,count in input_tokens.most_common(num_encoder_tokens-1)]
input_tokens=sorted(list(input_tokens))
output_tokens=sorted(list(output_tokens))
num_decoder_tokens=len(output_tokens)
max_encoder_seq_len=max([len(text.split()) for text in input_resumes])
max_decoder_seq_len=max([len(text.split()) for text in output_summary])

print('Number of samples:',len(input_resumes))
print('number of unique input token:',num_encoder_tokens)
print('number of unique output token:',num_decoder_tokens)
print('Max Sequence length for inputs:',max_encoder_seq_len)
print('Max Sequence length for outputs:',max_decoder_seq_len)


5073
Number of samples: 2530
number of unique input token: 4000
number of unique output token: 2
Max Sequence length for inputs: 26
Max Sequence length for outputs: 1


## defining token2index

In [24]:
input_token2index=dict([(word,i) for i,word in enumerate(input_tokens)])
output_token2index=dict([(word,i) for i,word in enumerate(output_tokens)])
output_index2token=dict([(i,word) for i,word in enumerate(output_tokens)])
input_token2index['UNK']=num_encoder_tokens-1

In [25]:
output_index2token

{0: '0', 1: '1'}

## defing encoder_input,decoder_input and decoder_output

In [26]:
encoder_input_data=np.zeros((len(input_resumes),max_encoder_seq_len,num_encoder_tokens),dtype='float32')
decoder_target_data=np.zeros((len(input_resumes),num_decoder_tokens),dtype='float32')


## creating training dataset

In [27]:
for i,(input_text,target_text) in enumerate(zip(input_resumes,output_summary)):
    for t,word in enumerate(input_text.split()[:max_encoder_seq_len]):
        if word not in input_token2index:
            word="UNK"
        encoder_input_data[i,t,input_token2index[word]]=1
    decoder_target_data[i,output_token2index[target_text]]=1
        

In [28]:
model = Sequential()
model.add(LSTM(latent_dim, return_sequences=False,
input_shape=(max_encoder_seq_len,num_encoder_tokens,)))
model.add(Dense(num_decoder_tokens))
model.add(Activation("softmax"))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               2114048   
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_3 (Activation)    (None, 2)                 0         
Total params: 2,114,306
Trainable params: 2,114,306
Non-trainable params: 0
_________________________________________________________________


## function to test model

## Run training

In [None]:

iteration=0

# load weights
print('loading the weights')
model=load_model('resume_line_classification.h5')

# estimate accuracy on whole dataset using loaded weights
scores = model.evaluate(encoder_input_data,decoder_target_data ,verbose=0)
print("%s: %.2f%%\n\n" % (model.metrics_names[1], scores[1]*100))
print("Testing Samples\n"+"-"*50)
print("-"*50)
index=int(np.random.randint(len(input_resumes)/40*0.8)*40)
test_input=""
test_output=""
for i in range(50):
    encoded_input_sequence=encoder_input_data[index: index + 1]
    output_sequence=model.predict(encoded_input_sequence, verbose=0)[0]
    output_sequence = output_index2token[np.argmax(output_sequence)]
    test_input+=input_resumes[index]+'\n'
    if output_sequence=="1":
        output_sequence=input_resumes[index]+'\n'
    else:
        output_sequence=''
    test_output+=output_sequence
    index+=1
print("-"*50)
print(test_input)
print("---OUTPUT-----")
print(test_output)
print(" "*50+"-"*50)

iteration_file="/home/santhosh/resumes_folder/keras/extract_summary_and_objective/iteration_resume_line_classification.txt"
try:
    file=open(iteration_file,'r')
    last_line=file.read().split('\n')[-2]
    print('file_data,',last_line)
    iteration=int(last_line.split(':')[1])
    #print(iteration)
    file.close()
    
except:
    print('no file exist')

# checkpoint
filepath="resume_line_classification_checkpoints.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

while True:
    print('Iteration:',iteration+1)
    #training
    model.fit(encoder_input_data,decoder_target_data ,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,callbacks=callbacks_list)
    
    #prepare sample_data to test 5 samples:
    print("-"*50)
    index=int(np.random.randint(len(input_resumes)/40*0.8)*40)
    test_input=""
    test_output=""
    for i in range(50):
        
        encoded_input_sequence=encoder_input_data[index: index + 1]
        output_sequence=model.predict(encoded_input_sequence, verbose=0)[0]
        output_sequence = output_index2token[np.argmax(output_sequence)]
        test_input+=input_resumes[index]+'\n'
        if output_sequence=="1":
            output_sequence=input_resumes[index]+'\n'
        else:
            output_sequence=''
        test_output+=output_sequence
        index+=1
    print("-"*50)
    print(test_input)
    print("---OUTPUT-----")
    print(test_output)
    print(" "*50+"-"*50)
        
        
        
    # Save model
    file=open(iteration_file,'a')
    file.write('iteration:'+str(iteration+1)+'\n')
    file.close()
    iteration+=1
    model.save('resume_line_classification.h5')

loading the weights
acc: 96.13%


Testing Samples
--------------------------------------------------
--------------------------------------------------
--------------------------------------------------
framework : gulp, bootstrap
css preprocessor : sass , less
responsive design : desktop and mobile devices (using media queries)
design tool : adobe photoshop
i n t e r n s h i p
worked as intern at maxheap technology (commonfloor.com), bangalore from march 07, 2012
to may 31,2012.
project: to build microsite and make it live on production everyday.
p r o j e c t d e t a i l s
worked on various projects for commonfloor, near about all pages. some of the latest projects are as
follows:
latest destop projects are :
home page
link: http://www.commonfloor.com
buy/rent page

abhishek dangwal
vignana nagar, bangalore
behind hal police station
dangwal.abhishek@gmail.com
ph: +91 8867210018
objective
to join an organization of good repute that will recognize and utilize my skills fully and offer 