In [1]:
import os
import re
import pandas as pd
import urllib
from nltk import SnowballStemmer
from gensim.models import Word2Vec
#import langdetect
import tika
import time
from tika import parser
import pickle

# Raw Articles pdf Data
Convert pdf to text using Tika apache server 

In [2]:
# specify the target data folder
target_dir = 'C:/Thesis/Data/Academy_of_Management_journal'

In [3]:
# prep pdf extraction
pdf_files = []
wrd_files = []

for f in os.listdir(target_dir):
    if f.endswith(".pdf") | f.endswith(".PDF"):
        thispdf = os.path.join(target_dir, f)
        pdf_files = pdf_files + [thispdf]
    if f.endswith(".doc") | f.endswith(".docx") | f.endswith(".DOC") | f.endswith(".DOCX"):
        thiswrd = os.path.join(target_dir, f)
        wrd_files = wrd_files + [thiswrd]

In [4]:
port = 4321 # port to use for Tika server (chosen arbitrarily)
server_str = 'http://localhost:' + str(port)
tika_path  = 'C:\Software\ tika-server-1.23.jar'
tika_run   = 'java -jar' + tika_path + ' --port ' + str(port)
print('Command to start Tika:')
print(tika_run)
print('---')
print('Number of pdf files: ' + str(len(pdf_files)))
print('Number of word files: ' + str(len(wrd_files)))

Command to start Tika:
java -jarC:\Software\ tika-server-1.23.jar --port 4321
---
Number of pdf files: 410
Number of word files: 0


# PDF Metadata

In [5]:
# process the PDF files, store text in memory
tika.TikaClientOnly = True 

pdf_metadata = [parser.from_file(t, server_str, xmlContent=False)["metadata"] for t in (pdf_files + wrd_files)]

In [42]:
# save data metadata
pickle.dump(pdf_metadata, open( "C:/Thesis/Data/save/Human_Resource_Management_journal/metadata/metadata_all.p", "wb" ) )

# pdf content

In [None]:
# process the PDF files, store text in memory
tika.TikaClientOnly = True 

#pdf_text = [parser.from_file(t, server_str, xmlContent=False)["content"] for t in (pdf_files + wrd_files)]

# step-1 Load pdf content

In [1]:
import pickle
#load the text data
pdf_text = pickle.load(open("C:/Thesis/Data/save/Human_Resource_Management_journal/save_alltextData.p", "rb"))


In [15]:
pdf_text[255]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfinders, keepers? attracting, motivating and retaining knowledge workers\n\n\n23human resource management journal, vol 13 no 4, 2003\n\nfinders, keepers? attracting, motivating\nand retaining knowledge workers\n\nfrank m. horwitz, university of cape town, south africa,  \n\nchan teng heng and hesan ahmed quazi, nanyang technological\n\nuniversity, singapore\n\nhuman resource management journal, vol 13 no 4, 2003, pages 23-44\n\nattracting, motivating and retaining knowledge workers have become important in a\n\nknowledge-based and tight labour market, where changing knowledge management\n\npractices and global convergence of technology has rede® ned the nature of work. while\n\nindividualisation of employment practices and team-based work may provide personal\n\nand organisational flexibilities, aligning hr and organisational strategies for\n\ncompetitive advantage has become more pro

# step-2 convert it to lowercase

In [2]:
#Convert each string to lowercase
for i in range(len(pdf_text)):
    pdf_text[i] = pdf_text[i].lower()

# step-3 remove reference

In [3]:
# remove references section 
import re
pdf_ref = [re.sub(r"(?is)\nreferences\n.+", "", f) for f in pdf_text]

# step-4 load metadata

In [4]:
import pickle
#load the text data
pdf_metadata = pickle.load(open("C:/Thesis/Data/save/Human_Resource_Management_journal/metadata/metadata_all.p", "rb"))

# step-5 get metadata = author, subject, title

In [5]:
pdf_title = []
pdf_Author = []
pdf_subject = []

for i in pdf_metadata:
    temp = ""
    auth_temp = ""
    subj_temp =""
    try:
        temp = i['title']
        auth_temp = i['Author']
        subj_temp = i['subject']
      
        pdf_title.append(temp.lower())
        pdf_Author.append(auth_temp.lower()) 
        pdf_subject.append(subj_temp.lower()) # it is included in stopwords list.
        
    except:
        #print("An exception occurred" )
        pdf_title.append(temp.lower())
        pdf_Author.append(auth_temp.lower())
        pdf_subject.append(subj_temp.lower())
        

In [24]:
pickle.dump(pdf_text_title, open("C:/Thesis/Data/save/Human_Resource_Management_journal/metadata/removed_author_title/metadata_all.p", "wb" ) )

In [None]:
# not required
def eliminate_method_result(method, discussion):
    
    import re    
    start = method
    end = discussion
    #start of pattern, followed by any number of times 'any character OR a newline' and terminated by the end pattern.
    pattern = start + '.*'+'(.|\n)*'+end
    pdf_method_dis = [re.sub(pattern, '', f) for f in pdf_text]

    return pdf_method_dis

In [None]:
# not required
def eliminate_method_result_ref(method, dis, ref):
    
    import re

    start = method
    end = dis
    
    #start of pattern, followed by any number of times 'any character OR a newline' and terminated by the end pattern.
    pattern = start + '.*'+'(.|\n)*'+end
    pdf_method_dis = [re.sub(pattern, '', f) for f in pdf_text]
    
    start_ref = ref
    pattern_ref = start_ref +'(.|\n)*$'

    pdf_ref= [re.sub(pattern_ref, '', f) for f in pdf_method_dis]
    
    return pdf_ref

# Pre-processing
Regular expression is used to clean up the pre-processed data and saved as a Master data

In [32]:
import re
pdf_p= [re.sub(r"-\n(\n)*", "", t)for t in pdf_ref]

pdf_p= [re.sub(r"\n(\n)*", "\n", t)for t in pdf_p]

pdf_p= [re.sub(r"\n", " ", t)for t in pdf_p]
# remove text with [] eg citation
pdf_p = [re.sub(r"\[[^)]*\]", "", t)for t in pdf_p]
#Remove numbers
pdf_p = [re.sub('[0-9]+', '', f) for f in pdf_p]
#pdf_p = [re.sub("r[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?", " ", t)for t in pdf_p]
pdf_p = [re.sub('[!@/%“”‘:#©β<>+=δχ*&$]', ' ', f) for f in pdf_p]
pdf_p = [re.sub('’s','', f) for f in pdf_p]
pdf_p = [re.sub('’re','', f) for f in pdf_p]
pdf_p = [re.sub('’t','', f) for f in pdf_p]
pdf_p = [re.sub('’ve','', f) for f in pdf_p]
pdf_p = [re.sub('’ll','', f) for f in pdf_p]
pdf_p = [re.sub('’m','', f) for f in pdf_p]

pdf_p = [re.sub('[:()-]', ' ', f) for f in pdf_p]
pdf_p = [re.sub('[\.!?]+ ', 'XYZXYZ', t) for t in pdf_p]
# \w will match alphanumeric characters and underscores
# [^\w] will match anything that's not alphanumeric or underscore
pdf_p = [re.sub(r'[^\w]', ' ', f) for f in pdf_p]
pdf_p = [re.sub('XYZXYZ', '.', f) for f in pdf_p]
pdf_p = [re.sub(' +', ' ', f) for f in pdf_p]
# Replace multiple dots with one dot
pdf_p = [re.sub('\.\.+',".", f) for f in pdf_p]
pdf_p = [re.sub(r'\b\w{1,3}\b',"", f) for f in pdf_p]
pdf_p = [re.sub(' +', ' ', f) for f in pdf_p]

pdf_p = [re.sub("^\s+","", f) for f in pdf_p]# remove the front space
pdf_p = [re.sub("\s+\Z","", f) for f in pdf_p]#remove the back space

In [49]:
#pdf_p

In [44]:
# author first and last name extracted from metadata
author_list = ['neil','stewart','spotlight','martin','jonghoon','michael','baer','kevin','corley','pratima','bansal','kconnolly','battilana','dorado']

author_list += ['elena','belogolovsky','peter','bamberger','karlene','roberts','chris','bingham','campbell','seung','hwan','jeong','scott','graffin']

author_list += ['robert','messen','andrew','carton','brice','roberts','dattée','oliver','alexy','erkko','autio','sreedhari','desai','patricio']

author_list += ['duran','nadine','kammerlander','marc','essen','thomas','zellweger','john','joseph','wshong','gokhan','ertug','tamar','yogev']


author_list += ['yonghoon','lee','peter','hedström','kibler','ewald','markus','perkmann','brenda','flannery','douglas','michael','florin','lubatkin']


author_list += ['william','schulze','raghu','garud','sanjay','jain','arun','kumaraswamy','gerard','george','christopher','corbishley','jane','khayesi']


author_list += ['martine','haas','laszlo','tihanyi','wakenshaw','gibbons','deborah','brian','gunia','joo','hun','han','saehee','kang']



author_list += ['rebecca','kehoe','david','lepak','hausknecht','nathan','hiller','robert','vance','hitt','tina','dacin','edward','levita']



author_list += ['jean','arregle','luc','anca','borza','fmurray','philippe','jacquart','antonakis','jason','jay','fuller','elfenbeinh','cynthia']

author_list += ['kim','jensen','lorraine','crystal','hkpu','margaret','luciano','amy','bartels','lauren','innocenzo','travis','maynard','mathieu']


author_list += ['hardy','smagui','johanna','mair','ignasi','marti','marc','ventresca','dellring','jcarson','mmcardle','erez','mia','mathieu']


author_list += ['monin','niels','noorderhaven','eero','vaara','kroon','jeroen','neckebrouck','gina','dokko','lisa','hisae','nishii','sonja']


author_list += ['opper','donde','ashmos','plowman','lakami','baker','tammy','beck','mukta','kulkarni','stephanie','solansky','deandra','villarreal']

author_list += ['ridge','ingram','aaron','hill','bloy','drew','harry','dwight','lemke','richard','dino','staf','guillaume','soenen']

author_list += ['tessa','melkonian','maureen','ambrose','evans','bennett','tepper','nikolaos','dimotakis','schurer','lambert','joel','koopman','fadel']


author_list += ['matta','hee','man','park','wongun','goo','tepper','varkey','titus','owen','parker','francesca','gino','bass']


author_list += ['erin','charlie','trevor','home','computer','martins','martín','esmt','european','varkey','gmbh','cwbauman','school','francesca','gino','bass']



In [45]:
# Function to filter out the stopwords and authors names
def filterWords(msg, words):
    m = msg.split(' ')
    words_end = [w + '.' for w in words]
    filtered_words = list(filter(lambda word: word not in words, m))
    filtered_words = [(w if w not in words_end else '.') for w in filtered_words]
    result = ' '.join(filtered_words)
    return(result)

In [46]:
data_proc_auth = [filterWords(f, author_list) for f in pdf_p]

In [50]:
#data_proc_auth

In [52]:
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['january', 'february', 'march', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
# metadata- subject
stopwords += ['business administration', 'economics and finance']
# author name to remove
stopwords +=['karina nielsen','kevin daniels','elaine', 'jaewan yang','andrew O herdman','amanda', 'sabine','kerstin','kertin','sagepub','journalspermissions','catheine','chidiebere ogbonnaya','john wiley','sons ltd','martin','jeffrey B arthur']

In [53]:
data_proc = [filterWords(f, stopwords) for f in data_proc_auth]

In [68]:
 data_proc[2]

'reducing perceptions overqualification impact satisfaction dual roles interpersonal relationships work reducing perceptions overqualification impact satisfaction dual roles interpersonal relationships work kerstinalfes chair organisation andhuman resourcemanagement escpeurope wirtschaftshochschule berlin shantz department management iéseg management lemcnrs baalen department human resource studies tilburg university human resource management journal pages sizeable portion working population perceives overqualified jobs.this problematic given research consistently shows beliefs translate lower levels satisfaction.hence behoves human resource management scholars identify factors influence perceptions overqualification moderators reduce negative effect perceived overqualification satisfaction. study present moderated path model posits quality relationships employees hold leader team antecedents perceived overqualification hypothesised weaken negative relationship perceived overqualificat

# Get country name

In [56]:
import pycountry

country_list = []
for country in pycountry.countries:
    a = (country.name).lower()
    country_list.append(a) 

In [57]:
data_proc_country =[]
for j in data_proc:
    line = j
    for k in country_list:
        line = re.sub(k, '', line)
        
    data_proc_country.append(line)

In [60]:
# data_proc_country[22]

In [58]:
# save data
pickle.dump(data_proc_country, open( "C:/Thesis/Data/save/Human_Resource_Management_journal/metadata/removed_country/metadata_all.p", "wb" ) )

In [61]:
# Authors names are saved already (code is available at author name folder-get name list notebook)
#load authors names
import pickle
author_name = pickle.load(open("C:/Thesis/Data/save/Master_Data/auth_ref_filter_num_char/auth_Human_Resource_Management_journal.p", "rb"))

In [62]:
# GET REFERENCE LISTS OF STOPWORDS, I.E. WORDS THAT ARE ESSENTIALLY MEANINGLESS
# Get all the list of authors names

from nltk.corpus import stopwords

sw = stopwords.words('english') + author_name 
#print(sw)

In [63]:
data_proc_1 = [filterWords(f, sw) for f in data_proc_country]

In [67]:
data_proc_1[2]

'reducing perceptions overqualification satisfaction dual roles interpersonal relationships work reducing perceptions overqualification satisfaction dual roles interpersonal relationships work kerstinalfes chair organisation andhuman resourcemanagement escpeurope wirtschaftshochschule iéseg lemcnrs baalen human resource studies tilburg human resource pages sizeable portion working population perceives overqualified jobs.this problematic given consistently shows beliefs translate lower levels satisfaction.hence behoves human resource scholars identify factors influence perceptions overqualification moderators reduce negative effect perceived overqualification satisfaction. present moderated path model posits quality relationships employees hold leader antecedents perceived overqualification hypothesised weaken negative relationship perceived overqualification satisfaction.survey gathered organisations  supported model.implications theory discussed.contact escp europe wirtschaftshochschu

In [64]:
data_proc_2 = [re.sub(' \.+', ' ', f) for f in data_proc_1]
data_proc_2 = [re.sub(' +', ' ', f) for f in data_proc_2]
data_proc_2 = [re.sub('http', '', f) for f in data_proc_2]
data_proc_2 = [re.sub('www', '', f) for f in data_proc_2]
data_proc_2 = [re.sub('iweb', '', f) for f in data_proc_2]
data_proc_2 = [re.sub(' +', ' ', f) for f in data_proc_2]


In [66]:
data_proc_2[2]

'reducing perceptions overqualification satisfaction dual roles interpersonal relationships work reducing perceptions overqualification satisfaction dual roles interpersonal relationships work kerstinalfes chair organisation andhuman resourcemanagement escpeurope wirtschaftshochschule iéseg lemcnrs baalen human resource studies tilburg human resource pages sizeable portion working population perceives overqualified jobs.this problematic given consistently shows beliefs translate lower levels satisfaction.hence behoves human resource scholars identify factors influence perceptions overqualification moderators reduce negative effect perceived overqualification satisfaction. present moderated path model posits quality relationships employees hold leader antecedents perceived overqualification hypothesised weaken negative relationship perceived overqualification satisfaction.survey gathered organisations supported model.implications theory discussed.contact escp europe wirtschaftshochschul

In [69]:
# save data
pickle.dump(data_proc_2, open( "C:/Thesis/Data/save/Master_Data/MD_1/data_proc_latest/HRMJ_data_proc.p", "wb" ) )

In [70]:
#import io
with open("C:/Thesis/Data/save/Master_Data/MD_1/data_proc_latest/HRMJ_data_proc_txt.txt", "w",encoding="utf-8") as outfile:
    for i in range(len(data_proc_2)):
        outstring = ""
        outstring += str(data_proc_2[i])
        outfile.write(outstring)