In [37]:
import os
import re
import pandas as pd
import urllib
from nltk import SnowballStemmer
from gensim.models import Word2Vec
#import langdetect
import tika
import time
from tika import parser
import pickle

# Raw Articles pdf Data
Convert pdf to text using Tika apache server 

In [38]:
# specify the target data folder

target_dir = 'C:/Thesis/Data/Personnel_Psychology'

In [39]:
# prep pdf extraction
pdf_files = []
wrd_files = []

for f in os.listdir(target_dir):
    if f.endswith(".pdf") | f.endswith(".PDF"):
        thispdf = os.path.join(target_dir, f)
        pdf_files = pdf_files + [thispdf]
    if f.endswith(".doc") | f.endswith(".docx") | f.endswith(".DOC") | f.endswith(".DOCX"):
        thiswrd = os.path.join(target_dir, f)
        wrd_files = wrd_files + [thiswrd]

In [40]:
port = 4321 # port to use for Tika server (chosen arbitrarily)
server_str = 'http://localhost:' + str(port)
tika_path  = 'C:\Software\ tika-server-1.23.jar'
tika_run   = 'java -jar' + tika_path + ' --port ' + str(port)
print('Command to start Tika:')
print(tika_run)
print('---')
print('Number of pdf files: ' + str(len(pdf_files)))
print('Number of word files: ' + str(len(wrd_files)))

Command to start Tika:
java -jarC:\Software\ tika-server-1.23.jar --port 4321
---
Number of pdf files: 1285
Number of word files: 0


In [64]:
# process the PDF files, store text in memory
tika.TikaClientOnly = True 

pdf_metadata = [parser.from_file(t, server_str, xmlContent=False)["metadata"] for t in (pdf_files + wrd_files)]

# Text Data

Eliminate title, sections (eg: content between Method and Discussion) and Reference. It is saved as a pre-processed data

In [50]:
# process the PDF files, store text in memory
tika.TikaClientOnly = True 

pdf_text = [parser.from_file(t, server_str, xmlContent=False)["content"] for t in (pdf_files + wrd_files)]

In [51]:
# save data
pickle.dump(pdf_text, open( "C:/Thesis/Data/save/Personnel_Psychology/save_alltextData.p", "wb" ) )

#  Load the Text data

In [1]:
import pickle
#load the text data
pdf_text = pickle.load(open("C:/Thesis/Data/save/Personnel_Psychology/save_alltextData.p", "rb"))


In [2]:
len(pdf_text)

511

# Get country names

In [3]:
# Since it is case sensetive, firstly get the list of country before converting the text to lowercase
country_list = ['america', 'vietnam', 'usa']
import pycountry

for country in pycountry.countries:
#    for i in pdf_text:
#        if country.name in i:
            #print(country.name)
            country_list.append(country.name.lower())

In [38]:
#Convert country list to lower case. it will be called in filter function
#country_lower = [x.lower() for x in country_list]
#print(country_list)

# Get city names (and additional countries)

In [4]:
from geotext import GeoText

cities_list = []
for i in pdf_text:
    place = GeoText(i)
    #print(place.cities)
    #print(place.countries)
    cities_list.append(place.cities + place.countries)

t = []
for i in cities_list:
    t += i
    
print(len(t))    
    
cities_list = list(set(t))
# cities_list

38433


In [5]:
#Convert cities list to lower case. it will be called in filter function
cities_lower =[]
for i in cities_list:
    #for j in i:
    cities_lower.append(i.lower())
print(len(cities_lower))


1144


#  Convert pdf text to lower case

In [6]:
#Convert each string to lowercase
for i in range(len(pdf_text)):
    pdf_text[i] = pdf_text[i].lower()

# Excluding pdfs if it doesnot have the standard format 
check the standard format of pdf- if it contains method and discussion section or not

In [7]:
import re
pdf_included =[]
pdf_excluded =[]

for i in pdf_text:
    flag_1 = re.search(r'\n(general )?discussion\n', i)
    flag_2 = re.search(r'\nmethods?\n', i)
   
    if flag_1 and flag_2: 
        pdf_included.append(i)
     
    else:
        pdf_excluded.append(i)
len(pdf_included)

271

In [8]:
len(pdf_excluded)

240

In [9]:
# Within JM: acknowledgement section appears in two approaches:1: before or after keyword section (only few pdfs)
# 2: before reference section (maximum pdfs)
# thus, focusing on the majority occurance of acknowledgements before reference second, we removed it based on that condition.

# step-1 remove content after acknowledgements section( it will include appendix, reference section)
pdf_remove_step1 = [re.sub(r"(?is)\nacknowledgments\n.*", "", f) for f in pdf_included]

# step-2 if there is no acknowledgements section than start with appendix (it will include reference section.)
pdf_remove_step2 = [re.sub(r"(?is)\nappendix\n.*", "", f) for f in pdf_remove_step1]

# step-3 remove all from reference section
pdf_remove_step3 = [re.sub(r"(?is)\nreferences\n.*", "", f) for f in pdf_remove_step2]

# get the reference section
references = [re.sub(r"^(.|\n)+\nreferences\n", "", f) for f in pdf_included]

In [10]:
len(pdf_remove_step3)

271

In [11]:
# step-4: remove content between method and discussion section (it contains method and result section)
def remove_method_result(method, discussion, text):
    
    import re    
    start = method
    end = discussion
    #start of pattern, followed by any number of times 'any character OR a newline' and terminated by the end pattern.
    #pattern = start + '.*'+'(.|\n)*'+end
    pattern = start +'(.|\n)*'+end
    #pdf_method_dis = [re.sub(pattern, '', f) for f in pdf_text]
    pdf_method_dis = re.sub(pattern, '', text)

    return pdf_method_dis

In [12]:
import re
#  remove method and result
pdf_remove_step4 = [remove_method_result('\nmethods?\n', '\n(general )?discussion\n', t) for t in pdf_remove_step3]

In [13]:
len(pdf_remove_step4)

271

# Pre-processing
Regular expression is used to clean up the data

In [14]:
# testing it works
ref_doc = 20
print(pdf_remove_step4[ref_doc])
print('--**************************************-')
print('-**************************************--')
print('-**************************************--')
# print(pdf_remove_step4[ref_doc])























































spotlight on age&#x02010;diversity climate: the impact of age&#x02010;inclusive hr practices on firm&#x02010;level outcomes


personnel psychology
2014, 67, 667–704

spotlight on age-diversity climate: the
impact of age-inclusive hr practices on
firm-level outcomes

stephan a. boehm
university of st. gallen

florian kunze
university of st. gallen

heike bruch
university of st. gallen

this study investigates the emergence and the performance effects of
an age-diversity climate at the organizational level of analysis. building
upon kopelman and colleagues’ (kopelman, brief, & guzzo, 1990) cli-
mate model of firm productivity as well as cox’s (1994) interactional
model of cultural diversity, we hypothesize a positive influence of age-
inclusive hr practices on the development of an organization-wide age-
diversity climate, which in turn should be directly related to collective
perceptions of social exchange and indirectly to firm perf

In [15]:
# remove journal title here?
pdf_p = [re.sub('journal of management', '', t) for t in pdf_remove_step4]
pdf_p = [re.sub('et al.', '', t) for t in pdf_p]
pdf_p = [re.sub('e.g.', '', t) for t in pdf_p] 
pdf_p = [re.sub('e-mail:', '', t) for t in pdf_p] 
# CHECK ORDER OF OPERATIONS BELOW:
#remove email address <-- IMPROVED, BUT E-MAIL SEEMS TO BE BROKEN UP, BUT STILL PRESENT IN DOCUMENTS AT TIMES         
pdf_p = [re.sub(r'[\w\.\-]+@[\w\.\-]+\.(?:com|net|org|live|edu)', "", t) for t in pdf_p]

In [16]:
pdf_p = [re.sub( r'https?://[^\s<>"]+|www\.[^\s<>"]+', "", t) for t in pdf_p]
pdf_p = [re.sub("[\S\.\-\?\!\=]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)(/[\S\.\-\?\!\=]*)?\s?", " ", t) for t in pdf_p]

In [17]:
pdf_p[20]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nspotlight on age&#x02010;diversity climate: the impact of age&#x02010;inclusive hr practices on firm&#x02010;level outcomes\n\n\npersonnel psychology\n2014, 67, 667–704\n\nspotlight on age-diversity climate: the\nimpact of age-inclusive hr practices on\nfirm-level outcomes\n\nstephan a. boehm\nuniversity of st. gallen\n\nflorian kunze\nuniversity of st. gallen\n\nheike bruch\nuniversity of st. gallen\n\nthis study investigates the emnce and the performance effects of\nan age-diversity climate at the organizational level of analysis. building\nupon kopelman and colles’ (kopelman, brief, & guzzo, 1990) cli-\nmate model of firm productivity as well as cox’s (1994) interactional\nmodel of cultural diversity, we hypothesize a positive influence of age-\ninclusive hr practices on the development of an organization-wide age-\ndiversity climate, which in turn should be directly related 

In [18]:
#remove hyphon only if it is at the end of the line.
#pdf_p = [re.sub(r'(\W[a-z]+)-\n(\W[a-z]+)', r'\1\2', t) for t in pdf_p]
pdf_p = [re.sub(r'-\n', '', t) for t in pdf_p]

In [19]:
#remove text within ()
pdf_p = [re.sub(r"\([^)]*\)", "", t) for t in pdf_p]
# remove text with [] eg citation
pdf_p = [re.sub(r"\[[^)]*\]", "", t) for t in pdf_p]
#Remove numbers
pdf_p = [re.sub('[0-9]+', '', f) for f in pdf_p]


In [20]:
pdf_p = [re.sub('[\n\r]+', ' ', f) for f in pdf_p] # remove newline and linebreak characters, replace with space
pdf_p = [re.sub('[@/%#$,=\+:’“”]', ' ', f) for f in pdf_p] # remove stray non-word characters

In [21]:
#pdf_p = [re.sub(r"\s\([A-Z][a-z]+,\s[A-Z][a-z]?\.[^\)]*,\s\d{4}\)", "", f) for f in pdf_p] # is this supposed to filter out citations?

# save all periods, exclamation marks and question marks, but get rid of unnecessary characters
pdf_p = [re.sub('[\.!?]+ ', 'XYZXYZ', t) for t in pdf_p]
pdf_p = [re.sub('-', 'ZYXXYZ', t) for t in pdf_p]
pdf_p = [re.sub('[\W_]+', ' ', f) for f in pdf_p]
pdf_p = [re.sub('XYZXYZ', '. ', f) for f in pdf_p]
pdf_p = [re.sub('ZYXXYZ', '-', f) for f in pdf_p]
#pdf_p[0]

#Remove placeholder x repetitions
pdf_p = [re.sub(' x+ ', '', f) for f in pdf_p]

# remove single characters
pdf_p = [re.sub(' [a-z] ' ,  ' ', f) for f in pdf_p]
pdf_p = [re.sub(' [a-z]\.', '.', f) for f in pdf_p]
pdf_p = [re.sub(' \.', '.', f) for f in pdf_p]

In [22]:
print(pdf_p[ref_doc])
print('---')
print('---')
print('---')
# print(pdf_p[ref_doc])

 spotlight on agediversity climate the impact of ageinclusive hr practices on firmlevel outcomes personnel psychology spotlight on age-diversity climate the impact of age-inclusive hr practices on firm-level outcomes stephan. boehm university of st. gallen florian kunze university of st. gallen heike bruch university of st. gallen this study investigates the emnce and the performance effects of an age-diversity climate at the organizational level of analysis. building upon kopelman and colles climate model of firm productivity as well as cox interactional model of cultural diversity we hypothesize positive influence of ageinclusive hr practices on the development of an organization-wide agediversity climate which in turn should be directly related to collective perceptions of social exchange and indirectly to firm performance and employees collective turnover intentions. the assumed relationships are tested in sample of german small and medium-sized companies with employees participati

In [28]:
def get_author_names(txt):

    refs = re.sub('[0-9]{4}.*\n','',txt)

    l = re.compile(" [a-z]\.").split(refs)
    l = [ll for ll in l if not bool(re.search('[0-9]+', ll))]
    l = [ll for ll in l if len(ll) < 35]
    l = [re.sub('[\n ,\&\.]', '', ll) for ll in l]
    l = [re.sub('[!(*);&$]', '', l1) for l1 in l]
    authors = [ll for ll in l if len(ll) > 1]

    return(authors)
#for n in names:
#    print(n)

#refs = re.sub('\n.*[0-9].*', '', refs)
#refs = re.sub('\n[^,]+\n', '\n', refs)
#print(refs)

In [29]:
authors = []
for r in references:
    authors += get_author_names(r)
    
author_nam = list(set(list(authors)))

# result is still not perfect, but I have not found any stray 'plain English' so far
print(author_nam[100:250])
print(len(author_nam))

['ames', 'herrnsteinrjmurray', 'abelelkrugermlfriedl', 'coffin', 'milesrsnow', 'cliff', 'cotessaksamzikic', 'blau', 'bandieraobarankayirasul', 'ticehallbandura', 'aikenlwest', '∗∗inwald', 'byrnedcloreglsmeaton', 'schneider', 'pressbass', 'riverspresslondon', 'banduraacervone', 'bandura', 'johns', 'frijda', 'in', 'prenticehallbandura', 'aguinishharden', 'softwareinternationalkizilos', 'duniewicz', 'duehrandjoyce', 'tavares', 'cohen', 'bagozzirpyi', 'antonioni', 'prensky', 'boyle', 'http://nbclatinocom/schmitt', 'schnabelkulittletdbaumert', 'bormanwmotowidlo', 'kleinmann', 'baltes', 'klehe', 'brotheridgecgrandey', '∗payne', 'janssens', 'granovetter', 'vanheil', 'fritzschebasalas', 'schneiderbbowen', 'bacontharenou', 'andtrendsin', 'festinger', 'lanlikert', 'silverhgoldscheider', 'meninthe', 'co∗schooncgkaleymstern', 'gill', 'granbergdholmberg', 'harperrowdecieryan', 'basicbooksgold', 'adams', 'vardiyweitz', 'flynnfames', 'jacob', 'rowshockley', 'duffymhenleclambert', 'fishbein', 'nelson'

In [None]:
# Authors names are saved already (code is available at author name folder-get name list notebook)
#load authors names
#import pickle
#author_nam = pickle.load(open("C:/Thesis/Data/save/Master_Data/auth_ref_filter_num_char/auth_Journal_of_management.p", "rb"))

In [30]:
# GET REFERENCE LISTS OF STOPWORDS, I.E. WORDS THAT ARE ESSENTIALLY MEANINGLESS
# Get all the list of authors names

from nltk.corpus import stopwords as wrd

sw = wrd.words('english') + author_nam + country_list + cities_lower

#print(sw)

In [31]:
#include corpus-specific stopwords
sw += ['vol', 'doi', 'article', 'articles', 'reuse', 'sagepub', 'journal', 'journals', 'com', 'quarter', 'quarterly', 'annual', 'proceedings']
sw += ['sage', 'editorial', 'report', 'publications', 'publication', 'publisher', 'jom', 'manuscript', 'manuscripts', 'submission', 'submissions']
sw += ['editor', 'editors', 'orcid', 'id', 'month', 'mailto', 'author', 'authors', 'homepage', 'sciencedirect', 'scopus']
sw += ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
sw += ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

# add a metric ton of generic terms
sw += ['a', 'about', 'above', 'across', 'after', 'afterwards']
sw += ['again', 'against', 'all', 'almost', 'alone', 'along']
sw += ['already', 'also', 'although', 'always', 'am', 'among']
sw += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
sw += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
sw += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
sw += ['because', 'become', 'becomes', 'becoming', 'been']
sw += ['before', 'beforehand', 'behind', 'being', 'below']
sw += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
sw += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
sw += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
sw += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
sw += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
sw += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
sw += ['every', 'everyone', 'everything', 'everywhere', 'except']
sw += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
sw += ['five', 'for', 'former', 'formerly', 'forty', 'found']
sw += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
sw += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
sw += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
sw += ['herself', 'him', 'himself', 'his', 'how', 'however']
sw += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
sw += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
sw += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
sw += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
sw += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
sw += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
sw += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
sw += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
sw += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
sw += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
sw += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
sw += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
sw += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
sw += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
sw += ['some', 'somehow', 'someone', 'something', 'sometime']
sw += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
sw += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
sw += ['then', 'thence', 'there', 'thereafter', 'thereby']
sw += ['therefore', 'therein', 'thereupon', 'these', 'they']
sw += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
sw += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
sw += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
sw += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
sw += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
sw += ['whatever', 'when', 'whence', 'whenever', 'where']
sw += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
sw += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
sw += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
sw += ['within', 'without', 'would', 'yet', 'you', 'your']
sw += ['yours', 'yourself', 'yourselves']

sw = list(set(sw))

In [32]:
# Function to filter out the stopwords and authors names
def filterWords(msg, words):
    m = msg.split(' ')
    words_end = [w + '.' for w in words]
    filtered_words = list(filter(lambda word: word not in words, m))
    filtered_words = [(w if w not in words_end else '.') for w in filtered_words]
    result = ' '.join(filtered_words)
    return(result)

In [33]:
# GET RID OF THE STOPWORDS IN TEXTS
data_proc = [filterWords(f, sw) for f in pdf_p]  

In [34]:
import pickle
# exclude author list from metadata
author_metadata = pickle.load(open("C:/Thesis/Data/save/Master_Data/author_list/author_list.p", "rb"))

In [35]:
# GET RID OF THE author names (extracted from the metadata)
data_proc_1 = [filterWords(f, author_metadata) for f in data_proc] #running

In [37]:
data_proc_1[ref_doc]

' agediversity climate impact ageinclusive hr practices firmlevel outcomes personnel psychology age-diversity climate impact age-inclusive hr practices firm-level outcomes . st. gallen st. gallen heike st. gallen study investigates emnce performance effects age-diversity climate organizational level analysis. building colles climate model firm productivity cox interactional model cultural diversity hypothesize positive influence ageinclusive hr practices development organization-wide agediversity climate turn directly related collective perceptions exchange indirectly firm performance employees collective turnover intentions. assumed relationships tested sample medium-sized companies employees participating. circumvent common source problems information various constructs gathered different sources. test assumed relationships structural equation modeling executed bootstrapping procedures test significance indirect effects. received support assumed relationships. paper concludes practic

In [38]:
data_proc2 = [re.sub("([a-z]+\.)+[a-z]+","", f) for f in data_proc_1]
data_proc2 = [re.sub('\s([?\.!"](?:\s|$))' , r'\1', f) for f in data_proc2]
data_proc2 = [re.sub("^\s+","", f) for f in data_proc2]
data_proc2 = [re.sub("\s+\Z","", f) for f in data_proc2]
data_proc2 = [re.sub("(\.)+",".", f) for f in data_proc2]
data_proc2 = [re.sub(" +"," ", f) for f in data_proc2]
data_proc2 = [re.sub("( +\.)",".", f) for f in data_proc2]
data_proc2 = [re.sub("\.$","", f) for f in data_proc2]

In [39]:
print(data_proc2[ref_doc].split('. '))

sents = []
for d in data_proc2:
    sents += d.split('. ')

['agediversity climate impact ageinclusive hr practices firmlevel outcomes personnel psychology age-diversity climate impact age-inclusive hr practices firm-level outcomes', 'st', 'gallen st', 'gallen heike st', 'gallen study investigates emnce performance effects age-diversity climate organizational level analysis', 'building colles climate model firm productivity cox interactional model cultural diversity hypothesize positive influence ageinclusive hr practices development organization-wide agediversity climate turn directly related collective perceptions exchange indirectly firm performance employees collective turnover intentions', 'assumed relationships tested sample medium-sized companies employees participating', 'circumvent common source problems information various constructs gathered different sources', 'test assumed relationships structural equation modeling executed bootstrapping procedures test significance indirect effects', 'received support assumed relationships', 'pape

In [40]:
print(sents[0:100])

['job seeking retirees seeking bridge employment personnel psychology job seeking retirees seeking bridge employment department psychology wisconsin rau college administration wisconsin using sample recent retirees study described sought test thneral propositions wanb watt rumsey model including specific variables likely relevant older adults seeking work retirement integrating recent variabloups suggested recent wanb kantrowitz', 'generally results support efficacy wanb model predict job seeking group', 'biographical variables older worker job search constraints self-evaluations motive variables related job seeking', 'expected direction', 'similarities differences regular job seeking bridge employment job seeking discussed', 'trends increased current projected prevalence continued employment officially retired', 'important trend bridge employment substantial increase expected number older adults united states current level approximately', 'million adults age years million', 'older adu

In [41]:
# save data
pickle.dump(data_proc2, open( "C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed.p", "wb" ) )

In [42]:
import io

# writing documents one sentence per line to form a larger data set (allowing use of a generator for model training):
with open("C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed_txt.txt", "a",encoding="utf-8") as outfile:
    
    for s in sents:
        outstring = s + '\n'
        outfile.write(outstring)
        
# (additionally you could keep document boundaries intact, 
# e.g. by writing a file per document or marking start and end of articles in the larger file)

In [None]:
#import io
#with open("C:/Thesis/Data/save/Master_Data/MD_3/data_proc_latest/JM_data_proc_txt.txt", "w",encoding="utf-8") as outfile:
#    for i in range(len(data_proc)):
#        outstring = ""
#        outstring += str(data_proc[i])
#        outfile.write(outstring)