In [1]:
import os
import re
import pandas as pd
import urllib
from nltk import SnowballStemmer
from gensim.models import Word2Vec
#import langdetect
import tika
import time
from tika import parser
import pickle

# Raw article PDF data
Convert the PDF files to text using an Apache Tika server.

In [2]:
# Folder that holds the raw article files of this journal
target_dir = "C:/Thesis/Data/Journal_of_organizational_behavior"

In [3]:
# prep pdf extraction
# Collect the full path of every PDF / Word file found directly inside target_dir.
pdf_files = []
wrd_files = []

for entry in os.listdir(target_dir):
    # Compare against the lower-cased name so ".pdf", ".PDF", ".Pdf" ... are all accepted.
    entry_lower = entry.lower()
    if entry_lower.endswith(".pdf"):
        pdf_files.append(os.path.join(target_dir, entry))
    elif entry_lower.endswith((".doc", ".docx")):
        wrd_files.append(os.path.join(target_dir, entry))

In [4]:
port = 4321 # port to use for Tika server (chosen arbitrarily)
server_str = 'http://localhost:' + str(port)
# Raw string: the backslashes of a Windows path must not be read as escape sequences.
# NOTE(review): the blank after "Software\" is kept from the original literal and looks
# accidental - confirm the real location of the jar.
tika_path  = r'C:\Software\ tika-server-1.23.jar'
# "-jar" needs a separating space, and the path is quoted because it contains a blank.
tika_run   = 'java -jar "' + tika_path + '" --port ' + str(port)
print('Command to start Tika:')
print(tika_run)
print('---')
print('Number of pdf files: ' + str(len(pdf_files)))
print('Number of word files: ' + str(len(wrd_files)))

Command to start Tika:
java -jarC:\Software\ tika-server-1.23.jar --port 4321
---
Number of pdf files: 1168
Number of word files: 0


# PDF metadata

In [5]:
# Ask the Tika server at server_str for each document and keep only its metadata
tika.TikaClientOnly = True

pdf_metadata = []
for doc_path in pdf_files + wrd_files:
    parsed_doc = parser.from_file(doc_path, server_str, xmlContent=False)
    pdf_metadata.append(parsed_doc["metadata"])

In [9]:
# save data metadata
# The `with` block closes the file even if pickling fails, so the pickle is always flushed to disk.
with open("C:/Thesis/Data/save/Journal_of_organizational_behavior/metadata/metadata_all.p", "wb") as metadata_file:
    pickle.dump(pdf_metadata, metadata_file)

# Text Data

Remove the title, selected sections (e.g. the content between Method and Discussion) and the reference list. The result is saved as the pre-processed data.

In [None]:
# process the PDF files, store text in memory
tika.TikaClientOnly = True 

# Full-text extraction through Tika is disabled here; pdf_text is read back from a pickle in a later cell.
#pdf_text = [parser.from_file(t, server_str, xmlContent=False)["content"] for t in (pdf_files + wrd_files)]

# step-1 Load pdf content


In [6]:
#load the text data
# (pickle is already imported in the first cell)
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
with open("C:/Thesis/Data/save/Journal_of_organizational_behavior/save_alltextData.p", "rb") as text_file:
    pdf_text = pickle.load(text_file)


In [None]:
# step-2 convert it to lowercase

In [7]:
# Lower-case every document, overwriting the entries of pdf_text one by one
for position, document in enumerate(pdf_text):
    pdf_text[position] = document.lower()

In [None]:
# step-3 remove reference

In [8]:
# Cut everything from the first line that reads just "references" to the end of the document
import re
reference_tail = re.compile(r"(?is)\nreferences\n.+")
pdf_ref = []
for document in pdf_text:
    pdf_ref.append(reference_tail.sub("", document))

In [None]:
# step-4 load metadata


In [10]:
#load the metadata
# (pickle is already imported in the first cell)
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
with open("C:/Thesis/Data/save/Journal_of_organizational_behavior/metadata/metadata_all.p", "rb") as metadata_file:
    pdf_metadata = pickle.load(metadata_file)

In [None]:
# step-5 get metadata = author, subject, title

In [27]:
pdf_title = []
pdf_Author = []
pdf_subject = []


def _metadata_text(meta, key):
    """Return meta[key] as a single string.

    A missing key (or missing record) yields '' so the three result lists stay
    aligned with pdf_metadata. A field holding several values is joined with ', '.
    """
    value = (meta or {}).get(key, '')
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        return ', '.join(str(part) for part in value)
    return str(value)


# One entry per document in every list. The previous try/except re-appended the values
# left over from the preceding document whenever a field was missing, which silently
# attached the wrong title/author/subject to that document.
for meta in pdf_metadata:
    pdf_title.append(_metadata_text(meta, 'title').lower())
    pdf_Author.append(_metadata_text(meta, 'Author'))  # author will be added in the stopwords list
    pdf_subject.append(_metadata_text(meta, 'subject').lower())  # subject is removed

In [28]:
# Display the collected author strings for a visual check
pdf_Author

['',
 '',
 '',
 '',
 '',
 '',
 '',
 'Ines Meyer, Stuart C. Carr, Lori Foster',
 'Ines Meyer, Stuart C. Carr, Lori Foster',
 'Ines Meyer, Stuart C. Carr, Lori Foster',
 'Ines Meyer, Stuart C. Carr, Lori Foster',
 'Ines Meyer, Stuart C. Carr, Lori Foster',
 'Joseph A. Allen, Nale Lehmann-Willenbrock, Steven G. Rogelberg',
 '',
 '',
 '',
 '',
 '',
 'David Antons, Mathieu Declerck, Kathleen Diener, Iring Koch, Frank T. Piller',
 'David Antons, Mathieu Declerck, Kathleen Diener, Iring Koch, Frank T. Piller',
 'David Antons, Mathieu Declerck, Kathleen Diener, Iring Koch, Frank T. Piller',
 'Kara A. Arnold, Catherine E. Connelly, Ian R. Gellatly, Megan M. Walsh, Michael J. Withey',
 'Kara A. Arnold, Catherine E. Connelly, Ian R. Gellatly, Megan M. Walsh, Michael J. Withey',
 'Kara A. Arnold, Catherine E. Connelly, Ian R. Gellatly, Megan M. Walsh, Michael J. Withey',
 'Kara A. Arnold, Catherine E. Connelly, Ian R. Gellatly, Megan M. Walsh, Michael J. Withey',
 'Susan J. Ashford, Ned Wellman, M

In [30]:
# Turn every author string into a list of name tokens
import re

# (pattern, replacement) pairs, applied in this order
author_steps = [
    (' and ', ''),
    ('[!;&$]', ''),
    ('[0-9]+', ''),
    (r'\b\w{1,3}\b', ''),   # drop words of up to three characters (initials etc.)
    (r'-', ' '),
    (' +', ','),            # every run of blanks becomes one comma
]

pdf_auth = []
for raw_author in pdf_Author:
    cleaned = str(raw_author)
    for pattern, replacement in author_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    pdf_auth.append(cleaned.split(","))
    

In [31]:
# Display the tokenised author names for a visual check
pdf_auth

[[''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['Ines', 'Meyer', '', 'Stuart', '.', 'Carr', '', 'Lori', 'Foster'],
 ['Ines', 'Meyer', '', 'Stuart', '.', 'Carr', '', 'Lori', 'Foster'],
 ['Ines', 'Meyer', '', 'Stuart', '.', 'Carr', '', 'Lori', 'Foster'],
 ['Ines', 'Meyer', '', 'Stuart', '.', 'Carr', '', 'Lori', 'Foster'],
 ['Ines', 'Meyer', '', 'Stuart', '.', 'Carr', '', 'Lori', 'Foster'],
 ['Joseph',
  '.',
  'Allen',
  '',
  'Nale',
  'Lehmann',
  'Willenbrock',
  '',
  'Steven',
  '.',
  'Rogelberg'],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['David',
  'Antons',
  '',
  'Mathieu',
  'Declerck',
  '',
  'Kathleen',
  'Diener',
  '',
  'Iring',
  'Koch',
  '',
  'Frank',
  '.',
  'Piller'],
 ['David',
  'Antons',
  '',
  'Mathieu',
  'Declerck',
  '',
  'Kathleen',
  'Diener',
  '',
  'Iring',
  'Koch',
  '',
  'Frank',
  '.',
  'Piller'],
 ['David',
  'Antons',
  '',
  'Mathieu',
  'Declerck',
  '',
  'Kathleen',
  'Diener',
  '',
  'Iring',
  'Koch',
  '',
  'Frank',
  '.',
  'Piller']

In [29]:
# got error 
# NOTE(review): this disabled loop passes each raw title to re.sub as the pattern. Titles
# can contain regex metacharacters, which is presumably what triggered the "multiple repeat"
# error; re.escape(j) or str.replace would treat the title as literal text - confirm.
# pdf_text_title = []

# for i in pdf_ref:
#     line = i
#     # for author
#     for j in pdf_title:
#         line = re.sub(j, '', line)

#     pdf_text_title.append(line)


In [19]:
# step-1 preprocess subject
# (re is already imported in the first cell)
pdf_sub = []

# pre-processing for subject
for subject in pdf_subject:
    # remove special characters (? - | . : !)
    cleaned = re.sub(r'[?\-|.|:|!]', '', subject)
    # remove digits (raw string: "\d" is not a valid escape in a normal string literal)
    cleaned = re.sub(r"\d+", "", cleaned)
    # remove trailing whitespace
    cleaned = re.sub(r"\s+$", "", cleaned)
    # append to the list
    pdf_sub.append(cleaned)

In [22]:
# Remove every known subject line from every document.
# The subjects are free text, so they are removed as literal strings (str.replace) rather
# than being interpreted as regular expressions, where characters such as "(", "+" or "*"
# would change the match or raise re.error. Empty and repeated subjects are skipped up
# front, which also cuts the number of passes over each document.
subject_phrases = [phrase for phrase in dict.fromkeys(pdf_sub) if phrase]

pdf_text_subject = []

for document in pdf_ref:
    for phrase in subject_phrases:
        document = document.replace(phrase, '')
    pdf_text_subject.append(document)

In [23]:
# Spot-check one document after subject removal
pdf_text_subject[2]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfirm or subgroup culture: where does fitting in matter most?\n\n\n\n\nj. organiz. behav. 25, 969–978 (2004)\n\npublished online in wiley interscience (www.interscience.wiley.com). doi: 10.1002/job.291\n\nfirm or subgroup culture: where does\nfitting in matter most?\n\nbryan adkins1 and david caldwell2*\n1denison consulting, half moon bay, california, u.s.a.\n2leavey school of business, santa clara university, santa clara, california, u.s.a.\n\nsummary using the organizational culture profile (ocp), this research investigated the extent to which\nfit between individuals (n¼ 136) and their competency groups (p–g fit) and the organization\n(p–o fit) were related to job satisfaction. even in a consulting firm with a strong organization\nculture, we found small, but interpretable, differences between the value-based cultures of the\ncompetency groups in the firm. although the two forms of fit we

In [None]:
# not required
def eliminate_method_result(method, discussion, texts=None):
    """Delete the span from ``method`` up to and including the last ``discussion``.

    Parameters
    ----------
    method : str
        Regular expression that marks the start of the span to delete.
    discussion : str
        Regular expression that marks the end of the span to delete.
    texts : list of str, optional
        Documents to process. Defaults to the notebook-level ``pdf_text``.

    Returns
    -------
    list of str
        One string per input document with the span removed; documents without
        a matching span are returned unchanged.
    """
    import re

    if texts is None:
        texts = pdf_text
    # [\s\S]* matches any characters including newlines. It finds the same span as the
    # former '.*(.|\n)*' but without an alternation inside a repeat, which is very slow
    # on long documents.
    pattern = re.compile(method + r'[\s\S]*' + discussion)
    return [pattern.sub('', text) for text in texts]

In [None]:
# not required
def eliminate_method_result_ref(method, dis, ref, texts=None):
    """Delete the method-to-discussion span and everything from ``ref`` onwards.

    Parameters
    ----------
    method : str
        Regular expression that marks the start of the span to delete.
    dis : str
        Regular expression that marks the end of the span to delete.
    ref : str
        Regular expression that marks the start of the reference section;
        everything from its first match to the end of the document is removed.
    texts : list of str, optional
        Documents to process. Defaults to the notebook-level ``pdf_text``.

    Returns
    -------
    list of str
        One cleaned string per input document.
    """
    import re

    if texts is None:
        texts = pdf_text
    # [\s\S]* matches any characters including newlines. It finds the same spans as the
    # former '(.|\n)*' constructs but without an alternation inside a repeat, which is
    # very slow on long documents.
    section_pattern = re.compile(method + r'[\s\S]*' + dis)
    reference_pattern = re.compile(ref + r'[\s\S]*$')

    without_sections = [section_pattern.sub('', text) for text in texts]
    # Local name differs from the notebook-level pdf_ref to avoid confusing the two.
    return [reference_pattern.sub('', text) for text in without_sections]

#  Pre-processing
Regular expressions are used to clean up the pre-processed data; the result is saved as the master data.

In [9]:
# Ordered (pattern, replacement) steps; every document goes through all of them in turn.
# (re is already imported in the first cell)
_CLEANING_STEPS = [
    (r"-\n(\n)*", ""),        # re-join words that were hyphenated across a line break
    (r"\n(\n)*", "\n"),       # collapse runs of line breaks
    (r"\n", " "),             # remaining line breaks become blanks
    # remove text with [] eg citation. The class excludes "]" (it used to exclude ")"),
    # so each bracket pair is removed on its own instead of one match running from the
    # first "[" to the last "]" and swallowing the text in between.
    (r"\[[^\]]*\]", ""),
    ('[0-9]+', ''),           # remove numbers
    ('[!@/%“”‘:#©β<>+=δχ*&$]', ' '),
    ('’s', ''),
    ('’re', ''),
    ('’t', ''),
    ('’ve', ''),
    ('’ll', ''),
    ('’m', ''),
    ('[:()-]', ' '),
    (r'[\.!?]+ ', 'XYZXYZ'),  # protect sentence ends from the punctuation sweep below
    # \w will match alphanumeric characters and underscores
    # [^\w] will match anything that's not alphanumeric or underscore
    (r'[^\w]', ' '),
    # Restore the sentence end together with the blank the marker replaced. Without the
    # blank, "end. next" became the single token "end.next", which filterWords (it splits
    # on blanks and looks for tokens ending in ".") could never match.
    ('XYZXYZ', '. '),
    (' +', ' '),
    (r'\.\.+', "."),          # replace multiple dots with one dot
    (r'\b\w{1,3}\b', ""),     # drop words of up to three characters
    (' +', ' '),
    (r"^\s+", ""),            # remove the front space
    (r"\s+\Z", ""),           # remove the back space
]
_CLEANING_STEPS = [(re.compile(pattern), replacement) for pattern, replacement in _CLEANING_STEPS]


def _clean_document(text):
    """Apply every cleaning step, in order, to one document and return the result."""
    for pattern, replacement in _CLEANING_STEPS:
        text = pattern.sub(replacement, text)
    return text


pdf_p = [_clean_document(document) for document in pdf_ref]

In [None]:
# Hand-maintained list of lower-case tokens that are stripped from every document via
# filterWords: mostly author first/last names, plus some non-name tokens such as
# 'school', 'business' or 'computer'. Entries repeat across lines; the list is only
# used for membership tests, so repeats do not change the result.
# NOTE(review): the "." entry makes filterWords drop stand-alone "." tokens as well.
author_list = ["bryan", "acton","roseanne", "foti", "robert", "lord", "jessica", "gladfelter", "mats"]
author_list += ["alvesson","katja", "einola", 'steina', 'jantonak', "john", "antonakis", "samuel", "bendahan", "philippe"]
author_list += ["jacquart", "rafael", "lalive", "george", "banks", "nicolas", "bastardoz", "michael", "cole", "david","alice"]
author_list += ["eagly", "olga", "epitropaki"]
author_list += ["william", "gardner", ".", 'alexander', "haslam", "hogg", "ronit"]
author_list += ["kark", "kevin", "lowe", "philip", "podsakoff", "seth", "spain", "janka", "stoker", "niels", "quaquebeke"]
author_list += ["mark", "vugt", "dusya", "vera", "roberto", "weber", "nicolas", "bastardoz","mark", "vugt", 'stephane']
author_list += ['brutus', 'shawn', 'burke', 'dana', 'sims', 'elizabeth', 'lazzara', 'eduardo', 'salas', 'abraham', 'carmeli']
author_list += ['meyrav', 'yitzack', 'halevi', "david", "carrington", "combe", "mumford", 'jingnan', 'chen', "minyoung", "cheong"]
author_list += ["francis", "yammarino", "shelley", "dionne", "chou", "tsai",'cheng', "steve","shin", 'guang']
author_list += ["liang", "amon", "chizema","ganna", "pogrebna", 'neil', 'stewart', "joseph", "crawford",'anne', "kelder", "stéphane"]
author_list += ["côté", "paulo", "lopes", "peter", "salovey", "christopher", "miners"]
author_list += ['uqbweave', 'david',"david", "hock", 'peng',"belle", "derks","colette", "laar","naomi", "ellemers", 'shelley', 'dionne']
author_list += ['yvonne', 'budden', 'nathan',"bassam", "farah", "rida"]
author_list += ["elias", "cristine", "clercy", "glenn", "rowe", 'christopher', 'watkins', "louis", "sean", "hannah", "noel"]
author_list += ["fred", "walumbwa", "zachary", "garfield", "rueden", "edward", "hagen", 'geys', 'steffen', 'giessner']
author_list += ['daan', 'knippenberg', 'sleebos', "laura", "giurge", "marius", "dijke", "michelle", "zheng", "cremer", 'design']
author_list += ['unit', 'london', 'school', 'economics','activepdf', 'kelly', 'hannum', "anna", 'luca', "heimann"]
author_list += ["ingold", "martin", "kleinmann", "nathan", "hiller", "hock", 'peng', "ajay", "ponnapalli", "sibel", "ozgen"]
author_list += ['crystal', 'hoyt', 'stefanie', 'johnson', 'susan', 'elaine', 'murphy', 'kerry', 'hogue', 'skinnell', 'hendrik']
author_list += ['huettermann', 'sebastian', 'doering', 'sabine', 'boerner', "hughes", "allan", "tian", "alex", "newman", "alison"]
author_list += ["legood", 'center', 'applied', 'social', 'research', 'dongil', 'jung', 'francis', 'yammarino',"thomas"]
author_list += ["kelemen", "matthews", "kimberley", "breevaart", 'michael', 'kosfeld', "lindie", "liang", "douglas", "brown"]
author_list += ["huiwen", "lian", "hanig", 'lance', "ferris", "lisa", "keeping", "jukka", "lipponen", "janne", "kaltiainen"]
author_list += ["werff", "niklas", "steffens", "jeffrey", "lovelace", "brett", "neely", "julian", "allen", "hunter"]
author_list += ['bourgoin', "charles", "reilly", "bernadette", "doerr", "caldwell", "jennifer", "chatman", "reilly"]
author_list += ['dawn', 'eubanks', "philip", "podsakoff","nathan", "podsakoff", "therese", "reitan","sten"]
author_list += ["stenberg", 'chester', 'schriesheim', 'joshua', 'terri', 'scandura', 'jeroen', 'staff', "maria"]
author_list += ["tims", "arnold", "bakker", "despoina", "xanthopoulou", "chou", "tsai", 'chih', "wang", 'shiuan', "cheng"]
author_list += ["herman","huang","wing",'hardin','kragt','djohnston', 'billings']
author_list += ["fred", "walumbwa", "peng", "john", "schaubroeck", "bruce", "avolio", 'user', "philip", "yang"]
author_list += ["riepe", "katharina", "moser", "kerstin", "pull", "siri", "terjesen", "christian", "zehnder", "holger"]
author_list += ["herz", "jean", 'philippe', "bonardi", 'school', 'business', 'msmhh']
author_list += ['ambika', 'ambika', 'natalie', 'allen', 'tracy', 'hecht', 'neil', 'anderson', 'sarah', 'sleap', 'nikos']
author_list += ['bozionelos', 'adam', 'butler', 'amie', 'skattebo', 'john', 'cordery', 'kevin', 'daniels', 'claire', 'harris']
author_list += ['briner', 'taru', 'feldt', 'mika', 'kivimaumlki', 'anne', 'rantala', 'asko', 'tolvanen', 'donald', 'gardner']
author_list += ['linn', 'dyne', 'pierce', 'filip', 'lievens', 'frederik', 'anseel', 'adam', 'meade', 'james', 'meindl']
author_list += ['paul', 'paulus', 'karen', 'charlie', 'reeve', 'eric', 'heggestad', 'astrid', 'richardsen','zhao','zhou','jing']
author_list += ['monica', 'martinussen', 'anit', 'somech', 'anat', 'drach', 'zahavy', 'gigi', 'sutton', 'mark', 'griffin']
author_list += ['william', 'turnley', 'mark', 'bolino', 'scott', 'lester', 'james', 'bloodgood', 'joan', 'horn', 'toon', 'taris']
author_list += ['wilmar', 'schaufeli', 'paul', 'schreurs', 'fred', 'walumbwa', 'peng', 'wang', 'john', 'lawler']
author_list += ['michael', 'west', 'felix', 'brodbeck', 'andreas', 'richter', 'mark', 'wilson', 'david', 'dejoy']
author_list += ['robert', 'vandenberg', 'hettie', 'richardson', 'allison', 'mcgrath', 'ambika','david','micheal']
author_list += ['elena','belogolovsky','peter','bamberger','karlene','roberts','chris','bingham','campbell','seung','hwan','jeong','scott','graffin']

author_list += ['robert','messen','andrew','carton','brice','roberts','dattée','oliver','alexy','erkko','autio','sreedhari','desai','patricio']

author_list += ['duran','nadine','kammerlander','marc','essen','thomas','zellweger','john','joseph','wshong','gokhan','ertug','tamar','yogev']


author_list += ['yonghoon','lee','peter','hedström','kibler','ewald','markus','perkmann','brenda','flannery','douglas','michael','florin','lubatkin']


author_list += ['william','schulze','raghu','garud','sanjay','jain','arun','kumaraswamy','gerard','george','christopher','corbishley','jane','khayesi']


author_list += ['martine','haas','laszlo','tihanyi','wakenshaw','gibbons','deborah','brian','gunia','joo','hun','han','saehee','kang']



author_list += ['rebecca','kehoe','david','lepak','hausknecht','nathan','hiller','robert','vance','hitt','tina','dacin','edward','levita']



author_list += ['jean','arregle','luc','anca','borza','fmurray','philippe','jacquart','antonakis','jason','jay','fuller','elfenbeinh','cynthia']

author_list += ['kim','jensen','lorraine','crystal','hkpu','margaret','luciano','amy','bartels','lauren','innocenzo','travis','maynard','mathieu']


author_list += ['hardy','smagui','johanna','mair','ignasi','marti','marc','ventresca','dellring','jcarson','mmcardle','erez','mia','mathieu']


author_list += ['monin','niels','noorderhaven','eero','vaara','kroon','jeroen','neckebrouck','gina','dokko','lisa','hisae','nishii','sonja']


author_list += ['opper','donde','ashmos','plowman','lakami','baker','tammy','beck','mukta','kulkarni','stephanie','solansky','deandra','villarreal']

author_list += ['ridge','ingram','aaron','hill','bloy','drew','harry','dwight','lemke','richard','dino','staf','guillaume','soenen']

author_list += ['tessa','melkonian','maureen','ambrose','evans','bennett','tepper','nikolaos','dimotakis','schurer','lambert','joel','koopman','fadel']


author_list += ['matta','hee','man','park','wongun','goo','tepper','varkey','titus','owen','parker','francesca','gino','bass']


author_list += ['erin','charlie','trevor','home','computer','martins','martín','esmt','european','varkey','gmbh','cwbauman','school','francesca','gino','bass']

In [None]:
# Function to filter out the stopwords and authors names
def filterWords(msg, words):
    """Remove unwanted tokens from a blank-separated string.

    Parameters
    ----------
    msg : str
        Text to filter; it is split on single blanks.
    words : iterable of str
        Tokens to remove.

    Returns
    -------
    str
        ``msg`` without the tokens found in ``words``. A token that equals an
        unwanted word followed by "." (a sentence end) is replaced by "." so the
        sentence boundary is kept.
    """
    # Sets give constant-time membership tests; with lists every token of every
    # document was compared against the whole word list.
    banned = set(words)
    sentence_final = {word + '.' for word in banned}

    kept = []
    for token in msg.split(' '):
        if token in banned:
            continue
        kept.append('.' if token in sentence_final else token)
    return ' '.join(kept)

In [None]:
# Strip the author-name tokens from every cleaned document
data_proc_auth = []
for document in pdf_p:
    data_proc_auth.append(filterWords(document, author_list))

In [14]:
# Hand-built stop-word list: common English function words, month names, and tokens
# taken from the document metadata (subjects, author / publisher names).
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['january', 'february', 'march', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
# metadata- subject
# NOTE(review): filterWords compares single blank-separated tokens, so these two phrases
# can never match as written. They are left untouched because splitting them would also
# strip the content words "business", "administration", "economics" and "finance".
stopwords += ['business administration', 'economics and finance']
# author name to remove
author_phrases = ['karina nielsen','kevin daniels','elaine', 'jaewan yang','andrew O herdman','amanda', 'sabine','kerstin','kertin','sagepub','journalspermissions','catheine','chidiebere ogbonnaya','john wiley','sons ltd','martin','jeffrey B arthur']
stopwords += author_phrases
# The documents are lower-cased and filtered token by token, so a multi-word or
# mixed-case entry never equals a token. Add every word of those entries on its own,
# lower-cased, so the names are actually removed.
stopwords += [token
              for phrase in author_phrases if ' ' in phrase
              for token in phrase.lower().split()]

In [15]:
# Strip the hand-built stop words from every author-filtered document
data_proc = []
for document in data_proc_auth:
    data_proc.append(filterWords(document, stopwords))

In [None]:
# Get country name

In [None]:
import pycountry

# Lower-cased `name` of every entry in pycountry.countries
country_list = [entry.name.lower() for entry in pycountry.countries]

In [None]:
# Remove country names from every document, but only where they stand as whole words.
# Each name was previously used as a bare regex, so it also matched inside longer words
# ("oman" inside "woman", "niger" inside "nigeria"), and names containing "(", ")" or "."
# were interpreted as regex syntax instead of text.
# Longest names first, so a longer name wins over a shorter one that is its prefix.
_country_names = sorted({name for name in country_list if name}, key=lambda name: (-len(name), name))

if _country_names:
    # (?<!\w) / (?!\w) act as word boundaries that also work for names which start or
    # end with a non-word character.
    _country_pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(name) for name in _country_names) + r')(?!\w)'
    )
    data_proc_country = [_country_pattern.sub('', document) for document in data_proc]
else:
    data_proc_country = list(data_proc)

In [None]:
# Spot-check one document after country-name removal
data_proc_country[6]

In [None]:
 save removed title
pickle.dump(data_proc_country, open( "C:/Thesis/Data/save/Journal_of_organizational_behavior/metadata/removed_country/metadata_all.p", "wb" ) )

In [None]:
# Authors names are saved already (code is available at author name folder-get name list notebook)
#load authors names
# (pickle is already imported in the first cell)
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
with open("C:/Thesis/Data/save/Master_Data/auth_ref_filter_num_char/auth_Journal_of_organizational_behavior.p", "rb") as author_file:
    author_nam = pickle.load(author_file)

In [None]:
# GET REFERENCE LISTS OF STOPWORDS, I.E. WORDS THAT ARE ESSENTIALLY MEANINGLESS
# Get all the list of authors names

# Imported under an alias: a plain `import stopwords` would rebind the notebook's own
# hand-built `stopwords` list, and re-running the cell that filters with that list
# would then fail.
from nltk.corpus import stopwords as nltk_stopwords

# NOTE(review): `country_lower` and `cities_lower` are not defined in any cell visible in
# this notebook - they have to exist in the kernel before this cell is run.
sw = nltk_stopwords.words('english') + author_nam + country_lower + cities_lower

In [None]:
# Strip the combined stop-word list `sw` from every country-filtered document
data_proc_1 = []
for document in data_proc_country:
    data_proc_1.append(filterWords(document, sw))

In [16]:
# Final tidy-up, applied step by step to the whole corpus.
_final_steps = [
    (r' \.+', ' '),   # dots that follow a blank (raw string: "\." is not a valid str escape)
    (' +', ' '),
    ('http', ''),     # leftovers of URLs
    ('www', ''),
    ('iweb', ''),
    (' +', ' '),
]

data_proc_2 = data_proc_1
for _pattern, _replacement in _final_steps:
    data_proc_2 = [re.sub(_pattern, _replacement, document) for document in data_proc_2]


In [17]:
#data_proc

In [18]:
# save data
# The `with` block closes the file even if pickling fails, so the pickle is always flushed to disk.
with open("C:/Thesis/Data/save/Master_Data/MD_4/data_proc_latest/J_OF_OB_data_proc.p", "wb") as data_file:
    pickle.dump(data_proc_2, data_file)

In [19]:
#import io
# Write every processed document to one UTF-8 text file.
# The loop used to read `datadata_proc_2_proc`, a name that does not exist (NameError).
# NOTE(review): documents are written back-to-back without a separator, as the original
# loop was written - confirm whether one document per line is wanted.
with open("C:/Thesis/Data/save/Master_Data/MD_4/data_proc_latest/J_OF_OB_data_proc_txt.txt", "w",encoding="utf-8") as outfile:
    for document in data_proc_2:
        outfile.write(str(document))