In [37]:
import os
import re
import pandas as pd
import urllib
from nltk import SnowballStemmer
from gensim.models import Word2Vec
#import langdetect
import tika
import time
from tika import parser
import pickle

# Raw article PDF data
Convert the PDFs to text using an Apache Tika server.

In [38]:
# specify the target data folder

target_dir = 'C:/Thesis/Data/Academy_of_Management_journal'

In [39]:
# prep pdf extraction
# Collect the full paths of all PDF and Word documents in the target folder.
# The file name is lower-cased before the extension check, so mixed-case
# extensions such as ".Pdf" are picked up as well.
pdf_files = []
wrd_files = []

for f in os.listdir(target_dir):
    lowered_name = f.lower()
    if lowered_name.endswith(".pdf"):
        pdf_files.append(os.path.join(target_dir, f))
    if lowered_name.endswith((".doc", ".docx")):
        wrd_files.append(os.path.join(target_dir, f))

In [40]:
port = 4321 # port to use for Tika server (chosen arbitrarily)
server_str = 'http://localhost:' + str(port)
# Raw string: the backslashes of the Windows path must not be read as escapes.
# NOTE(review): the path has a space right after "Software\" - confirm that this
# is the real location of the jar and not a typo.
tika_path  = r'C:\Software\ tika-server-1.23.jar'
# "-jar" and the path need a separating space (it was missing), and the path is
# quoted so the printed command can be pasted into a shell as-is.
tika_run   = 'java -jar "' + tika_path + '" --port ' + str(port)
print('Command to start Tika:')
print(tika_run)
print('---')
print('Number of pdf files: ' + str(len(pdf_files)))
print('Number of word files: ' + str(len(wrd_files)))

Command to start Tika:
java -jarC:\Software\ tika-server-1.23.jar --port 4321
---
Number of pdf files: 1285
Number of word files: 0


In [64]:
# process the PDF and Word files, store their metadata in memory
# NOTE(review): presumably switches tika-python to client-only mode, i.e. it talks
# to the server at server_str instead of starting its own - confirm in the tika docs.
tika.TikaClientOnly = True 

# One parser call per document; only the "metadata" entry of each result is kept.
pdf_metadata = [parser.from_file(t, server_str, xmlContent=False)["metadata"] for t in (pdf_files + wrd_files)]

In [None]:
# save data metadata
# The context manager closes (and flushes) the file even if pickling fails.
with open("C:/Thesis/Data/save/Academy_of_MJ/metadata/metadata_academy.p", "wb") as metadata_file:
    pickle.dump(pdf_metadata, metadata_file)

# Text Data

Eliminate the title, selected sections (e.g. the content between Method and Discussion) and the references. The result is saved as pre-processed data.

In [50]:
# process the PDF and Word files, store their text in memory
# NOTE(review): presumably switches tika-python to client-only mode, i.e. it talks
# to the server at server_str instead of starting its own - confirm in the tika docs.
tika.TikaClientOnly = True 

# One parser call per document; only the "content" entry (the extracted text) is kept.
# NOTE(review): every file is sent to the parser here as well as in the metadata cell.
pdf_text = [parser.from_file(t, server_str, xmlContent=False)["content"] for t in (pdf_files + wrd_files)]

In [51]:
# save data
# The context manager closes (and flushes) the file even if pickling fails.
with open("C:/Thesis/Data/save/Academy_of_MJ/save_alltextData.p", "wb") as text_file:
    pickle.dump(pdf_text, text_file)

#  Load the Text data

In [1]:
import pickle
#load the text data
# NOTE: unpickling can execute arbitrary code - only load files written by this project.
# The context manager makes sure the file handle is closed after loading.
with open("C:/Thesis/Data/save/Academy_of_MJ/save_alltextData.p", "rb") as text_file:
    pdf_text = pickle.load(text_file)


In [2]:
len(pdf_text)

410

# Get country names

In [3]:
# Build the country stop list: three informal names first, followed by the
# lower-cased official name of every country known to pycountry.
import pycountry

country_list = ['america', 'vietnam', 'usa']
country_list.extend(entry.name.lower() for entry in pycountry.countries)

In [4]:
#Convert country list to lower case. it will be called in filter function
#country_lower = [x.lower() for x in country_list]
#print(country_list)

# Get city names (and additional countries)

In [6]:
from geotext import GeoText

# Gather every city and country name GeoText detects in any document;
# the set removes duplicates across documents.
detected_places = set()
for document in pdf_text:
    geo = GeoText(document)
    detected_places.update(geo.cities + geo.countries)

cities_list = list(detected_places)
# cities_list

In [7]:
# Lower-case the detected place names; the result is part of the stop-word list.
cities_lower = [place_name.lower() for place_name in cities_list]
print(len(cities_lower))


1202


#  Convert pdf text to lower case

In [8]:
# Lower-case every document; slice assignment updates the existing list in place.
pdf_text[:] = [document.lower() for document in pdf_text]

# Exclude PDFs that do not have the standard format
Check whether each PDF follows the standard format, i.e. whether it contains both a Method and a Discussion section.

In [9]:
import re

# A document counts as "standard" when it has both a (general) discussion
# heading and a method(s) heading, each alone on its line.
discussion_line = re.compile(r'\n(general )?discussion\n')
method_line = re.compile(r'\nmethods?\n')

pdf_included = []
pdf_excluded = []
for document in pdf_text:
    is_standard = discussion_line.search(document) and method_line.search(document)
    target = pdf_included if is_standard else pdf_excluded
    target.append(document)

len(pdf_included)

145

In [10]:
len(pdf_excluded)

265

In [11]:
# Within JM the acknowledgement section appears in one of two places:
#   1. before or after the keyword section (only a few pdfs)
#   2. directly before the reference section (most pdfs)
# The removal below targets the majority case (2).

# step-1 remove content after acknowledgements section( it will include appendix, reference section)
pdf_remove_step1 = [re.sub(r"(?is)\nacknowledgments\n.*", "", f) for f in pdf_included]

# step-2 if there is no acknowledgements section than start with appendix (it will include reference section.)
pdf_remove_step2 = [re.sub(r"(?is)\nappendix\n.*", "", f) for f in pdf_remove_step1]

# step-3 remove all from reference section
pdf_remove_step3 = [re.sub(r"(?is)\nreferences\n.*", "", f) for f in pdf_remove_step2]

# get the reference section: everything after the LAST "references" heading.
# A document without such a heading contributes an empty string; before, its
# whole text was returned and ordinary prose ended up being mined for author names.
references = []
for f in pdf_included:
    _, heading, tail = f.rpartition("\nreferences\n")
    references.append(tail if heading else "")

In [12]:
len(pdf_remove_step3)

145

In [14]:
# step-4: remove content between method and discussion section (it contains method and result section)
def remove_method_result(method, discussion, text):
    """Delete the span that starts at the method heading and ends at the discussion heading.

    Parameters
    ----------
    method : str
        Regular expression matching the heading that opens the span.
    discussion : str
        Regular expression matching the heading that closes the span.
    text : str
        Document text to clean.

    Returns
    -------
    str
        ``text`` without the matched span; both headings are removed as well.
        The match is greedy, so it extends to the last ``discussion`` match.
        ``text`` is returned unchanged when the pattern does not match.
    """
    import re

    # "[\s\S]*" matches any run of characters including newlines. It replaces
    # the much slower "(.|\n)*" and, unlike a DOTALL flag, leaves the meaning of
    # "." inside the caller's own patterns untouched.
    pattern = method + r'[\s\S]*' + discussion
    return re.sub(pattern, '', text)

In [15]:
import re
# Cut the method/results part out of every document that survived step 3.
method_start = '\nmethods?\n'
discussion_end = '\n(general )?discussion\n'
pdf_remove_step4 = [
    remove_method_result(method_start, discussion_end, document)
    for document in pdf_remove_step3
]

In [16]:
len(pdf_remove_step4)

145

# Pre-processing
Regular expressions are used to clean up the data.

In [17]:
# testing it works
ref_doc = 20
print(pdf_remove_step4[ref_doc])
print('--**************************************-')
print('-**************************************--')
print('-**************************************--')
# print(pdf_remove_step4[ref_doc])




















































from the head and the heart:
locating cognition- and affect-based trust in

managers’ professional networks

roy yong joo chua
paul ingram

michael w. morris
columbia university

this article investigates the configuration of cognition- and affect-based trust in man-
agers’ professional networks, examining how these two types of trust are associated
with relational content and structure. results indicate that cognition-based trust is
positively associated with economic resource, task advice, and career guidance ties,
whereas affect-based trust is positively associated with friendship and career guidance
ties but negatively associated with economic resource ties. the extent of embeddedness
in a network through positive ties increases affect-based trust, whereas that through
negative ties decreases cognition-based trust. these findings illuminate how trust
arises in networks and inform network research that invokes trust to explain mana-

In [18]:
# remove journal title here?
# NOTE(review): target_dir points at the Academy of Management Journal folder, but only
# the phrase "journal of management" is stripped - confirm whether that title should go too.
# The phrases are removed as plain text. Used as regular expressions, the dots in
# "et al." and "e.g." matched ANY character, so e.g. the "enga" in "engage" was deleted.
pdf_p = list(pdf_remove_step4)
for phrase in ('journal of management', 'et al.', 'e.g.', 'e-mail:'):
    pdf_p = [t.replace(phrase, '') for t in pdf_p]
# CHECK ORDER OF OPERATIONS BELOW:
#remove email address <-- IMPROVED, BUT E-MAIL SEEMS TO BE BROKEN UP, BUT STILL PRESENT IN DOCUMENTS AT TIMES         
pdf_p = [re.sub(r'[\w\.\-]+@[\w\.\-]+\.(?:com|net|org|live|edu)', "", t) for t in pdf_p]

In [19]:
# Strip explicit URLs (http://, https:// or www.).
pdf_p = [re.sub( r'https?://[^\s<>"]+|www\.[^\s<>"]+', "", t) for t in pdf_p]
# Strip bare domain names (optionally followed by a path) and leave a space behind.
# Raw string: the pattern contains regex escapes that are not valid string escapes.
# The \b after the top-level domain keeps short ones such as ".se", ".it" or ".no"
# from matching inside longer words ("results.section" no longer loses "results.se").
pdf_p = [re.sub(r"[\S\.\-\?\!\=]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)\b(/[\S\.\-\?\!\=]*)?\s?", " ", t) for t in pdf_p]

In [20]:
pdf_p[20]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfrom the head and the heart:\nlocating cognition- and affect-based trust in\n\nmanagers’ professional networks\n\nroy yong joo chua\npaul ingram\n\nmichael w. morris\ncolumbia university\n\nthis article investigates the configuration of cognition- and affect-based trust in man-\nagers’ professional networks, examining how these two types of trust are associated\nwith relational content and structure. results indicate that cognition-based trust is\npositively associated with economic resource, task advice, and career guidance ties,\nwhereas affect-based trust is positively associated with friendship and career guidance\nties but negatively associated with economic resource ties. the extent of embeddedness\nin a network through positive ties increases affect-based trust, whereas that through\nnegative ties decreases cognition-based trust. these findings illuminate how trust\narises in n

In [21]:
# Re-join words that were split with a hyphen at a line end ("man-\nagers" -> "managers").
# Hyphens elsewhere in a line are left alone.
pdf_p = [t.replace('-\n', '') for t in pdf_p]

In [22]:
#remove text within ()
pdf_p = [re.sub(r"\([^)]*\)", "", t) for t in pdf_p]
# remove text within [] eg citation
# The character class has to exclude "]" (it used to exclude ")"). Otherwise the
# greedy match runs from the first "[" to the last "]" it can reach and deletes
# all the prose in between.
pdf_p = [re.sub(r"\[[^\]]*\]", "", t) for t in pdf_p]
#Remove numbers
pdf_p = [re.sub('[0-9]+', '', f) for f in pdf_p]


In [23]:
# First turn every run of newline / carriage-return characters into one space,
# then blank out stray non-word characters.
line_breaks = re.compile('[\n\r]+')
stray_symbols = re.compile(r'[@/%#$,=\+:’“”]')
pdf_p = [stray_symbols.sub(' ', line_breaks.sub(' ', f)) for f in pdf_p]

In [24]:
#pdf_p = [re.sub(r"\s\([A-Z][a-z]+,\s[A-Z][a-z]?\.[^\)]*,\s\d{4}\)", "", f) for f in pdf_p] # is this supposed to filter out citations?

# save all periods, exclamation marks and question marks, but get rid of unnecessary characters
# Sentence ends and hyphens are parked behind upper-case placeholders while every
# other non-word character is blanked, and are restored afterwards. The text was
# lower-cased earlier, so the placeholders cannot collide with real words.
pdf_p = [re.sub(r'[\.!?]+ ', 'XYZXYZ', t) for t in pdf_p]
pdf_p = [re.sub('-', 'ZYXXYZ', t) for t in pdf_p]
pdf_p = [re.sub(r'[\W_]+', ' ', f) for f in pdf_p]
pdf_p = [re.sub('XYZXYZ', '. ', f) for f in pdf_p]
pdf_p = [re.sub('ZYXXYZ', '-', f) for f in pdf_p]

#Remove placeholder x repetitions
# Replaced by one space rather than by nothing, so the words on either side of
# the placeholder are not glued together.
pdf_p = [re.sub(' x+ ', ' ', f) for f in pdf_p]

# remove single characters
pdf_p = [re.sub(' [a-z] ' ,  ' ', f) for f in pdf_p]
pdf_p = [re.sub(r' [a-z]\.', '.', f) for f in pdf_p]
pdf_p = [re.sub(r' \.', '.', f) for f in pdf_p]

In [25]:
print(pdf_p[ref_doc])
print('---')
print('---')
print('---')
# print(pdf_p[ref_doc])

 from the head and the heart locating cognition- and affect-based trust in managers professional networks roy yong joo chua paul ingram michael. morris columbia university this article investigates the configuration of cognition- and affect-based trust in managers professional networks examining how these two types of trust are associated with relational content and structure. results indicate that cognition-based trust is positively associated with economic resource task advice and career guidance ties whereas affect-based trust is positively associated with friendship and career guidance ties but negatively associated with economic resource ties. the extent of embeddedness in network through positive ties increases affect-based trust whereas that through negative ties decreases cognition-based trust. these findings illuminate how trust arises in networks and inform network research that invokes trust to explain managerial outcomes. central insight of organizational research is that m

In [26]:
def get_author_names(txt):
    """Extract candidate author surnames from the text of a reference section.

    Parameters
    ----------
    txt : str
        Text of one reference section (lower-cased by the time it gets here).

    Returns
    -------
    list of str
        Name candidates in order of appearance; duplicates are kept.
    """
    # Drop everything from a four-digit number (typically the year) to the end
    # of that line, which removes titles and journal details.
    refs = re.sub(r'[0-9]{4}.*\n', '', txt)

    authors = []
    # A single-letter initial such as " j." separates consecutive surnames.
    for chunk in re.split(r' [a-z]\.', refs):
        # Chunks containing digits or running to 35+ characters are not names.
        if re.search('[0-9]', chunk) or len(chunk) >= 35:
            continue
        # Strip whitespace and punctuation in one pass.
        name = re.sub(r'[\n ,&.!(*);$]', '', chunk)
        if len(name) > 1:
            authors.append(name)

    return authors
#for n in names:
#    print(n)

#refs = re.sub('\n.*[0-9].*', '', refs)
#refs = re.sub('\n[^,]+\n', '\n', refs)
#print(refs)

In [27]:
# Pool the name candidates of every reference section, then drop duplicates.
authors = [name for ref_text in references for name in get_author_names(ref_text)]
author_nam = list(set(authors))

# result is still not perfect, but I have not found any stray 'plain English' so far
print(author_nam[100:250])
print(len(author_nam))

['atkinson', 'castros', 'k:clarendonpresswatson', 'brockbank', 'maggitti', 'woodhouse', 'santos', 'cisco:addison-wesleygrant', 'whennotifin', 'cjovanovichhunt', '-gjeong', 'stawl', 'zafonte', 'newyorkny:freepressmeyer', 'greer', 'youndt', 'day', 'lubatkin', 'nielsen', 'neuberg', 'donovan', 'york:mcgraw-hilltyran', 'waldrip', 'feldman', 'kmec', 'oh', '-pwry', 'lynch', 'westergaard-nielsen', 'kark', 'tansuhaj', 'hakel', 'severj', 'mackey', 'o’donoghue', 'atkinsondmcclelland', 'randall', 'yi', 'kizilos', 'cahill', 'pretz', 'destobbeleir', 'lane', 'luckmann', 'zeniuk', 'mcclelland', 'dulebohn', 'cooper', 'brett', 'badrinarayanan', 'baumgartner', 'jrreilly', 'whiting', 'karnøe', 'petrini', 'rush', 'siegrist', 'gentry', 'comingsin', 'radish', 'pearsall', 'jobber', 'grube', 'mitra', 'pfarrer', 'onebrainoxford', '-ksmith', 'kramers', 'bikson', 'powerin', 'butler', 'fisher', 'fourné', 'bezrukova', 'mwright', 'rebentisch', 'ko-tovsky', 'castellucci', 'icsnewyork:wileytreviño', 'nadelj', 'mehra'

In [28]:
# GET REFERENCE LISTS OF STOPWORDS, I.E. WORDS THAT ARE ESSENTIALLY MEANINGLESS
# Get all the list of authors names

from nltk.corpus import stopwords as wrd

# Start from NLTK's English stop words and append the extracted author names,
# the country names and the lower-cased place names built in earlier cells.
sw = wrd.words('english') + author_nam + country_list + cities_lower

#print(sw)

In [29]:
#include corpus-specific stopwords
# Publishing / journal boilerplate, month names and weekday names.
# Entries repeated across the lists (e.g. 'may', 'three') are harmless: the final
# set() call below removes duplicates.
sw += ['vol', 'doi', 'article', 'articles', 'reuse', 'sagepub', 'journal', 'journals', 'com', 'quarter', 'quarterly', 'annual', 'proceedings']
sw += ['sage', 'editorial', 'report', 'publications', 'publication', 'publisher', 'jom', 'manuscript', 'manuscripts', 'submission', 'submissions']
sw += ['editor', 'editors', 'orcid', 'id', 'month', 'mailto', 'author', 'authors', 'homepage', 'sciencedirect', 'scopus']
sw += ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
sw += ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

# add a metric ton of generic terms
# NOTE(review): this list also contains content words such as 'computer', 'system',
# 'interest' and 'report' - confirm they should be dropped from a management corpus.
sw += ['a', 'about', 'above', 'across', 'after', 'afterwards']
sw += ['again', 'against', 'all', 'almost', 'alone', 'along']
sw += ['already', 'also', 'although', 'always', 'am', 'among']
sw += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
sw += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
sw += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
sw += ['because', 'become', 'becomes', 'becoming', 'been']
sw += ['before', 'beforehand', 'behind', 'being', 'below']
sw += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
sw += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
sw += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
sw += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
sw += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
sw += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
sw += ['every', 'everyone', 'everything', 'everywhere', 'except']
sw += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
sw += ['five', 'for', 'former', 'formerly', 'forty', 'found']
sw += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
sw += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
sw += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
sw += ['herself', 'him', 'himself', 'his', 'how', 'however']
sw += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
sw += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
sw += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
sw += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
sw += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
sw += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
sw += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
sw += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
sw += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
sw += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
sw += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
sw += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
sw += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
sw += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
sw += ['some', 'somehow', 'someone', 'something', 'sometime']
sw += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
sw += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
sw += ['then', 'thence', 'there', 'thereafter', 'thereby']
sw += ['therefore', 'therein', 'thereupon', 'these', 'they']
sw += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
sw += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
sw += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
sw += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
sw += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
sw += ['whatever', 'when', 'whence', 'whenever', 'where']
sw += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
sw += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
sw += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
sw += ['within', 'without', 'would', 'yet', 'you', 'your']
sw += ['yours', 'yourself', 'yourselves']

# De-duplicate; the resulting list order is arbitrary.
sw = list(set(sw))

In [30]:
# Function to filter out the stopwords and authors names
def filterWords(msg, words):
    """Remove from ``msg`` every token that appears in ``words``.

    ``msg`` is split on single spaces. A token equal to an entry of ``words``
    is dropped. A token equal to an entry followed by a period (an entry that
    ends a sentence) is replaced by a bare "." so the sentence boundary is kept.

    Parameters
    ----------
    msg : str
        Text whose tokens are separated by single spaces.
    words : iterable
        Entries to remove. Non-string entries are converted with ``str`` for
        the trailing-period check instead of raising a TypeError.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    entries = list(words)
    try:
        # Set membership is constant-time; scanning a list of thousands of
        # stop words for every token of every document is what made this slow.
        blocked = set(entries)
    except TypeError:
        # Unhashable entries: fall back to scanning the list.
        blocked = entries
    blocked_at_sentence_end = {str(w) + '.' for w in entries}

    kept = [
        '.' if token in blocked_at_sentence_end else token
        for token in msg.split(' ')
        if token not in blocked
    ]
    return ' '.join(kept)

In [31]:
# GET RID OF THE STOPWORDS IN TEXTS
# Each cleaned document is filtered against the combined stop-word list `sw`.
data_proc = [filterWords(f, sw) for f in pdf_p]  

In [32]:
import pickle
# exclude author list from metadata
# NOTE: unpickling can execute arbitrary code - only load files written by this project.
# The context manager makes sure the file handle is closed after loading.
with open("C:/Thesis/Data/save/Master_Data/author_list/author_list.p", "rb") as author_file:
    author_metadata = pickle.load(author_file)

In [33]:
# GET RID OF THE author names (extracted from the metadata)
# Second filtering pass, this time against the author list loaded from the pickle file.
data_proc_1 = [filterWords(f, author_metadata) for f in data_proc] #running

In [34]:
data_proc_1[ref_doc]

' head heart locating cognition- affect-based trust managers professional networks . investigates configuration cognition- affect-based trust managers professional networks examining types trust associated relational content structure. results indicate cognition-based trust positively associated economic resource task advice career guidance ties affect-based trust positively associated friendship career guidance ties negatively associated economic resource ties. extent embeddedness network positive ties increases affect-based trust negative ties decreases cognition-based trust. findings illuminate trust arises networks inform network invokes trust explain managerial outcomes. insight organizational managers rely networks relations resources information support needed career success. businesses evolve flatter structures fluid boundaries professional networks managers develop important. trust willingness make oneself vulnerable person despite uncertainty regarding motives prospective act

In [35]:
# Final tidy-up. The steps run over all documents one after another, in this order.
cleanup_steps = [
    (r"([a-z]+\.)+[a-z]+", ""),        # dotted tokens such as "foo.bar"
    (r'\s([?\.!"](?:\s|$))', r'\1'),   # whitespace in front of closing punctuation
    (r"^\s+", ""),                     # leading whitespace
    (r"\s+\Z", ""),                    # trailing whitespace
    (r"(\.)+", "."),                   # runs of periods
    (r" +", " "),                      # runs of spaces
    (r"( +\.)", "."),                  # spaces in front of a period
    (r"\.$", ""),                      # period at the very end
]

data_proc2 = data_proc_1
for pattern, replacement in cleanup_steps:
    data_proc2 = [re.sub(pattern, replacement, f) for f in data_proc2]

In [36]:
print(data_proc2[ref_doc].split('. '))

# Flatten all documents into one list of sentences (split on ". ").
sents = [sentence for d in data_proc2 for sentence in d.split('. ')]

['head heart locating cognition- affect-based trust managers professional networks', 'investigates configuration cognition- affect-based trust managers professional networks examining types trust associated relational content structure', 'results indicate cognition-based trust positively associated economic resource task advice career guidance ties affect-based trust positively associated friendship career guidance ties negatively associated economic resource ties', 'extent embeddedness network positive ties increases affect-based trust negative ties decreases cognition-based trust', 'findings illuminate trust arises networks inform network invokes trust explain managerial outcomes', 'insight organizational managers rely networks relations resources information support needed career success', 'businesses evolve flatter structures fluid boundaries professional networks managers develop important', 'trust willingness make oneself vulnerable person despite uncertainty regarding motives pr

In [37]:
print(sents[0:100])

['academy..', 'experts liabilities domain experts boards organizational failure juan almandoz iese andra tilcsik presence domain experts corporate board directors primary professional experience focal firm industry affect organizational outcomes', 'argue conditions significant decision uncertainty higher proportion domain experts board detract effective decision making increase probability organizational failure', 'building exploratory interviews board members ceos derive hypotheses argument context local united states', 'predict theater level decision uncertainty rapid asset growth operation predictable markets stronger relationship proportion banking expert directors probability bank failure', 'longitudinal analyses support prediction accounting overall level professional diversity directors different propensities expert-heavy board', 'discuss implications dimensions board composition conditions professional background directors consequential mechanisms board composition affects orga

In [38]:
# save data
# The context manager closes (and flushes) the file even if pickling fails.
with open("C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed.p", "wb") as preprocessed_file:
    pickle.dump(data_proc2, preprocessed_file)

In [39]:
import io

# Write one element of `sents` per line to form a larger data set
# (allowing use of a generator for model training).
# Mode "a" appends: running this cell again adds the same lines a second time.
with open("C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed_txt.txt", "a",encoding="utf-8") as outfile:
    outfile.writelines(s + '\n' for s in sents)
        
# (additionally you could keep document boundaries intact, 
# e.g. by writing a file per document or marking start and end of articles in the larger file)

In [None]:
#import io
#with open("C:/Thesis/Data/save/Master_Data/MD_3/data_proc_latest/JM_data_proc_txt.txt", "w",encoding="utf-8") as outfile:
#    for i in range(len(data_proc)):
#        outstring = ""
#        outstring += str(data_proc[i])
#        outfile.write(outstring)

# Remove exclude word list from the pre-processed data

In [None]:
import pickle
#load the text data
# NOTE: unpickling can execute arbitrary code - only load files written by this project.
# The context manager makes sure the file handle is closed after loading.
with open("C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed.p", "rb") as preprocessed_file:
    data_pp = pickle.load(preprocessed_file)


In [12]:
len(data_pp)

527

In [4]:
import pickle
#load the exclude word list
# NOTE: unpickling can execute arbitrary code - only load files written by this project.
# The context manager makes sure the file handle is closed after loading.
with open("C:/Users/ICTO-EB/Google Drive/myThesis/evaluation_testdataSet/exclude_word_list.p", "rb") as exclude_file:
    exclude_list = pickle.load(exclude_file)

In [7]:
type(exclude_list)

list

In [8]:
def filterWords(msg, words):
    """Remove from ``msg`` every token that appears in ``words``.

    ``msg`` is split on single spaces. A token equal to an entry of ``words``
    is dropped. A token equal to ``str(entry) + '.'`` (an entry that ends a
    sentence) is replaced by a bare "." so the sentence boundary is kept.

    Parameters
    ----------
    msg : str
        Text whose tokens are separated by single spaces.
    words : iterable
        Entries to remove; they need not be strings.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    entries = list(words)
    try:
        # Set membership is constant-time; scanning the whole list for every
        # token of every document is what made this slow.
        blocked = set(entries)
    except TypeError:
        # Unhashable entries: fall back to scanning the list.
        blocked = entries
    blocked_at_sentence_end = {str(w) + '.' for w in entries}

    kept = [
        '.' if token in blocked_at_sentence_end else token
        for token in msg.split(' ')
        if token not in blocked
    ]
    return ' '.join(kept)

In [9]:
# GET RID OF THE excluded word list 
# Every pre-processed document is filtered against the loaded exclude list.
data_excluded = [filterWords(f, exclude_list) for f in data_pp] #running

In [13]:
len(data_excluded)

527

In [15]:
# save data
# The context manager closes (and flushes) the file even if pickling fails.
with open("C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed_latest.p", "wb") as latest_file:
    pickle.dump(data_excluded, latest_file)

In [17]:
# Split every document into sentences on ". ", the same separator used for the
# unfiltered data. Splitting on a single space produced one WORD per element,
# although the export that consumes `sents` is documented as one sentence per line.
sents = []
for d in data_excluded:
    sents += d.split('. ')

In [21]:
import io

# Write one element of `sents` per line to form a larger data set
# (allowing use of a generator for model training).
# Mode "a" appends: lines already in the file are kept, and running this cell
# again adds the same lines a second time.
with open("C:/Thesis/Data/save/Master_Data/pre_processed_data/data_preprocessed_txt.txt", "a",encoding="utf-8") as outfile:
    outfile.writelines(s + '\n' for s in sents)
        