In [1]:
# import libraries
import os
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

import re
import nltk
%matplotlib inline

In [2]:
# Preprocessing 
def stopwordremoval(text):
    return [word for word in re.split('\s',text) if word.lower() not in stopwords.words('english')]

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in re.split('\s',text)]

def stemmer(text):
    porter_stemmer = PorterStemmer()
    return [porter_stemmer.stem(token) for token in word_tokenize(text)]

def removepunc(s):
    return re.sub(r'[^\w\s]','',s)

def joinlist(listoftext):
    return ' '.join(listoftext)

def tokenindex(text, token):
    tokenized = word_tokenize(text)
    for i in range(len(tokenized)):
        if tokenized[i] == token:
            return i

In [None]:
# Read Training Files
ceo = pd.read_csv('ceo.csv',encoding = 'cp1252',names = ['column 1','column 2'])
companies = pd.read_csv('companies.csv',names = ['text'])
percentages = pd.read_csv('percentage.csv',encoding = 'cp1252',names = ['text'])

In [None]:
# Concatenate ceo names into one column
ceo['column 2'].fillna('blank',inplace= True)
ceo['column 1'].fillna('blank',inplace= True)
ceo['text'] = None
for i in range(len(ceo)):        
    ceo['text'][i] =  ceo['column 1'][i] + ' ' + ceo['column 2'][i]
for i in range(len(ceo)):
    if ceo['column 2'][i] == 'blank':
        ceo['text'][i] = ceo['column 1'][i]
for i in range(len(ceo)):
    if ceo['column 1'][i] == 'blank':
        ceo['text'][i] = ceo['column 2'][i]

ceo.drop(['column 1','column 2'],axis = 1, inplace = True)

In [None]:
# Lemmatize Company names, and identify companies that have names changed dued to lemanitization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
companies['lemma'] = None
for i in range(len(companies)):
    companies['lemma'][i]=[lemmatizer.lemmatize(token) for token in re.split('\s',companies['text'][i])]
    companies['lemma'][i] = ' '.join(companies['lemma'][i])
companies[companies['text']!=companies['lemma']]

In [None]:
# Lemmatize ceo names, and identify ceo that have names changed dued to lemanitization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
ceo['lemma'] = None
for i in range(len(ceo)):
    ceo['lemma'][i]=[lemmatizer.lemmatize(token) for token in re.split('\s',ceo['text'][i])]
    ceo['lemma'][i] = ' '.join(ceo['lemma'][i])
ceo[ceo['text']!=ceo['lemma']]

In [242]:
#Read Articles into list of articles
articles = []

for file in os.listdir('/Users/eric/Documents/Northwestern University/McCormick/2018-2019:2/IEMS 308/Assignment 3/2013'):
    articles.append(open(os.path.join('/Users/eric/Documents/Northwestern University/McCormick/2018-2019:2/IEMS 308/Assignment 3/2013',file),'rb').read().decode('utf-8','ignore'))
for file in os.listdir('/Users/eric/Documents/Northwestern University/McCormick/2018-2019:2/IEMS 308/Assignment 3/2014'):
    articles.append(open(os.path.join('/Users/eric/Documents/Northwestern University/McCormick/2018-2019:2/IEMS 308/Assignment 3/2014',file),'rb').read().decode('utf-8','ignore'))
    

In [244]:
# Sent_tokenize: Break Text into sentences
sentences = [sent_tokenize(article) for article in articles]
# Tokenize sentences
tokenized_sentences = []
complete_sentences = []
count = 0
for i in range(len(sentences)):
    for j in range(len(sentences[i])):
        tokenized_sentences.append(re.split('\s',sentences[i][j]))
        complete_sentences.append(sentences[i][j])
df_CS = pd.DataFrame(complete_sentences,columns=['text'])

In [None]:
# Build Training data with 150000 sample sentences
df_analyze = df_CS.head(150000)

In [None]:
# Remove Stopwords and Lemmatize Documents or Stem Documents
df_analyze['RemovedStopwords'] = df_analyze['text'].apply(stopwordremoval).apply(joinlist)
df_analyze['Lemmatized'] = df_analyze['RemovedStopwords'].apply(lemmatization).apply(joinlist)

In [None]:
# Create Copies for use in CEO and company name extraction
df_analyze_ceo = df_analyze.copy(deep=True)
df_analyze_companies = df_analyze.copy(deep=True)

# For CEO

In [None]:
# Find all sentences with CEO that have name length >1 (Mark Zuckerberg and not Zuckerberg)
df_analyze_ceo['ceo_labels'] = 0
df_analyze_ceo['names'] = None
for i in range(len(ceo)):
    pat = re.compile(r'\b' + ceo['text'][i]+ r'\b')
    for j in range(len(df_analyze_ceo)):
        match = re.search(pat,df_analyze_ceo['text'][j])
        if (match != None):
                if ((len(word_tokenize(ceo['text'][i])) > 1)):
                    df_analyze_ceo.iloc[j, df_analyze_ceo.columns.get_loc('ceo_labels')] = 1
                    df_analyze_ceo.iloc[j, df_analyze_ceo.columns.get_loc('names')]  = match.group()            

In [None]:
df_analyze_ceo_ML_positive = df_analyze_ceo[df_analyze_ceo['ceo_labels']==1]
df_analyze_ceo_ML_positive.drop(['text','RemovedStopwords'],axis = 1, inplace=True)

In [None]:
#### Define function that find names from the entire corpus

import nltk
from nameparser.parser import HumanName

def get_human_names(text):
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary = False)
    person_list = []
    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1: #avoid grabbing lone surnames
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
            name = ''
        person = []

    return (person_list)

In [None]:
# No need to run this. NAME Extraction: saved in export_list
names = []
for i in len(articles):
    names.append(get_person_names(articles[i]))
names = list(dict.fromkeys(names))
# Covert list of list into flat list
flat_list_names = [item for sublist in names for item in sublist]
# Remove Duplicates
flat_list_names = list(dict.fromkeys(flat_list_names))
# Remove ceo names
not_ceo_names = [x for x in flat_list_names if x not in ceo['text'].tolist()]
# Export list for manipulation in excel
export_list = pd.DataFrame(not_ceo_names,columns=['text'])
export_list.to_csv('export_list.csv')

In [None]:
##### Start here
# Filtered to names that are athletes, politicians, and economists
non_ceo = pd.read_csv('non_ceo.csv')
non_ceo = non_ceo[non_ceo['yes']==1]
non_ceo.drop(['yes'],axis = 1,inplace = True)
# Construct Negative Samples for CEOs
df_analyze_ceo_negative = df_analyze.copy(deep=True)
non_ceo.reset_index(inplace=True)

In [None]:
# Find all sentences with nonCEO that have name length >1
df_analyze_ceo_negative['ceo_labels'] = 1
df_analyze_ceo_negative['names'] = None
for i in range(len(non_ceo)):
    pat = re.compile(r'\b' + non_ceo['text'][i]+ r'\b')
    for j in range(len(df_analyze_ceo_negative)):
        match = re.search(pat,df_analyze_ceo_negative['text'][j])
        if (match != None):
                if ((len(word_tokenize(non_ceo['text'][i])) > 1)):
                    df_analyze_ceo_negative.iloc[j, df_analyze_ceo_negative.columns.get_loc('ceo_labels')] = 0
                    df_analyze_ceo_negative.iloc[j, df_analyze_ceo_negative.columns.get_loc('names')]  = match.group()

In [None]:
# Create Negative Sample Dataframe -> Combine Negative and Positive samples -> Remove punctuations
df_analyze_ceo_ML_negative = df_analyze_ceo_negative[df_analyze_ceo_negative['ceo_labels']==0]
df_analyze_ceo_ML_negative.drop(['text','RemovedStopwords'],axis = 1, inplace=True)
df_analyze_ceo_ML  = pd.concat([df_analyze_ceo_ML_negative,df_analyze_ceo_ML_positive])
df_analyze_ceo_ML.reset_index(inplace=True,drop= True)
df_analyze_ceo_ML['Lemmatized'] = df_analyze_ceo_ML['Lemmatized'].apply(lambda x: removepunc(x))
df_analyze_ceo_ML['names'] = df_analyze_ceo_ML['names'].apply(lambda x: removepunc(x))

In [None]:
def grab_word_before(text,name):
    sentence = word_tokenize(text)
    indexfirstname = tokenindex(text,word_tokenize(name)[0])
    indexlastname = tokenindex(text,word_tokenize(name)[len(word_tokenize(name))-1])
    if indexlastname == len(word_tokenize(text)) - 1 and indexfirstname!= None: #If last name is in end of sentence
        return sentence[indexfirstname - 1]
    elif(indexfirstname != None and indexlastname!= None): #If name is in middle of sentence
        return sentence[indexfirstname - 1]
    else:
        return 'NULL'
def grab_word_after(text,name):
    sentence = word_tokenize(text)
    indexfirstname = tokenindex(text,word_tokenize(name)[0])
    indexlastname = tokenindex(text,word_tokenize(name)[len(word_tokenize(name))-1])
    if indexlastname == (len(sentence) - 1) and indexfirstname != None: #If last name is in end of sentence
        return 'NULL'
    elif indexfirstname == 0 and indexlastname != None and indexlastname != 1: #if first name is in beginning of sentence and there are more to the sentence
        return sentence[indexlastname+1]
    elif(indexfirstname != None and indexlastname!= None): # If name is in middle of sentence
        return sentence[indexlastname+1]

In [None]:
# Extract words before and after the names
df_analyze_ceo_ML['word_before'] =df_analyze_ceo_ML.apply(lambda x : grab_word_before(x['Lemmatized'],x['names']),axis=1)
df_analyze_ceo_ML['word_after'] =df_analyze_ceo_ML.apply(lambda x : grab_word_after(x['Lemmatized'],x['names']),axis=1)

In [None]:
# Throw out unanalyzable data
df_analyze_ceo_ML_feed = df_analyze_ceo_ML[(df_analyze_ceo_ML['word_after']!='NULL')]
df_analyze_ceo_ML_feed.reset_index(drop = True,inplace = True)

In [None]:
# Extract Name Features
df_analyze_ceo_ML_feed['Capitalized'] = False
df_analyze_ceo_ML_feed['lengthofname'] = 0
df_analyze_ceo_ML_feed['lengthoftoken'] = 0
df_analyze_ceo_ML_feed['nameinbeg'] = False
df_analyze_ceo_ML_feed['nameinend'] = False

df_analyze_ceo_ML_feed['Capitalized'] = df_analyze_ceo_ML_feed['names'].apply(lambda x: x.istitle())
df_analyze_ceo_ML_feed['lengthofname'] = df_analyze_ceo_ML_feed['names'].apply(lambda x: len(x))
df_analyze_ceo_ML_feed['lengthoftoken'] = df_analyze_ceo_ML_feed['names'].apply(lambda x: len(word_tokenize(x)))
df_analyze_ceo_ML_feed['nameinbeg'] = df_analyze_ceo_ML_feed.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[0])==0,axis = 1)
df_analyze_ceo_ML_feed['nameinend'] = df_analyze_ceo_ML_feed.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[len(word_tokenize(x['names']))-1]) == len(word_tokenize(x['Lemmatized']))-1,axis = 1)

In [None]:
# Extract Before and After Word Features
df_analyze_ceo_ML_feed['beforewordlength'] = 0
df_analyze_ceo_ML_feed['beforewordcapitalized'] = False
df_analyze_ceo_ML_feed['beforewordcontainnumbers'] = False
df_analyze_ceo_ML_feed['beforewordcontainceoindicator'] = False
df_analyze_ceo_ML_feed['afterwordlength'] = 0
df_analyze_ceo_ML_feed['afterwordcapitalized'] = False
df_analyze_ceo_ML_feed['afterwordcontainnumbers'] =False
df_analyze_ceo_ML_feed['afterwordcontainceoindicator'] = False
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
def hasceoindicator(inputString):
    ceoindicators = ['ceo','chair','chairman','chairwoman','executive','investor','founder','chief']
    if inputString.lower() in ceoindicators:
        return True
    else:
        return False
df_analyze_ceo_ML_feed['beforewordlength'] = df_analyze_ceo_ML_feed['word_before'].apply(lambda x: len(x))
df_analyze_ceo_ML_feed['beforewordcapitalized'] = df_analyze_ceo_ML_feed['word_before'].apply(lambda x: x.istitle())
df_analyze_ceo_ML_feed['beforewordcontainnumbers'] = df_analyze_ceo_ML_feed['word_before'].apply(lambda x: hasNumbers(x))
df_analyze_ceo_ML_feed['beforewordcontainceoindicator'] = df_analyze_ceo_ML_feed['word_before'].apply(lambda x: hasceoindicator(x))
df_analyze_ceo_ML_feed['afterwordlength'] = df_analyze_ceo_ML_feed['word_after'].apply(lambda x: len(x))
df_analyze_ceo_ML_feed['afterwordcapitalized'] = df_analyze_ceo_ML_feed['word_after'].apply(lambda x: x.istitle())
df_analyze_ceo_ML_feed['afterwordcontainnumbers'] =df_analyze_ceo_ML_feed['word_after'].apply(lambda x: hasNumbers(x))
df_analyze_ceo_ML_feed['afterwordcontainceoindicator'] = df_analyze_ceo_ML_feed['word_after'].apply(lambda x: hasceoindicator(x))

In [None]:
# Extract Sentence Feature
def havebusinesswords(inputString):
    businesswordindicators = ['yoy','growth','strategy','stock','profit','loss','company','Corporation']
    words_re = re.compile("|".join(businesswordindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
def sentencehasceoindicator(inputString):
    ceoindicators = ['ceo','executive','investor','founder']
    words_re = re.compile("|".join(ceoindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
df_analyze_ceo_ML_feed['senetencecontainceo'] = df_analyze_ceo_ML_feed['Lemmatized'].apply(lambda x: sentencehasceoindicator(x))
df_analyze_ceo_ML_feed['senetencecontainbusinesswords'] = df_analyze_ceo_ML_feed['Lemmatized'].apply(lambda x: havebusinesswords(x))


In [None]:
# Drop data that are not analyzable
df_analyze_ceo_ML_feed = df_analyze_ceo_ML_feed[df_analyze_ceo_ML_feed['word_after'].isnull() == False]
df_analyze_ceo_ML_feed.reset_index(inplace= True, drop= True)

In [None]:
# Stem words before and after
df_analyze_ceo_ML_feed['word_before'] = df_analyze_ceo_ML_feed['word_before'].apply(stemmer).apply(joinlist)
df_analyze_ceo_ML_feed['word_after'] = df_analyze_ceo_ML_feed['word_after'].apply(stemmer).apply(joinlist)

In [None]:
# feed into machine learning algorithm
from sklearn.model_selection import train_test_split
X = df_analyze_ceo_ML_feed
#X = df_analyze_ceo_ML_feed.drop(['Lemmatized','ceo_labels','names','word_before','word_after'],axis = 1)
y = df_analyze_ceo_ML_feed['ceo_labels']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)

In [None]:
#Onehotencode before and after words for training dataset
X_train.reset_index(inplace = True, drop = True)
y_train.reset_index(inplace=True,drop=True)

from sklearn.preprocessing import OneHotEncoder
encbefore = OneHotEncoder(categories ='auto',handle_unknown='ignore')
beforeencode = encbefore.fit_transform(X_train.word_before.values.reshape(-1,1)).toarray()
dfOneHotBefore = pd.DataFrame(beforeencode,columns = ['before_'+str(int(i)) for i in range (beforeencode.shape[1])])
dfOneHotBefore.reset_index(drop=True,inplace=True)
X_train = pd.concat([X_train,dfOneHotBefore],axis = 1)

encafter = OneHotEncoder(categories='auto',handle_unknown='ignore')
afterencode = encafter.fit_transform(X_train.word_after.values.reshape(-1,1)).toarray()
dfOneHotAfter = pd.DataFrame(afterencode,columns = ['after_'+str(int(i)) for i in range (afterencode.shape[1])])
dfOneHotAfter.reset_index(drop=True,inplace=True)
X_train = pd.concat([X_train,dfOneHotAfter],axis = 1)

X_train = X_train.drop(['Lemmatized','ceo_labels','names','word_before','word_after'],axis = 1)

In [None]:
clf = MultinomialNB()
clf.fit(X_train,y_train)

In [None]:
#Onehotencode before and after words for testing dataset
X_test.reset_index(inplace = True, drop = True)
y_test.reset_index(inplace=True,drop=True)

from sklearn.preprocessing import OneHotEncoder
beforeencode = encbefore.transform(X_test.word_before.values.reshape(-1,1)).toarray()
dfOneHotBefore = pd.DataFrame(beforeencode,columns = ['before_'+str(int(i)) for i in range (beforeencode.shape[1])])
dfOneHotBefore.reset_index(drop=True,inplace=True)
X_test = pd.concat([X_test,dfOneHotBefore],axis = 1)

afterencode = encafter.transform(X_test.word_after.values.reshape(-1,1)).toarray()
dfOneHotAfter = pd.DataFrame(afterencode,columns = ['after_'+str(int(i)) for i in range (afterencode.shape[1])])
dfOneHotAfter.reset_index(drop=True,inplace=True)
X_test = pd.concat([X_test,dfOneHotAfter],axis = 1)

X_test = X_test.drop(['Lemmatized','ceo_labels','names','word_before','word_after'],axis = 1)

In [None]:
# # Use Naive Bayes
# Training score
y_train_pred = clf.predict(X_train)
print("Training accuracy: {}".format(accuracy_score(y_train,y_train_pred)))

# Testing score
y_test_pred = clf.predict(X_test)
print("Test accuracy: {}".format(accuracy_score(y_test,y_test_pred)))

In [None]:
# Test with Logistic Regression
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(class_weight='balanced')
logmodel.fit(X_train,y_train)
log_predictions = logmodel.predict(X_test)
log_train_predictions = clf.predict(X_train)
print("Training accuracy: {}".format(accuracy_score(y_train,log_train_predictions)))
print('Logistic Reg accuracy: {}'.format(accuracy_score(y_test,log_predictions)))


In [None]:
# Get all possible names from complete sentences extracted from corpus (super long to run)
df_CS['names'] = df_CS['text'].apply(lambda x: get_human_names(x))

In [None]:
# Import data ran above
df_CS = pd.read_csv('/Users/eric/Documents/Northwestern University/McCormick/2018-2019:2/IEMS 308/Assignment 3/ALLPOSSIBLENAMES.csv',lineterminator='\n',index_col=0)

In [None]:
# Extract all sentence - name pairs
s = df_CS[df_CS['names'].map(lambda d: len(d)) > 0]
s.reset_index(drop = True,inplace =True)
res = s.set_index(['text'])['names'].apply(pd.Series).stack()
res = res.reset_index()
res.drop(['level_1'],axis = 1,inplace = True)
res.columns = ['text','names']

In [None]:
# Remove Stopwords and Lemmatize Documents and remove punctuation
res['RemovedStopwords'] = res['text'].apply(stopwordremoval).apply(joinlist)
res['Lemmatized'] = res['RemovedStopwords'].apply(lemmatization).apply(joinlist)
res['Lemmatized'] = res['Lemmatized'].apply(lambda x: removepunc(x))
res['names'] = res['names'].apply(lambda x: removepunc(x))

In [None]:
# Grab words before and after the names
res['word_before'] = res.apply(lambda x : grab_word_before(x['Lemmatized'],x['names']),axis=1)
res['word_after'] = res.apply(lambda x : grab_word_after(x['Lemmatized'],x['names']),axis=1)

In [None]:
# Make copy of last dataframe, reimport into df_CS_feed for manipulation
df_CS_feed = res.copy(deep=True)

In [None]:
# Extract Valid samples for analysis
df_CS_feed = df_CS_feed[df_CS_feed['word_after'].isnull() == False]
df_CS_feed.reset_index(drop = True,inplace = True)

In [None]:
# Extract Name Features
df_CS_feed['Capitalized'] = False
df_CS_feed['lengthofname'] = 0
df_CS_feed['lengthoftoken'] = 0
df_CS_feed['nameinbeg'] = False
df_CS_feed['nameinend'] = False

df_CS_feed['Capitalized'] = df_CS_feed['names'].apply(lambda x: x.istitle())
df_CS_feed['lengthofname'] = df_CS_feed['names'].apply(lambda x: len(x))
df_CS_feed['lengthoftoken'] = df_CS_feed['names'].apply(lambda x: len(word_tokenize(x)))
df_CS_feed['nameinbeg'] = df_CS_feed.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[0])==0,axis = 1)
df_CS_feed['nameinend'] = df_CS_feed.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[len(word_tokenize(x['names']))-1]) == len(word_tokenize(x['Lemmatized']))-1,axis = 1)

# Extract Before and After Word Features
df_CS_feed['beforewordlength'] = 0
df_CS_feed['beforewordcapitalized'] = False
df_CS_feed['beforewordcontainnumbers'] = False
df_CS_feed['beforewordcontainceoindicator'] = False
df_CS_feed['afterwordlength'] = 0
df_CS_feed['afterwordcapitalized'] = False
df_CS_feed['afterwordcontainnumbers'] =False
df_CS_feed['afterwordcontainceoindicator'] = False
df_CS_feed['beforewordlength'] = df_CS_feed['word_before'].apply(lambda x: len(x))
df_CS_feed['beforewordcapitalized'] = df_CS_feed['word_before'].apply(lambda x: x.istitle())
df_CS_feed['beforewordcontainnumbers'] = df_CS_feed['word_before'].apply(lambda x: hasNumbers(x))
df_CS_feed['beforewordcontainceoindicator'] = df_CS_feed['word_before'].apply(lambda x: hasceoindicator(x))
df_CS_feed['afterwordlength'] = df_CS_feed['word_after'].apply(lambda x: len(x))
df_CS_feed['afterwordcapitalized'] = df_CS_feed['word_after'].apply(lambda x: x.istitle())
df_CS_feed['afterwordcontainnumbers'] =df_CS_feed['word_after'].apply(lambda x: hasNumbers(x))
df_CS_feed['afterwordcontainceoindicator'] = df_CS_feed['word_after'].apply(lambda x: hasceoindicator(x))

In [None]:
# Extract Sentence Feature
def havebusinesswords(inputString):
    businesswordindicators = ['yoy','growth','strategy','stock','profit','loss','company','Corporation']
    words_re = re.compile("|".join(businesswordindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
def sentencehasceoindicator(inputString):
    ceoindicators = ['ceo','executive','investor','founder']
    words_re = re.compile("|".join(ceoindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
df_CS_feed['senetencecontainceo'] = df_CS_feed['Lemmatized'].apply(lambda x: sentencehasceoindicator(x))
df_CS_feed['senetencecontainbusinesswords'] = df_CS_feed['Lemmatized'].apply(lambda x: havebusinesswords(x))


In [None]:
# Stem words before and after to feed in getdummy
df_CS_feed['word_before'] = df_CS_feed['word_before'].apply(stemmer).apply(joinlist)
df_CS_feed['word_after'] = df_CS_feed['word_after'].apply(stemmer).apply(joinlist)

In [None]:
#Onehotencode before and after words for testing dataset
from sklearn.preprocessing import OneHotEncoder
beforeencode = encbefore.transform(df_CS_feed.word_before.values.reshape(-1,1)).toarray()
dfOneHotBefore = pd.DataFrame(beforeencode,columns = ['before_'+str(int(i)) for i in range (beforeencode.shape[1])])
dfOneHotBefore.reset_index(drop=True,inplace=True)
df_CS_feed = pd.concat([df_CS_feed,dfOneHotBefore],axis = 1)

afterencode = encafter.transform(df_CS_feed.word_after.values.reshape(-1,1)).toarray()
dfOneHotAfter = pd.DataFrame(afterencode,columns = ['after_'+str(int(i)) for i in range (afterencode.shape[1])])
dfOneHotAfter.reset_index(drop=True,inplace=True)
df_CS_feed = pd.concat([df_CS_feed,dfOneHotAfter],axis = 1)

In [None]:
# feed into machine learning algorithm
from sklearn.model_selection import train_test_split

X = df_CS_feed.drop(['text','RemovedStopwords','Lemmatized','names','word_before','word_after'],axis = 1)

y_predict = clf.predict(X)

In [None]:
# Make predictions
y_predict_df = pd.DataFrame(y_predict,columns =['ceo_labels'])
df_final_ceo = pd.concat([df_CS_feed,y_predict_df],axis=1)

In [None]:
# Export ceo extraction files
df_final_ceo[df_final_ceo['ceo_labels']==1][['text','names','ceo_labels']].to_csv('CEO_Extraction2.csv')

# For Companies

In [None]:
# Find all sentences with Companies           
df_analyze_companies['companies_labels'] = 0
df_analyze_companies['names'] = None
for i in range(len(companies)):
    pat = re.compile(r'\b' + companies['text'][i]+ r'\b')
    for j in range(len(df_analyze_companies)):
        match = re.search(pat,df_analyze_companies['text'][j])
        if (match != None):
                if ((len(word_tokenize(companies['text'][i])) > 1)):
                    df_analyze_companies.iloc[j, df_analyze_companies.columns.get_loc('companies_labels')] = 1
                    df_analyze_companies.iloc[j, df_analyze_companies.columns.get_loc('names')]  = match.group()

In [None]:
# Put it into new dataframe and keep only lemmatized text
df_analyze_companies_ML_positive = df_analyze_companies[df_analyze_companies['companies_labels']==1]
df_analyze_companies_ML_positive.drop(['text','RemovedStopwords'],axis = 1, inplace=True)

In [None]:
# Import non_companies names (Organization that have the following words: University, Province, State, Foundation, Tower, Federation, zoo, School, Association, World, Institute, Institution)
non_companies = pd.read_csv('non_companies.csv')
non_companies = non_companies[non_companies['yes']==1]
non_companies.drop(['yes'],axis = 1,inplace = True)
non_companies.reset_index(inplace=True,drop=True)

In [None]:
df_analyze_companies_ML_negative = df_analyze.copy(deep=True)

In [None]:
df_analyze_companies_ML_negative['companies_labels'] = 1
df_analyze_companies_ML_negative['names'] = None
for i in range(len(non_companies)):
    pat = re.compile(r'\b' + non_companies['text'][i]+ r'\b')
    for j in range(len(df_analyze_companies_ML_negative)):
        match = re.search(pat,df_analyze_companies_ML_negative['text'][j])
        if (match != None):
                if ((len(word_tokenize(non_companies['text'][i])) > 1)):
                    df_analyze_companies_ML_negative.iloc[j, df_analyze_companies_ML_negative.columns.get_loc('companies_labels')] = 0
                    df_analyze_companies_ML_negative.iloc[j, df_analyze_companies_ML_negative.columns.get_loc('names')]  = match.group()

In [None]:
df_analyze_companies_ML_negative = df_analyze_companies_ML_negative[df_analyze_companies_ML_negative['companies_labels']==0]
df_analyze_companies_ML_negative.drop(['text','RemovedStopwords'],axis = 1, inplace=True)

In [None]:
# Append additional negative samples using positive samples in ceo
df_analyze_ceo_companies_additionalnegative = df_analyze_ceo_ML_positive[['Lemmatized','ceo_labels','names']].copy(deep=True)
df_analyze_ceo_companies_additionalnegative.columns = ['Lemmatized','companies_labels','names']
df_analyze_ceo_companies_additionalnegative['companies_labels'] = 0 
df_analyze_companies_ML_negative = df_analyze_companies_ML_negative.append(df_analyze_ceo_companies_additionalnegative) 

In [None]:
df_analyze_companies_ML  = pd.concat([df_analyze_companies_ML_negative,df_analyze_companies_ML_positive])
df_analyze_companies_ML.reset_index(inplace=True,drop= True)

In [None]:
# Remove punctuation for easier analyzation
df_analyze_companies_ML['Lemmatized'] = df_analyze_companies_ML['Lemmatized'].apply(lambda x: removepunc(x))
df_analyze_companies_ML['names'] = df_analyze_companies_ML['names'].apply(lambda x: removepunc(x))

In [None]:
df_analyze_companies_ML['word_before'] = df_analyze_companies_ML.apply(lambda x : grab_word_before(x['Lemmatized'],x['names']),axis=1)
df_analyze_companies_ML['word_after'] = df_analyze_companies_ML.apply(lambda x : grab_word_after(x['Lemmatized'],x['names']),axis=1)

In [None]:
df_analyze_companies_ML['Capitalized'] = False
df_analyze_companies_ML['lengthofname'] = 0
df_analyze_companies_ML['lengthoftoken'] = 0
df_analyze_companies_ML['nameinbeg'] = False
df_analyze_companies_ML['nameinend'] = False
df_analyze_companies_ML['wordcontaincompanyindicator'] = False
def hascompanyindicator(inputString):
    companyindicators = ['Inc','Corp','Corporation','Bank','LLC','Group','Ltd','Ventures','Capital','Partners','Company','Holdings']
    words_re = re.compile("|".join(companyindicators))
    if words_re.search(inputString)!=None:
        return True
    else:
        return False
df_analyze_companies_ML['Capitalized'] = df_analyze_companies_ML['names'].apply(lambda x: x.istitle())
df_analyze_companies_ML['lengthofname'] = df_analyze_companies_ML['names'].apply(lambda x: len(x))
df_analyze_companies_ML['lengthoftoken'] = df_analyze_companies_ML['names'].apply(lambda x: len(word_tokenize(x)))
df_analyze_companies_ML['nameinbeg'] = df_analyze_companies_ML.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[0])==0,axis = 1)
df_analyze_companies_ML['nameinend'] = df_analyze_companies_ML.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[len(word_tokenize(x['names']))-1]) == len(word_tokenize(x['Lemmatized']))-1,axis = 1)
df_analyze_companies_ML['wordcontaincompanyindicator'] = df_analyze_companies_ML['names'].apply(lambda x: hascompanyindicator(x))

In [None]:
# Filter to analyzable data
df_analyze_companies_ML = df_analyze_companies_ML[df_analyze_companies_ML['word_after'].isnull()==False]

In [None]:
df_analyze_companies_ML['beforewordlength'] = 0
df_analyze_companies_ML['beforewordcapitalized'] = False
df_analyze_companies_ML['beforewordcontainnumbers'] = False
df_analyze_companies_ML['afterwordlength'] = 0
df_analyze_companies_ML['afterwordcapitalized'] = False
df_analyze_companies_ML['afterwordcontainnumbers'] =False
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
df_analyze_companies_ML['beforewordlength'] = df_analyze_companies_ML['word_before'].apply(lambda x: len(x))
df_analyze_companies_ML['beforewordcapitalized'] = df_analyze_companies_ML['word_before'].apply(lambda x: x.istitle())
df_analyze_companies_ML['beforewordcontainnumbers'] = df_analyze_companies_ML['word_before'].apply(lambda x: hasNumbers(x))
df_analyze_companies_ML['afterwordlength'] = df_analyze_companies_ML['word_after'].apply(lambda x: len(x))
df_analyze_companies_ML['afterwordcapitalized'] = df_analyze_companies_ML['word_after'].apply(lambda x: x.istitle())
df_analyze_companies_ML['afterwordcontainnumbers'] = df_analyze_companies_ML['word_after'].apply(lambda x: hasNumbers(x))


In [None]:
# Extract Sentence Feature
def havebusinesswords(inputString):
    businesswordindicators = ['yoy','growth','strategy','stock','profit','loss','company','Corporation']
    words_re = re.compile("|".join(businesswordindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
def sentencehasceoindicator(inputString):
    ceoindicators = ['ceo','executive','investor','founder']
    words_re = re.compile("|".join(ceoindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
    
df_analyze_companies_ML['senetencecontainceo'] = df_analyze_companies_ML['Lemmatized'].apply(lambda x: sentencehasceoindicator(x))
df_analyze_companies_ML['senetencecontainbusinesswords'] = df_analyze_companies_ML['Lemmatized'].apply(lambda x: havebusinesswords(x))


In [None]:
# Stem words before and after
df_analyze_companies_ML['word_before'] = df_analyze_companies_ML['word_before'].apply(stemmer).apply(joinlist)
df_analyze_companies_ML['word_after'] = df_analyze_companies_ML['word_after'].apply(stemmer).apply(joinlist)

In [None]:
X.to_csv('X_companies.csv')

In [64]:
df_analyze_companies_ML = pd.read_csv('df_analyze_companies_ML.csv',index_col=0)
df_analyze_companies_ML = df_analyze_companies_ML[df_analyze_companies_ML['word_after'].isnull()==False]

In [65]:
# feed into machine learning algorithm
from sklearn.model_selection import train_test_split
X = df_analyze_companies_ML
y = df_analyze_companies_ML['companies_labels']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)

In [66]:
#Onehotencode before and after words for training dataset
X_train.reset_index(inplace = True, drop = True)
y_train.reset_index(inplace=True,drop=True)

from sklearn.preprocessing import OneHotEncoder
encbefore = OneHotEncoder(categories ='auto',handle_unknown='ignore')
beforeencode = encbefore.fit_transform(X_train.word_before.values.reshape(-1,1)).toarray()
dfOneHotBefore = pd.DataFrame(beforeencode,columns = ['before_'+str(int(i)) for i in range (beforeencode.shape[1])])
dfOneHotBefore.reset_index(drop=True,inplace=True)
X_train = pd.concat([X_train,dfOneHotBefore],axis = 1)

encafter = OneHotEncoder(categories='auto',handle_unknown='ignore')
afterencode = encafter.fit_transform(X_train.word_after.values.reshape(-1,1)).toarray()
dfOneHotAfter = pd.DataFrame(afterencode,columns = ['after_'+str(int(i)) for i in range (afterencode.shape[1])])
dfOneHotAfter.reset_index(drop=True,inplace=True)
X_train = pd.concat([X_train,dfOneHotAfter],axis = 1)

X_train = X_train.drop(['Lemmatized','companies_labels','names','word_before','word_after'],axis = 1)

In [67]:
clf = MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [68]:
#Onehotencode before and after words for testing dataset
X_test.reset_index(inplace = True, drop = True)
y_test.reset_index(inplace=True,drop=True)

from sklearn.preprocessing import OneHotEncoder
beforeencode = encbefore.transform(X_test.word_before.values.reshape(-1,1)).toarray()
dfOneHotBefore = pd.DataFrame(beforeencode,columns = ['before_'+str(int(i)) for i in range (beforeencode.shape[1])])
dfOneHotBefore.reset_index(drop=True,inplace=True)
X_test = pd.concat([X_test,dfOneHotBefore],axis = 1)

afterencode = encafter.transform(X_test.word_after.values.reshape(-1,1)).toarray()
dfOneHotAfter = pd.DataFrame(afterencode,columns = ['after_'+str(int(i)) for i in range (afterencode.shape[1])])
dfOneHotAfter.reset_index(drop=True,inplace=True)
X_test = pd.concat([X_test,dfOneHotAfter],axis = 1)

X_test = X_test.drop(['Lemmatized','companies_labels','names','word_before','word_after'],axis = 1)

In [69]:
# # Use Naive Bayes
# Training score
y_train_pred = clf.predict(X_train)
print("Training accuracy: {}".format(accuracy_score(y_train,y_train_pred)))

# Testing score
y_test_pred = clf.predict(X_test)
print("Test accuracy: {}".format(accuracy_score(y_test,y_test_pred)))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

Training accuracy: 0.9102085923470092
Test accuracy: 0.8262971318079279
[[1495  180]
 [ 359 1069]]


In [70]:
# Kernel Died from here: read file saved from before
df_CS_feed = pd.read_csv('/Users/eric/Documents/Northwestern University/McCormick/2018-2019:2/IEMS 308/Assignment 3/df_CS_feed',delimiter = ',',lineterminator='\n',index_col=0)

In [72]:
# Apply Algorithm on all sentence_name pairs
df_CS_feed_companies = df_CS_feed[df_CS_feed.columns[[0,1,2,3,4,5]]]

In [73]:
# Apply name features
df_CS_feed_companies['Capitalized'] = False
df_CS_feed_companies['lengthofname'] = 0
df_CS_feed_companies['lengthoftoken'] = 0
df_CS_feed_companies['nameinbeg'] = False
df_CS_feed_companies['nameinend'] = False
df_CS_feed_companies['wordcontaincompanyindicator'] = False
def hascompanyindicator(inputString):
    companyindicators = ['Inc','Corp','Corporation','Bank','LLC','Group','Ltd','Ventures','Education','Capital','Partners','Company','Holdings']
    words_re = re.compile("|".join(companyindicators))
    if words_re.search(inputString)!=None:
        return True
    else:
        return False
df_CS_feed_companies['Capitalized'] = df_CS_feed_companies['names'].apply(lambda x: x.istitle())
df_CS_feed_companies['lengthofname'] = df_CS_feed_companies['names'].apply(lambda x: len(x))
df_CS_feed_companies['lengthoftoken'] = df_CS_feed_companies['names'].apply(lambda x: len(word_tokenize(x)))
df_CS_feed_companies['nameinbeg'] = df_CS_feed_companies.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[0])==0,axis = 1)
df_CS_feed_companies['nameinend'] = df_CS_feed_companies.apply(lambda x: tokenindex(x['Lemmatized'], word_tokenize(x['names'])[len(word_tokenize(x['names']))-1]) == len(word_tokenize(x['Lemmatized']))-1,axis = 1)
df_CS_feed_companies['wordcontaincompanyindicator'] = df_CS_feed_companies['names'].apply(lambda x: hascompanyindicator(x))

In [74]:
# Filter to analyzable data
df_CS_feed_companies = df_CS_feed_companies[df_CS_feed_companies['word_after'].isnull()==False]

# Extract before and after word features
df_CS_feed_companies['beforewordlength'] = 0
df_CS_feed_companies['beforewordcapitalized'] = False
df_CS_feed_companies['beforewordcontainnumbers'] = False
df_CS_feed_companies['afterwordlength'] = 0
df_CS_feed_companies['afterwordcapitalized'] = False
df_CS_feed_companies['afterwordcontainnumbers'] =False
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
df_CS_feed_companies['beforewordlength'] = df_CS_feed_companies['word_before'].apply(lambda x: len(x))
df_CS_feed_companies['beforewordcapitalized'] = df_CS_feed_companies['word_before'].apply(lambda x: x.istitle())
df_CS_feed_companies['beforewordcontainnumbers'] = df_CS_feed_companies['word_before'].apply(lambda x: hasNumbers(x))
df_CS_feed_companies['afterwordlength'] = df_CS_feed_companies['word_after'].apply(lambda x: len(x))
df_CS_feed_companies['afterwordcapitalized'] = df_CS_feed_companies['word_after'].apply(lambda x: x.istitle())
df_CS_feed_companies['afterwordcontainnumbers'] = df_CS_feed_companies['word_after'].apply(lambda x: hasNumbers(x))


In [75]:
# Extract Sentence Feature
def havebusinesswords(inputString):
    businesswordindicators = ['yoy','growth','strategy','stock','profit','loss','company','Corporation']
    words_re = re.compile("|".join(businesswordindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
def sentencehasceoindicator(inputString):
    ceoindicators = ['ceo','executive','investor','founder']
    words_re = re.compile("|".join(ceoindicators))
    if words_re.search(inputString.lower()):
        return True
    else:
        return False
    
df_CS_feed_companies['senetencecontainceo'] = df_CS_feed_companies['Lemmatized'].apply(lambda x: sentencehasceoindicator(x))
df_CS_feed_companies['senetencecontainbusinesswords'] = df_CS_feed_companies['Lemmatized'].apply(lambda x: havebusinesswords(x))


In [76]:
# Stem words before and after
df_CS_feed_companies['word_before'] = df_CS_feed_companies['word_before'].apply(stemmer).apply(joinlist)
df_CS_feed_companies['word_after'] = df_CS_feed_companies['word_after'].apply(stemmer).apply(joinlist)

In [78]:
df_CS_feed_companies.reset_index(inplace=True,drop=True)

In [80]:
#Onehotencode before and after words for testing dataset
from sklearn.preprocessing import OneHotEncoder
beforeencode2 = encbefore.transform(df_CS_feed_companies.word_before.values.reshape(-1,1)).toarray()
dfOneHotBefore2 = pd.DataFrame(beforeencode2,columns = ['before_'+str(int(i)) for i in range (beforeencode.shape[1])])
dfOneHotBefore2.reset_index(drop=True,inplace=True)
df_CS_feed_companies = pd.concat([df_CS_feed_companies,dfOneHotBefore2],axis = 1)

In [82]:
afterencode2 = encafter.transform(df_CS_feed_companies.word_after.values.reshape(-1,1)).toarray()
dfOneHotAfter2 = pd.DataFrame(afterencode2,columns = ['after_'+str(int(i)) for i in range (afterencode.shape[1])])
dfOneHotAfter2.reset_index(drop=True,inplace=True)
df_CS_feed_companies = pd.concat([df_CS_feed_companies,dfOneHotAfter2],axis = 1)

In [58]:
# For double check
df_CS_feed_companies = df_CS_feed_companies[['text', 'names', 'RemovedStopwords', 'Lemmatized', 'word_before',
       'word_after', 'Capitalized', 'lengthofname', 'lengthoftoken',
       'nameinbeg', 'nameinend', 'wordcontaincompanyindicator',
       'beforewordlength', 'beforewordcapitalized', 'beforewordcontainnumbers',
       'afterwordlength', 'afterwordcapitalized', 'afterwordcontainnumbers',
       'senetencecontainceo', 'senetencecontainbusinesswords']]

In [94]:
# Feed data to model
X = df_CS_feed_companies.drop(['text','RemovedStopwords','Lemmatized','names','word_before','word_after'],axis = 1)
y_predict = clf.predict(X)
y_predict_df = pd.DataFrame(y_predict,columns =['companies_labels'])
df_final_companies = pd.concat([df_CS_feed_companies,y_predict_df],axis=1)

In [102]:
df_final_companies[df_final_companies['companies_labels']==1][['text','names','companies_labels']].to_csv('final_companies.csv')

# For Percentages

In [278]:
# define regex tokenizer
def findpercentage(text):
    pat = re.compile(r'([\d\w\-.])+(\%|\s\%|\s\b[Pp]ercent\b|\s\b[Pp]ercentage\spoint\b|\s\b[Pp]ercentage\spoints\b|\s\b[Pp]ercentage\b|\s\b[Pp]ercentile\spoint\b|\s\b[Pp]ercentile\spoints\b)')
    listofpercentage = []
    if re.finditer !=None: 
        for match in re.finditer(pat,text):
            listofpercentage.append(match.group(0))
    return listofpercentage
# tokenize text - remember to convert text to lower case
df_CS['percentages'] = df_CS['text'].apply(lambda x: findpercentage(x))
# Extract sentence-percentage pair
s2 = df_CS[df_CS['percentages'].map(lambda d: len(d)) > 0]
s2.reset_index(drop = True,inplace =True)
res2 = s2.set_index(['text'])['percentages'].apply(pd.Series).stack()
res2 = res2.reset_index()
res2.drop(['level_1'],axis = 1,inplace = True)
res2.columns = ['text','percentages']

In [280]:
res2.to_csv('Percentages_Extraction.csv')