In [111]:
# import libraries

In [112]:
import re
import glob
import pandas as pd
import csv
import ast
import itertools
import nltk

In [113]:
def process(company):
    company = company.replace("&"," ").replace("."," ").replace("-"," ")
    return company

In [114]:
path_companies = 'all/companies.csv'

In [115]:
def read_csv(csv_file):
    companies = []
    with open(csv_file,'r') as f:
        f = f.readlines()
        for line in f:
            line = line.rstrip()
            companies.append(line)
    return companies

In [116]:
companies = read_csv(path_companies)
companies = list(map(lambda x: process(x), companies))

In [117]:
corpus_path = 'all_news.txt'
with open(corpus_path, 'r') as f:
    corpus = ast.literal_eval(f.read())

In [118]:
corpus = list(itertools.chain(*corpus))

In [119]:
corpus

[["ReutersChina's seven day repo rose to a record high of 10.77% in Shanghai, the highest since March 2003, according to Bloomberg*.",
  'Meanwhile, the one-day rate hit a record 12.85%.',
  'And Zerohedge reported that overnight repo hit 25%.',
  'The liquidity squeeze in China first began ahead of the Dragon Boat festival earlier this month.',
  'Spikes in interbank rates are common right before holidays.Â\xa0 But Diana Choyleva at Lombard Street Research said this is symptomatic of a bigger problem.',
  'She said capital flows had "become a more important driver of domestic liquidity conditions in China\'s managed exchange rate system."',
  'In a new note to clients Bank of America\'s Ting Lu wrote: "There are many factors behind the interbank liquidity squeeze that might be cited, but we believe that the ultimate reason is the central bankâ\x80\x99s tough stance as the PBOC can practically provide unlimited liquidity to ease every squeeze if it wishes to.',
  '"Â\xa0 Banks have bee

In [25]:
from nltk.corpus import stopwords


In [26]:
stop_words = stopwords.words('english')

In [27]:
def check_sw(name):
    for stop_word in stop_words:
        if stop_word in name.lower().split():
            return False
    return True

In [28]:
def check_notcompanies(name):
    
    parts = name.split()
    
    political ={'House','Government','Congress','Senator','Office','President','Budget','Policy',
                'Congressman','Congresswoman','Majority','Minority','Politics'}
    for part in parts:
        if part in political:
            return False
    Financial = {'Economy','Economist','Economics','Federal','Management','Trading','Finance','Financial','Money'
              ,'Debt','Balance','Environment',}
    for part in parts:
        if part in Financial:
            return False
    one_word_place = {'USA','US','England','Asia','China','Europe','Africa','Texas','Atlantic','Japan',
                     'Singapore','Koera','Germany','France','North','East','West','South'}
    for part in parts:
        if part in one_word_place:
            return False
    time = {'Monday','Tuesday','Wesnesday','Thursday','Friday','Saturday','Sunday','January',
           'Feburary','March','April','May','June','July','August','September','October','November','December'}
    for part in parts:
        if part in time:
            return False


    return True

In [29]:
def get_companies_name(article):
    
    regulation = '(?:[A-Z][A-Za-z]+ ?)+'
    pattern = re.compile(regulation)
    
    names = map(lambda x: re.findall(pattern, x), article)
    names = filter(None, names)
    names = itertools.chain(*names)
    
    names = filter(check_sw, names)
    names = filter(check_notcompanies, names)
    
    return names

In [30]:
from nltk.tokenize import TweetTokenizer

In [33]:
def get_features(names, article):
    
    features = []
    
    def check_in_sentence(name, sentence):
        return " " + name + " " in sentence or " " + name == sentence[-len(name) + 1:] or name + " " == sentence[:len(name)+1]
    
    for name in names:
        sentences = filter(lambda x: check_in_sentence(name, x), article)
        for sentence in sentences:
            feature = {'name':name}
            
            feature['is_company'] = int(name in companies)
            namesp = name.split()
            last_word = namesp[-1]
            first_word = namesp[0]
            feature['first_word_length'] = len(first_word)
            feature['last_word_length'] = len(last_word)
            
            tokenizer = TweetTokenizer()
            words = tokenizer.tokenize(sentence)
            feature['first_word_index'] = words.index(first_word)
            feature['last_word_index'] = words.index(last_word)
            
            pos_words = nltk.pos_tag(words)
            feature['first_word_pos'] = pos_words[words.index(first_word)][1]
            feature['last_word_pos'] = pos_words[words.index(last_word)][1]
            
            if " " + name + " " in sentence:
                feature['before_word_pos'] = pos_words[words.index(first_word) - 1][1]
                feature['after_word_pos'] = pos_words[words.index(last_word) -1][1]
            elif " " + name == sentence[-len(name)+1:]:
                feature['before_word_pos'] = pos_words[words.index(first_word) -1][1]
                feature['after_word_pos'] = 'NULL'
            else:
                feature['before_word_pos'] = 'NULL'
                feature['after_word_pos'] = pos_words[words.index(last_word) -1][1]
            
            keywords = {'Company','Inc','Group','Corporation','Co','Corp','Capital','Management','Ltd'}
            for keyword in keywords:
                if keyword in namesp:
                    feature['keyword'] = 1
                    break
                else:
                    feature['keyword'] = 0
            features.append(feature)
        
    return features

In [34]:
split1 = int(0.6*len(corpus))
split2 = int(0.8*len(corpus))

training_set = corpus[0:split1]
validating_set = corpus[split1:split2]
testing_set = corpus[split2:]

train = []
for i, article in enumerate(training_set):
    names = get_companies_name(article)
    features = get_features(names, article)
    train.extend(features)
    
    if i%500 == 0:
        print(f'This program has already deal with {i} articles')

This program has already deal with 0 articles
This program has already deal with 500 articles
This program has already deal with 1000 articles
This program has already deal with 1500 articles
This program has already deal with 2000 articles
This program has already deal with 2500 articles
This program has already deal with 3000 articles
This program has already deal with 3500 articles
This program has already deal with 4000 articles
This program has already deal with 4500 articles
This program has already deal with 5000 articles
This program has already deal with 5500 articles
This program has already deal with 6000 articles
This program has already deal with 6500 articles
This program has already deal with 7000 articles
This program has already deal with 7500 articles
This program has already deal with 8000 articles
This program has already deal with 8500 articles
This program has already deal with 9000 articles
This program has already deal with 9500 articles
This program has already

In [40]:
import numpy as np
import pandas as pd

In [41]:
df = pd.DataFrame(train)

In [43]:
import sklearn
from sklearn import metrics
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

In [44]:
df['is_company'].value_counts()

0    259544
1     79054
Name: is_company, dtype: int64

In [52]:
df_positive = df[df['is_company'] == 1]
df_negative = df[df['is_company'] == 0]
re_length = len(df_positive) * 2
df_positive = resample(df_positive, replace = True, n_samples = re_length)
df_negative = resample(df_negative, replace = False, n_samples = re_length)
df1 = pd.concat([df_positive, df_negative], ignore_index = True)

In [56]:
df1['after_word_pos'] = pd.factorize(df1['after_word_pos'])[0]
df1['before_word_pos'] = pd.factorize(df1['before_word_pos'])[0]
df1['first_word_pos'] = pd.factorize(df1['first_word_pos'])[0]
df1['last_word_pos'] = pd.factorize(df1['last_word_pos'])[0]

In [65]:
RFC = RandomForestClassifier(n_estimators = 20)
featuress = [i for i in df1.columns if i not in ('is_company','name')]
RFC.fit(df1[featuress],df1['is_company'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [67]:
validation =[]
for i, article in enumerate(validating_set):
    names = get_companies_name(article)
    features = get_features(names, article)
    validation.extend(features)
    
    if i% 500 == 0: print(f'The program has already processed {i} articles')

The program has already processed 0 articles
The program has already processed 500 articles
The program has already processed 1000 articles
The program has already processed 1500 articles
The program has already processed 2000 articles
The program has already processed 2500 articles
The program has already processed 3000 articles
The program has already processed 3500 articles
The program has already processed 4000 articles
The program has already processed 4500 articles
The program has already processed 5000 articles
The program has already processed 5500 articles
The program has already processed 6000 articles
The program has already processed 6500 articles
The program has already processed 7000 articles


In [69]:
dfv = pd.DataFrame(validation)

In [71]:
dfv['after_word_pos'] = pd.factorize(dfv['after_word_pos'])[0]
dfv['before_word_pos'] = pd.factorize(dfv['before_word_pos'])[0]
dfv['first_word_pos'] = pd.factorize(dfv['first_word_pos'])[0]
dfv['last_word_pos'] = pd.factorize(dfv['last_word_pos'])[0]

In [86]:
def get_metrics(truth, predicted):
    confusion_matrix = metrics.confusion_matrix(truth, predicted)
    accuracy = metrics.accuracy_score(truth, predicted)
    precision = metrics.precision_score(truth, predicted)
    recall = metrics.recall_score(truth, predicted)
    F1_score = metrics.f1_score(truth, predicted)
    
    print(f"confusion_matrix:\n {confusion_matrix}\n")
    print("accuracy:" f'{accuracy}')
    print("precision:" f'{precision}')
    print("recall:" f"{recall}")
    print("f1_score:" f"{F1_score}")
    return 

In [87]:
predicted = RFC.predict(dfv[featuress])
get_metrics(dfv.is_company, predicted)

confusion_matrix:
 [[55379 41133]
 [13576 16514]]

accuracy:0.5678662264419203
precision:0.28646763925269314
recall:0.5488202060485211
f1_score:0.37644323375542815


In [89]:
test = []
for i, article in enumerate(testing_set):
    names = get_companies_name(article)
    features = get_features(names, article)
    test.extend(features)
    
    if i % 500 == 0:
        print(f"The program has already processed with {i} articles")

The program has already processed with 0 articles
The program has already processed with 500 articles
The program has already processed with 1000 articles
The program has already processed with 1500 articles
The program has already processed with 2000 articles
The program has already processed with 2500 articles
The program has already processed with 3000 articles
The program has already processed with 3500 articles
The program has already processed with 4000 articles
The program has already processed with 4500 articles
The program has already processed with 5000 articles
The program has already processed with 5500 articles
The program has already processed with 6000 articles
The program has already processed with 6500 articles
The program has already processed with 7000 articles


In [93]:
dftest = pd.DataFrame(test)
dftest.after_word_pos = pd.factorize(dftest.after_word_pos)[0]
dftest.before_word_pos = pd.factorize(dftest.after_word_pos)[0]
dftest.first_word_pos = pd.factorize(dftest.first_word_pos)[0]
dftest.last_word_pos = pd.factorize(dftest.last_word_pos)[0]

In [95]:
predict = RFC.predict(dftest[featuress])
get_metrics(dftest.is_company, predict)

confusion_matrix:
 [[62096 34417]
 [15709 20106]]

accuracy:0.6211988392479294
precision:0.36876180694385857
recall:0.5613848945972358
f1_score:0.4451282959551905


In [100]:
predt = RFC.predict(df1[featuress])
predv = RFC.predict(dfv[featuress])
predte = RFC.predict(dftest[featuress])

found_companies =[]
found_companies.extend(list(df1.iloc[np.where(predt ==1)].name))
found_companies.extend(list(dfv.iloc[np.where(predv == 1)].name))
found_companies.extend(list(dftest.iloc[np.where(predte == 1)].name))

with open('found_companie.txt','w') as f:
    for company in found_companies:
        f.write(company + '\n')