# Readme and code to extract entities from news lines in an Excel file

**Input specifications**:

1) The input file should be a .xlsx file with the same format as the training file

**Output file specifications**:

1) The output file will be a csv file containing the following columns:
    
    *Text*: Contains the text information from the input cell
    *Company name*: Contains the company name extracted from the text
    *Investor name*: Contains the investor name extracted from the text
    *Funding amount*: Contains the funding amount details extracted from the text
    *Funded year*: Contains the funded year info extracted from the text
    *Funding type*: Contains the funding type information extracted from the text
    
**Model logic and limitations**:

*Logic*

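    The current model uses spaCy's out-of-the-box named entity recognition (NER) tool.
    Investor name and company name are assigned using only the root verb of the sentence: the organisations found before and after the root verb are mapped to company or investor depending on which verb it is.
    Funding year and funding amount are taken directly from the DATE and MONEY entities.
    Funding type is found by matching the text against a fixed list of two-word phrases (e.g. "series a", "angel round").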
    
*Limitations*

    Since this model relies only on the standard spaCy NER tool, it is not fine-tuned for our use case.
    Investor name and company name are identified using a single rule only.

**Suggestions**:

    Building a supervised model could give better results, but it would require a good amount of labeled training data.
    The rules used to identify investor and company names could be extended to cover many more cases.
      

## Defining the functions

In [1]:
## Reading the data based on the directory and filename

def read_data(wd, filename):
    """Load an Excel workbook without a header row into a DataFrame.

    Args:
        wd: Directory that contains the workbook.
        filename: Name of the Excel file inside ``wd``.

    Returns:
        The DataFrame produced by ``pd.read_excel(..., header=None)``.
    """
    # Local import: the notebook's import cell does not bring in ``os``.
    import os

    # os.path.join uses the separator of the host OS; the previous hard-coded
    # "\\" only produced a valid path on Windows.
    return pd.read_excel(os.path.join(wd, filename), header=None)

##Converting data to list

def pd_to_list(data):
    """Return the rows of ``data`` as a list of lists (one inner list per row)."""
    return data.values.tolist()

## Replacing punctuation characters with spaces

# Maps every character that should be blanked out to a single space.
# The backslash is written as "\\": the earlier literal used the invalid
# escape "\," which triggers a SyntaxWarning on recent Python versions.
# Characters not listed here (for example "." or "$") pass through unchanged.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in "!-;:'\"\\,?@#%^&*_~"})


def remove_punctuations(string):
    """Replace each listed punctuation character in ``string`` with a space.

    The text length is preserved: every matched character becomes exactly one
    space rather than being deleted.

    Args:
        string: The text to clean (a ``str``).

    Returns:
        A new string with the listed punctuation characters replaced by spaces.
    """
    # A single translate pass replaces the per-character "+=" loop.
    return string.translate(_PUNCT_TO_SPACE)

def generate_ngrams(s, n):
    """Return every run of ``n`` consecutive words in ``s`` as a lower-cased string.

    Characters other than ASCII letters, digits and whitespace are turned into
    spaces first. Words are then obtained by splitting on the space character
    only, and the words of each n-gram are joined with single spaces.
    """
    # Normalise: lower-case, then blank out everything that is not
    # alphanumeric or whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())

    # Splitting on " " yields empty strings for repeated spaces; drop them.
    words = [piece for piece in cleaned.split(" ") if piece != ""]

    # n copies of the word list, each shifted one position further; zipping
    # them walks a window of n words across the text.
    shifted = [words[offset:] for offset in range(n)]
    return [" ".join(window) for window in zip(*shifted)]

## Extract entities based on logics used

# Lower-cased two-word phrases that are reported as the funding type.
_FUNDING_TYPE_BIGRAMS = frozenset({
    "series a",
    "series b",
    "series c",
    "series d",
    "series e",
    "public offering",
    "angel round",
})


def _find_root_verb(doc):
    """Return ``(token index, lemma)`` of the first ROOT verb in ``doc``, else ``(None, None)``."""
    for tok in doc:
        if tok.dep_ == 'ROOT' and tok.pos_ == 'VERB':
            return tok.i, tok.lemma_
    return None, None


def extract_entities(data_list):
    """Run NER over every row and collect the raw entities per text.

    Args:
        data_list: List of rows; the first element of each row is the text
            to analyse.

    Returns:
        A DataFrame with one row per input row and the columns ``Text``,
        ``Root_Verb``, ``Org_before``, ``Org_after``, ``Invested_money``,
        ``Funded_year`` and ``Funding_type``. Multiple hits in one column are
        joined with ``","``. ``Root_Verb`` is left unset (NaN) when the text
        has no root verb; in that case no ORG entities are recorded either.
    """
    entities_df = pd.DataFrame(
        columns=['Text', 'Root_Verb', 'Org_before', 'Org_after',
                 'Invested_money', 'Funded_year', 'Funding_type'],
        index=range(len(data_list)),
    )
    nlp = en_core_web_sm.load()

    # Extra rule-based patterns so that bare "billion" mentions are tagged as MONEY.
    # NOTE(review): EntityRuler(nlp) + nlp.add_pipe(ruler) is the spaCy v2
    # calling convention; spaCy v3 expects nlp.add_pipe("entity_ruler").
    # Confirm against the installed spaCy version.
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MONEY", "pattern": "billion"},
        {"label": "MONEY", "pattern": "Billion"},
        {"label": "MONEY", "pattern": "Billion+"},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    for i, text in enumerate(data_list):
        string = text[0]
        cleaned = remove_punctuations(string)
        doc = nlp(cleaned)

        # None (not 0) marks "no root verb": 0 is a valid token index, and the
        # lookup no longer leaves root_id unbound or stale for an empty doc.
        root_id, root_verb = _find_root_verb(doc)
        if root_id is not None:
            entities_df.at[i, 'Root_Verb'] = root_verb

        funding_type = [
            bigram for bigram in generate_ngrams(cleaned, 2)
            if bigram in _FUNDING_TYPE_BIGRAMS
        ]

        org_before = []
        org_after = []
        invested_money = []
        funded_year = []
        for ent in doc.ents:
            label = ent.label_
            if label == 'ORG':
                # Organisations are only positioned relative to a root verb.
                if root_id is None:
                    continue
                # ent.end is exclusive, so end <= root_id means the entity
                # finishes before the verb; everything else counts as "after".
                if ent.end <= root_id:
                    org_before.append(ent.text)
                else:
                    org_after.append(ent.text)
            elif label == 'MONEY':
                invested_money.append(ent.text)
            elif label == 'DATE':
                funded_year.append(ent.text)

        # .at replaces DataFrame.set_value, which newer pandas no longer provides.
        entities_df.at[i, 'Text'] = string
        entities_df.at[i, 'Org_before'] = ','.join(org_before)
        entities_df.at[i, 'Org_after'] = ','.join(org_after)
        entities_df.at[i, 'Invested_money'] = ','.join(invested_money)
        entities_df.at[i, 'Funded_year'] = ','.join(funded_year)
        entities_df.at[i, 'Funding_type'] = ','.join(funding_type)

    return entities_df

## Creating the output dataframe
# Root verbs for which the organisation before the verb is reported as the
# company and the organisation after it as the investor.
_COMPANY_FIRST_VERBS = frozenset({
    'raise', 'secure', 'close', 'receive', 'get',
    'complete', 'attract', 'announce', 'launch',
})


def generate_op_file(directory, entities_df):
    """Build the final output table and save it as ``entities_op.csv``.

    Args:
        directory: Folder in which ``entities_op.csv`` is written.
        entities_df: DataFrame whose first seven columns are, in order: text,
            root verb, organisations before the verb, organisations after the
            verb, money, year and funding type.

    Returns:
        A DataFrame with the columns ``Text``, ``Company name``,
        ``Investor name``, ``Funding amount``, ``Funded year`` and
        ``Funding type``, indexed like ``entities_df``.
    """
    # Local import: the notebook's import cell does not bring in ``os``.
    import os

    # Size the output from the argument itself instead of the notebook-global
    # ``data_list`` the function used to reach for.
    output_details = pd.DataFrame(
        columns=['Text', 'Company name', 'Investor name',
                 'Funding amount', 'Funded year', 'Funding type'],
        index=entities_df.index,
    )

    for i, row in entities_df.iterrows():
        # Explicit positional access keeps the column-order contract without
        # relying on integer keys into a label-indexed Series.
        (text, root_verb, org_before, org_after,
         money, year, funding_type) = row.iloc[:7]

        if root_verb in _COMPANY_FIRST_VERBS:
            company, investor = org_before, org_after
        else:
            company, investor = org_after, org_before

        # .at replaces DataFrame.set_value, which newer pandas no longer provides.
        output_details.at[i, 'Text'] = text
        output_details.at[i, 'Company name'] = company
        output_details.at[i, 'Investor name'] = investor
        output_details.at[i, 'Funding amount'] = money
        output_details.at[i, 'Funded year'] = year
        output_details.at[i, 'Funding type'] = funding_type

    # os.path.join instead of a hard-coded "\\" so the path is valid on any OS.
    output_details.to_csv(os.path.join(directory, "entities_op.csv"))
    return output_details

## Importing necessary libraries and data

In [2]:
import spacy
import pandas as pd
from spacy import displacy
from collections import Counter
import en_core_web_sm
from spacy.matcher import Matcher
from spacy.tokens import Span 
from spacy import displacy 
import re
from spacy.pipeline import EntityRuler

## Running the model and generating output

In [3]:
## Input given by the user
## NOTE: `%pwd` is an IPython magic, so this cell only runs inside IPython/Jupyter.
## `directory` is passed on both to read the input workbook and to write the output CSV.

directory = %pwd
file_name = 'FundingPhrases.xlsx'

In [4]:
## Model running codes
## Pipeline: read the workbook -> convert rows to lists -> extract entities
## for each row -> build the output table and save it as a CSV.

# Load the Excel sheet (read without a header row).
data_excel = read_data(directory,file_name)
# One inner list per row; extract_entities reads the first element of each row.
data_list = pd_to_list(data_excel)
# Raw entities per text (root verb, organisations, money, dates, funding type).
entities_df = extract_entities(data_list)
# Final company/investor table; also written to entities_op.csv in `directory`.
output_details = generate_op_file(directory,entities_df)

