In [12]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_lg")

In [72]:
class NERDString:
    
    def __init__(self, text:str, nlp = nlp):
        
        # self.nlp = spacy.load("en_core_web_lg")
    
        # text = text.upper()

        self.nlp = nlp

        self.doc = self.nlp(text)
        
        self.extract_ner()

        self.ner_extracts = dict()
#         for ent in self.entities.entity.unique():
#             # print(ent)
#             self.ner_extracts[ent] = self.summarize_entity_type(ent)
            
    def extract_ner(self):
        ner_tuplist = []
        for ent in self.doc.ents:
            ner_tuplist.append((ent.text, ent.label_))
            
        self.entities = pd.DataFrame(ner_tuplist, columns = ['name', 'entity'])
        
    def summarize_entity_type(self, entity_type:str):
        df_counts = pd.DataFrame(self.entities.loc[self.entities.entity == entity_type].groupby(['name', 'entity'])['name'].count())
        # t.reset_index()
        df_counts.columns = ['counts']
        df_counts.reset_index()
        df_counts = df_counts.sort_values('counts', ascending=False)
        df_counts.reset_index(inplace=True)
        df_counts.reset_index(drop=True, inplace=True)
        return df_counts[['name', 'counts']]

In [73]:
ns = NERDString('Create a search for the scientist Ryan Willett who attended Columbia University after 2003-02-11 and before 2012 on www.brandeis.edu')

In [74]:
ns.doc

Create a search for the scientist Ryan Willett who attended Columbia University after 2003-02-11 and before 2012 on www.brandeis.edu

In [75]:
ns.entities

Unnamed: 0,name,entity
0,Ryan Willett,PERSON
1,Columbia University,ORG
2,2003-02-11,DATE
3,2012,DATE


Unnamed: 0,name,entity
2,2003-02-11,DATE
3,2012,DATE


In [100]:
def extract_date(ns):
    import re

    dates_df = ns.entities.loc[ns.entities.entity == 'DATE']

    date_series = dates_df.name.apply(lambda x: pd.to_datetime(re.sub('[A-Z]', '', x.upper())))
    min_date_dt = date_series.min()
    max_date_dt = date_series.max()

    min_date = str(min_date_dt).split()[0]
    max_date = str(max_date_dt).split()[0]

    return (f'Min date: {min_date}\nMax date: {max_date}')

In [101]:
print(extract_date(ns))

Min date: 2003-02-11
Max date: 2012-01-01


In [117]:
urls = 'www.google.com.  www.columbia.edu  rtwillett.github.io'.lower()

In [124]:
[url for url in re.findall('[a-z0-9-_.]*\.com|edu', urls) if url.strip() != '']

['www.google.com', 'edu']

In [51]:
ns.ner_extracts

{'PERSON':            name  counts
 0  Ryan Willett       1, 'ORG':                   name  counts
 0  Columbia University       1, 'DATE':          name  counts
 0  2003-01-01       1}

In [52]:
ns.ner_extracts['PERSON']

Unnamed: 0,name,counts
0,Ryan Willett,1
