In [2]:
# ------------------------
# Imports
# ------------------------

!pip install flair
import pandas as pd
from flair.nn import Classifier
from flair.data import Sentence
from flair.splitter import SegtokSentenceSplitter

# Mount Google Drive so results can be saved there.
# NOTE(review): later cells write to relative 'drive/MyDrive/...' paths, which only
# resolve under this mount point if the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

# -------------------------------
# Custom NER functions
# -------------------------------

def split_sentences(text, splitter=None):
  '''
  Split every text in an iterable into sentences.

  Args:
    text: iterable of raw texts; the position of each element in the iterable becomes its id
    splitter: optional object exposing split(text) and returning a list of sentences; when omitted, a single SegtokSentenceSplitter is created and used for all elements

  Returns:
    pandas DataFrame with one row per input element and the columns id, input_text and sentences (the list returned by the splitter for that element)
  '''
  # build the splitter once instead of once per element; the same instance handles every text
  if splitter is None:
    splitter = SegtokSentenceSplitter()

  split_list = [
    {'id': i, 'input_text': ele, 'sentences': splitter.split(ele)}
    for i, ele in enumerate(text)
  ]

  # name the columns explicitly so an empty input still yields the expected schema
  return pd.DataFrame(split_list, columns=['id', 'input_text', 'sentences'])

def run_NER(df, model=None):
  '''
  Run NER over the sentences of every row and collect the detected entities.

  Args:
    df: pandas DataFrame with a unique identifier column (id), and a list-column (sentences) with Flair sentences corresponding to the id (specifically designed to take the output of split_sentences)
    model: optional tagger exposing predict(sentences); when omitted, the notebook-level variable `tagger` is used, so that variable must be defined before the call

  Returns:
    pandas DataFrame with one row per detected entity and the columns id, text, entity_detected and score
  '''
  columns = ['id', 'text', 'entity_detected', 'score']

  # fall back to the notebook-level tagger so existing run_NER(df) calls keep working
  ner_model = tagger if model is None else model

  ner_results = []

  for _, row in df.iterrows():

    sentences = row['sentences']

    # predict tags for sentences; the return value is unused, labels are read back from the sentences below
    ner_model.predict(sentences)

    for sentence in sentences:
      ner_results.extend(
        {'id': row['id'], 'text': x.data_point.text, 'entity_detected': x.value, 'score': x.score}
        for x in sentence.get_labels('ner')
      )

  # explicit columns keep the schema stable even when no entity was detected
  return pd.DataFrame(ner_results, columns=columns)


In [4]:
# -------------
# Load model
# -------------

# Flair tagger; run_NER reads this notebook-level name when it is called
tagger = Classifier.load('ner-ontonotes-large')

# ---------------
# COVID dataset
# ---------------

# Read the raw user sentences and label the first column as the identifier
covid_usr = pd.read_csv(
  'https://raw.githubusercontent.com/sullivannicole/simplER/main/data/covid_raw_user_sentences.csv'
).pipe(lambda raw: raw.rename(columns={raw.columns[0]: 'id'}))

# save dataset w/ IDs for reference
covid_usr.to_csv('drive/MyDrive/GitHub/simplER/data/covid_raw_user_sentences_w_ids.csv')

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2023-04-12 02:39:55,272 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [24]:
# Pipeline: raw user sentences -> split sentences -> detected entities -> CSV on Drive
sentence_txt = covid_usr['sentence'].values
split_df = split_sentences(sentence_txt)
ner_results = run_NER(split_df)
ner_results.to_csv('drive/MyDrive/GitHub/simplER/data/covid_user_flair.csv')

