# Submission for [Geographic Named Entity Recognition](https://github.com/1712n/challenge/issues/65) challenge

## Required libraries and functions:

In [1]:
!pip install transformers --quiet
!pip install truecase --quiet
!pip install transliterate --quiet

[K     |████████████████████████████████| 4.7 MB 4.2 MB/s 
[K     |████████████████████████████████| 120 kB 61.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 40.8 MB/s 
[K     |████████████████████████████████| 28.4 MB 1.3 MB/s 
[K     |████████████████████████████████| 118 kB 4.3 MB/s 
[K     |████████████████████████████████| 45 kB 1.4 MB/s 
[?25h

In [None]:
import re
from pathlib import Path

import nltk
import pandas as pd
import transformers
import transliterate
import truecase
from google.colab import drive, files

nltk.download('punkt')

In [None]:
# Mount Google Drive; the project files referenced below live under it.
drive.mount("/content/gdrive")

# Locations of the project folder, its data folder and the two input files.
PROJECT_DIR = "/content/gdrive/MyDrive/Inca"
DATA_DIR = Path(PROJECT_DIR, 'data')
DATASET_PATH = DATA_DIR / 'Twitter-Bio-Location.txt'
CITIES_PATH = DATA_DIR / 'cities.csv'

In [4]:
def file_to_list(filepath, encoding="utf-8"):
    """Read a text file into a list of whitespace-stripped lines.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII content is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace (including the line terminator).
    """
    with open(filepath, encoding=encoding) as f:
        # Iterate over the file object directly rather than loading every
        # line up front with readlines().
        return [line.strip() for line in f]

In [5]:
# Character class of the emoji / symbol code-point ranges that are stripped
# from the text; one or more consecutive matches are removed at once.
emoj = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U0001f926-\U0001f937"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    "]+",
    re.UNICODE,
)


def _remove_emoji(text):
    """Return 'text' with every run of characters matched by 'emoj' removed."""
    return emoj.sub('', text)


def clean_text(text: str):
    """Cleans the text from unnecessary symbols and emoji.

    Steps, in order:
      1. drop the characters ``[ ] * ? ! @ # $``;
      2. drop everything matched by the 'emoj' pattern;
      3. make sure a comma that is directly followed by a non-whitespace
         character is followed by a single space.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    cleaned_text = re.sub(r'[\[\]*?!@#$]', '', text)
    cleaned_text = _remove_emoji(cleaned_text)
    # Only add a space where one is missing: a blanket replace of "," with
    # ", " doubles the space in "City, Country" and appends a trailing space
    # after a final comma.
    cleaned_text = re.sub(r",(?=\S)", ", ", cleaned_text)
    return cleaned_text

In [6]:
def _fix_letter_case(text: str):
    """Restore capitalisation of single-case text.

    Text that is written entirely in uppercase or entirely in lowercase is
    passed through 'truecase'; any other text is returned unchanged.
    """
    is_single_case = text.isupper() or text.islower()
    return truecase.get_true_case(text) if is_single_case else text


def _extract_geo_with_score(tokens_info):
    """Collapse pipeline output into one string and its best score.

    Args:
        tokens_info: Non-empty sequence of mappings, each with a "score" and
            a "word" entry.

    Returns:
        A tuple ``(words, highest_score)`` where 'words' is every "word"
        joined by a single space and 'highest_score' is the largest "score".
    """
    best_score = max([entry["score"] for entry in tokens_info])
    joined_words = " ".join(entry["word"] for entry in tokens_info)
    return joined_words, best_score


def make_pipeline_prediction(row, pipeline, column_name: str):
    """Run one NER pipeline on a row and store its result in the row.

    The text is read from ``row['cleaned']``. When the text is empty, or the
    pipeline returns nothing for it, the prediction is "MISSING" with score 0.

    Args:
        row: Mapping-like row; it is modified in place.
        pipeline: Callable that takes a string and returns a sequence of
            mappings with "word" and "score" entries.
        column_name: Prefix for the two entries written to the row,
            ``<column_name>_prediction`` and ``<column_name>_score``.

    Returns:
        The same 'row' object with the two entries added.
    """
    prediction, score = "MISSING", 0
    text = row['cleaned']
    if text:
        entities = pipeline(_fix_letter_case(text))
        if entities:
            prediction, score = _extract_geo_with_score(entities)
    row[column_name + '_prediction'] = prediction
    row[column_name + '_score'] = score
    return row

In [8]:
# Decimal "latitude, longitude" pair at the end of the text. The leading
# look-behind keeps a match from starting in the middle of a longer number,
# which would otherwise bypass the latitude range check (e.g. "999.5, 10").
_COORDINATES_DECIMAL = r"""(?<![\d.])[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?),\s*[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$"""
# Degrees / minutes / seconds pair with hemisphere letters.
_COORDINATES_DMS = r"""([0-8]?\d(°|\s)[0-5]?\d('|\s)[0-5]?\d(\.\d{1,6})?"?|90(°|\s)0?0('|\s)0?0"?)\s{0,}[NnSs]\s{1,}([0-1]?[0-7]?\d(°|\s)[0-5]?\d('|\s)[0-5]?\d(\.\d{1,6})?"?|180(°|\s)0?0('|\s)0?0"?)\s{0,}[EeOoWw]"""
# Compiled once at definition time instead of on every call.
_COORDINATES_RE = re.compile("|".join([_COORDINATES_DECIMAL, _COORDINATES_DMS]))


def check_coordinates(text: str) -> bool:
    """Checks if 'text' contains coordinates.

    Two notations are recognised: a decimal ``lat, lon`` pair that ends the
    text, and a degrees/minutes/seconds pair followed by hemisphere letters.

    Args:
        text: String to inspect.

    Returns:
        True if either notation is found anywhere in 'text', else False.
    """
    return bool(_COORDINATES_RE.search(text))

In [13]:
def check_in_set(text, check_set):
    """Checks if 'check_set' contains 'text' in one of three forms.

    The forms are tried in order and the first hit wins:
      1. the lowercased text;
      2. the lowercased text with every word of two characters or fewer
         removed;
      3. the lowercased text passed through
         ``transliterate.translit(..., reversed=True)``.

    Args:
        text: String to look up.
        check_set: Container of lowercase strings to test membership against.

    Returns:
        True if any form is found in 'check_set', otherwise False. A failed
        transliteration counts as "not found".
    """
    text = text.lower()
    if text in check_set:
        return True
    cropped_text = " ".join(word for word in text.split() if len(word) > 2)
    if cropped_text in check_set:
        return True
    try:
        translit_text = transliterate.translit(text, reversed=True)
    except Exception:
        # Best effort: text that cannot be transliterated simply has no
        # transliterated form to look up. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        return False
    return translit_text in check_set

In [91]:
def _make_average_prediction(model_predictions, model_scores, check_set):
    """Combine the predictions of the three models into a single answer.

    Decision order:
      1. All three predictions are "MISSING" -> "MISSING".
      2. The first prediction (in model order) accepted by 'check_in_set'
         against 'check_set' is returned as is.
      3. No "MISSING" prediction -> the prediction with the highest score.
      4. Two "MISSING" predictions -> the remaining prediction if the highest
         score is above 0.9, otherwise "MISSING".
      5. One "MISSING" prediction -> the highest-scoring prediction if its
         score is above 0.85; otherwise, if the two remaining predictions are
         equal ignoring case and at least one of their scores is above 0.7,
         the first of them; otherwise "MISSING".

    Args:
        model_predictions: List of predicted strings, "MISSING" when a model
            found nothing.
        model_scores: List of scores aligned with 'model_predictions'.
        check_set: Collection of known names forwarded to 'check_in_set'.

    Returns:
        The chosen prediction string or "MISSING".
    """
    missing_count = model_predictions.count("MISSING")
    if missing_count == 3:
        return "MISSING"

    highest_score = max(model_scores)
    best_prediction = model_predictions[model_scores.index(highest_score)]

    # A prediction that is a known name wins regardless of its score.
    for candidate in model_predictions:
        if check_in_set(candidate, check_set):
            return candidate

    if missing_count == 0:
        return best_prediction

    if missing_count == 2:
        return best_prediction if highest_score > 0.9 else "MISSING"

    if missing_count == 1:
        if highest_score > 0.85:
            return best_prediction
        # Exactly two models produced something; compare them to each other.
        first, second = [
            position
            for position, prediction in enumerate(model_predictions)
            if prediction != "MISSING"
        ]
        first_pred, first_score = model_predictions[first], model_scores[first]
        second_pred, second_score = model_predictions[second], model_scores[second]
        models_agree = first_pred.lower() == second_pred.lower()
        if models_agree and (first_score > 0.7 or second_score > 0.7):
            return first_pred

    return "MISSING"


def make_average_prediction(row, check_set: set):
    """Write the final prediction for a row into ``row["prediction"]``.

    If the cleaned text contains coordinates, the cleaned text itself is the
    prediction. Otherwise the per-model predictions and scores stored in the
    row are combined by '_make_average_prediction'.

    Args:
        row: Mapping-like row holding 'cleaned' plus the '*_prediction' and
            '*_score' entries of the three models; it is modified in place.
        check_set: Collection of known names used during the combination.

    Returns:
        The same 'row' object with the "prediction" entry set.
    """
    text = row['cleaned']
    if check_coordinates(text):
        row["prediction"] = text
        return row

    predictions = [
        row['bert_cased_prediction'],
        row['bert_uncased_prediction'],
        row['babel_cased_prediction'],
    ]
    scores = [
        row['bert_cased_score'],
        row['bert_uncased_score'],
        row['babel_cased_score'],
    ]
    row["prediction"] = _make_average_prediction(predictions, scores, check_set)
    return row

___

## Preparing the data and loading the models:

In [118]:
# One raw string per line of the dataset file.
dataset = file_to_list(DATASET_PATH)
df = pd.DataFrame(dataset, columns=["initial"])

# Lowercased values of the "name" column of the cities file, used as a lookup
# set. The column is selected by key (attribute access would break if the
# name ever collided with a DataFrame attribute) and missing values are
# dropped so that NaN does not end up in the set.
cities_df = pd.read_csv(CITIES_PATH)
check_set = set(cities_df["name"].dropna().str.lower())

In [123]:
# Cleaned copy of every raw string, produced element-wise by clean_text.
df["cleaned"] = df["initial"].map(clean_text)

In [None]:
# Entity labels that the pipelines are told to drop from their output.
labels_to_ignore = ["PER", "ORG", "O", "MISC"]


def _load_ner_pipeline(checkpoint: str):
    """Load tokenizer, model and "ner" pipeline for one checkpoint name.

    Returns the tuple ``(tokenizer, model, pipeline)``; the pipeline uses the
    "average" aggregation strategy and ignores 'labels_to_ignore'.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    model = transformers.AutoModelForTokenClassification.from_pretrained(checkpoint)
    ner_pipeline = transformers.pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="average",
        ignore_labels=labels_to_ignore,
    )
    return tokenizer, model, ner_pipeline


bert_tokenizer, bert_model, bert_cased_pipeline = _load_ner_pipeline(
    "Davlan/bert-base-multilingual-cased-ner-hrl"
)

# https://huggingface.co/Babelscape/wikineural-multilingual-ner
babel_tokenizer, babel_model, babel_pipeline = _load_ner_pipeline(
    "Babelscape/wikineural-multilingual-ner"
)

bert_uncased_tokenizer, bert_uncased_model, bert_uncased_pipeline = _load_ner_pipeline(
    "dslim/bert-base-NER-uncased"
)

## Making predictions:

In [124]:
# Adds the 'bert_cased_prediction' and 'bert_cased_score' columns.
df = df.apply(
    make_pipeline_prediction,
    axis=1,
    pipeline=bert_cased_pipeline,
    column_name="bert_cased",
)

In [125]:
# Adds the 'bert_uncased_prediction' and 'bert_uncased_score' columns.
df = df.apply(
    make_pipeline_prediction,
    axis=1,
    pipeline=bert_uncased_pipeline,
    column_name="bert_uncased",
)

In [126]:
# Adds the 'babel_cased_prediction' and 'babel_cased_score' columns.
df = df.apply(
    make_pipeline_prediction,
    axis=1,
    pipeline=babel_pipeline,
    column_name="babel_cased",
)

In [127]:
# Combines the three per-model results into the final 'prediction' column.
df = df.apply(make_average_prediction, axis=1, check_set=check_set)

In [129]:
# Keep only the raw input and the final prediction for the submission.
final_df = df[["initial", "prediction"]].copy()

In [130]:
final_df

Unnamed: 0,initial,prediction
0,@ShortStintatPlanet-Earth-UK,UK
1,"**302** Wilmington, Delaware",Wilmington Delaware
2,#Blockchain,MISSING
3,#Cryptoverse!,MISSING
4,#Earth #Europe #Germany #NRW,europe germany
...,...,...
495,भारत,भारत
496,চাঁপাইনবাবগঞ্জ,চাঁপাইনবাবগঞ্জ
497,スワップ部屋,MISSING
498,太阳系第三行星-中国四川,太 阳 系 中 国 四 川


In [131]:
# Write the submission to a CSV file, then hand it to files.download.
OUTPUT_FILENAME = "prediction.csv"
final_df.to_csv(OUTPUT_FILENAME)
files.download(OUTPUT_FILENAME)