<a href="https://colab.research.google.com/github/sebmaster36/sudanetics/blob/main/Locality_Standardizer_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sudanese Locality Transliteration Standardizer


#### Instructions:

1. Your input file must be inserted into this file. To do this, click the folder icon on the left bar of this page. This will open a “Files” tab. Here, you can drag and drop your desired Excel file.
2. Then look under the “Inputs and Outputs” tab.
3. Now type the file name into the “input_file:” form on the right, spelled exactly the same as the one you dropped in (including the .xlsx extension).
4. Now type the sheet name you want to evaluate into the “sheetname:” form on the right, spelled exactly the same as it shows on the Excel document.
5. Then type the output file name into the “output_file:” form on the right. This can be any name you would like (ending in .xlsx).

To Run File:
1. On the top bar of this screen, click the “Runtime” tab. 
2. From here, select “Run all”.
3. Wait a minute or two and it should be finished!
4. Once the code is finished, your new output file (whose name you specified before you ran the program) will show up on the left next to your input file.
5. From here, you can download the file by double clicking its name.
6. Go into your computer downloads to retrieve the file.
7. Open the file and enjoy!


# Inputs and Outputs

In [1]:
input_file = "Sudan Sample 2.xlsx" #@param {type:"string"}
sheetname = "Sheet4" #@param {type:"string"}
output_file = "sudan_sample_output.xlsx" #@param {type:"string"}

# Imports

In [None]:
!pip install phonetics
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install pyphonetics
!pip install openpyxl

from collections import defaultdict

In [3]:
import doctest
from phonetics import metaphone, nysiis
from fuzzywuzzy import fuzz, process, utils
import math


# Mappings and Datasets

In [4]:
# GEOGRAPHIC TERMS ENGLISH TO ARABIC MAPPING


"""
add to this list of terms as you see fit/encounter new substitutions

this mapping is fed into the tr_geo_terms() function and contributes an accuracy
increase of about 5% by standardizing words into arabic before generating 
phonetic codes

convention is "english": "arabic_xliterated" because of the higher likelihood
that if the english term is used, it will be spelt uniformly... 

-> increases mapping efficacy
"""

GEO_TERMS = {
    'north': 'hamal',
    'south': 'janub',
    'east': 'sharq',
    'west': 'gharb',
    'central': 'wasat',
    'city': 'madina',
    'municipality': 'baladiya',
    'valley': 'wad',
    'river': 'nahr',
    'mountains': 'jabil',
    'desert': 'sahara',
    'sea': 'bahr',
    'new': 'jadid',
    'rural': 'reifi',
    'center': 'wasat,',
    'the': 'al',
    'madeinat' : '',
    'nourth': 'hamal',
    'weast': 'gharb',
    'N ': 'North ',
    'S ': 'South ',
    'C ': 'Central ',
    'W ': 'West ',
    'E ': 'East '
}

In [5]:
# MASTER MAPPING OF LOCALITIES
# Edit here when localities change and shift

MASTER_MAPPING = [
    'Aj Jazirah--Al Hasahisa', 'Aj Jazirah--Al Kamlin',
    'Aj Jazirah--Al Manaqil', 'Aj Jazirah--Al Qurashi',
    'Aj Jazirah--Janub Al Jazirah', 'Aj Jazirah--Medani Al Kubra',
    'Aj Jazirah--Um Algura', 'Aj Jazirah--Sharg Al Jazirah', 'Blue Nile--Baw',
    'Blue Nile--Ed Damazine', 'Blue Nile--Al Kurmuk', 'Blue Nile--Ar Rusayris',
    'Blue Nile--Wad Al Mahi', 'Blue Nile--At Tadamon - BN',
    'Blue Nile--Geisan', 'Central Darfur--Azum', 'Central Darfur--Bendasi',
    'Central Darfur--Mukjar', 'Central Darfur--Gharb Jabal Marrah',
    'Central Darfur--Shamal Jabal Marrah',
    'Central Darfur--Wasat Jabal Marrah', 'Central Darfur--Um Dukhun',
    'Central Darfur--Wadi Salih', 'Central Darfur--Zalingi',
    'East Darfur--Abu Jabrah', 'East Darfur--Abu Karinka',
    'East Darfur--Adila', 'East Darfur--Assalaya', 'East Darfur--Bahr Al Arab',
    "East Darfur--Ad Du'ayn", 'East Darfur--Al Firdous',
    "East Darfur--Shia'ria", 'East Darfur--Yassin', 'Gedaref--Al Fao',
    'Gedaref--Al Qureisha', 'Gedaref--Ar Rahad', 'Gedaref--Al Mafaza',
    'Gedaref--Al Butanah', 'Gedaref--Al Fashaga',
    'Gedaref--Al Galabat Al Gharbyah - Kassab',
    'Gedaref--Galabat Ash-Shargiah', 'Gedaref--Basundah',
    "Gedaref--Gala'a Al Nahal", 'Gedaref--Madeinat Al Gedaref',
    'Gedaref--Wasat Al Gedaref', 'Kassala--Halfa Aj Jadeedah',
    'Kassala--Madeinat Kassala', 'Kassala--Reifi Aroma',
    'Kassala--Reifi Hamashkureib', 'Kassala--Reifi Kassla',
    'Kassala--Reifi Gharb Kassala', 'Kassala--Reifi Khashm Elgirba',
    'Kassala--Reifi Nahr Atbara', 'Kassala--Reifi Shamal Ad Delta',
    'Kassala--Reifi Telkok', 'Kassala--Reifi Wad Elhilaiw', 'Khartoum--Bahri',
    'Khartoum--Khartoum', 'Khartoum--Jebel Awlia', 'Khartoum--Karrari',
    'Khartoum--Um Bada', 'Khartoum--Um Durman', 'Khartoum--Sharg An Neel',
    'River Nile--Abu Hamad', 'River Nile--Al Buhaira', 'River Nile--Atbara',
    'River Nile--Barbar', 'River Nile--Ad Damar', 'River Nile--Al Matama',
    'River Nile--Shendi', 'North Darfur--Al Lait',
    'North Darfur--Dar As Salam', 'North Darfur--Al Fasher',
    'North Darfur--Al Koma', 'North Darfur--Al Malha',
    'North Darfur--As Serief', 'North Darfur--At Tawisha',
    'North Darfur--At Tina', 'North Darfur--Kelemando',
    'North Darfur--Kebkabiya', 'North Darfur--Kernoi', 'North Darfur--Kutum',
    'North Darfur--Melit', 'North Darfur--Saraf Omra', 'North Darfur--Tawila',
    'North Darfur--Um Baru', 'North Darfur--Um Kadadah', 'Northern--Delgo',
    'Northern--Dongola', 'Northern--Al Burgaig', 'Northern--Al Golid',
    'Northern--Ad Dabbah', 'Northern--Halfa', 'Northern--Merwoe',
    'North Kordofan--Bara', 'North Kordofan--Gharb Bara',
    'North Kordofan--Gebrat Al Sheikh', 'North Kordofan--Um Rawaba',
    'North Kordofan--Um Dam Haj Ahmed', 'North Kordofan--Ar Rahad',
    'North Kordofan--Sheikan', 'North Kordofan--Soudari', 'Red Sea--Agig',
    'Red Sea--Al Ganab', "Red Sea--Hala'ib", "Red Sea--Jubayt Elma'aadin",
    'Red Sea--Haya', 'Red Sea--Dordieb', 'Red Sea--Port Sudan',
    'Red Sea--Sawakin', 'Red Sea--Sinkat', 'Red Sea--Tawkar',
    'Sennar--Abu Hujar', 'Sennar--Ad Dali', 'Sennar--Ad Dinder',
    'Sennar--As Suki', 'Sennar--Sharg Sennar', 'Sennar--Sennar',
    'Sennar--Sinja', 'South Darfur--Al Wihda', 'South Darfur--Beliel',
    'South Darfur--Buram', 'South Darfur--Damso', 'South Darfur--Ed Al Fursan',
    'South Darfur--Al Radoum', 'South Darfur--As Salam - SD',
    'South Darfur--Gereida', 'South Darfur--Kas', 'South Darfur--Kateila',
    'South Darfur--Kubum', 'South Darfur--Mershing', 'South Darfur--Nitega',
    'South Darfur--Nyala Janoub', 'South Darfur--Nyala Shimal',
    'South Darfur--Rehaid Albirdi', 'South Darfur--Sharg Aj Jabal',
    'South Darfur--Shattaya', 'South Darfur--As Sunta', 'South Darfur--Tulus',
    'South Darfur--Um Dafoug', 'South Kordofan--Abu Jubayhah',
    'South Kordofan--At Tadamon - SK', 'South Kordofan--Al Quoz',
    'South Kordofan--Dilling', 'South Kordofan--Habila - SK',
    'South Kordofan--Delami', 'South Kordofan--Ar Rashad',
    'South Kordofan--Abu Kershola', 'South Kordofan--Abassiya',
    'South Kordofan--Kadugli', 'South Kordofan--Ar Reif Ash Shargi',
    'South Kordofan--Heiban', 'South Kordofan--Um Durein',
    'South Kordofan--Al Buram', 'South Kordofan--Talawdi',
    'South Kordofan--Al Leri', 'South Kordofan--Ghadeer', 'West Darfur--Beida',
    'West Darfur--Ag Geneina', 'West Darfur--Foro Baranga',
    'West Darfur--Habila - WD', 'West Darfur--Jebel Moon',
    'West Darfur--Kereneik', 'West Darfur--Kulbus', 'West Darfur--Sirba',
    'West Kordofan--Abu Zabad', 'West Kordofan--Al Khiwai',
    'West Kordofan--Abyei PCA area', 'West Kordofan--Abyei',
    'West Kordofan--Al Meiram', 'West Kordofan--Al Dibab',
    'West Kordofan--An Nuhud', 'West Kordofan--As Salam - WK',
    'West Kordofan--Babanusa', 'West Kordofan--Ghubaish',
    'West Kordofan--Al Idia', 'West Kordofan--Keilak',
    'West Kordofan--Al Lagowa', 'West Kordofan--As Sunut',
    'West Kordofan--Wad Bandah', 'White Nile--Ad Diwaim',
    'White Nile--Al Gitaina', 'White Nile--Aj Jabalain',
    'White Nile--As Salam / Ar Rawat', 'White Nile--Kosti', 'White Nile--Guli',
    'White Nile--Um Rimta', 'White Nile--Rabak', 'White Nile--Tendalti'
]

In [6]:
# STATE NAMES

"""
description
"""

STATES = {'White Nile', 'River Nile', 'Northern', 'Central Darfur', 'Red Sea', 'East Darfur', 'North Kordofan', 
          'Sennar', 'South Kordofan', 'West Kordofan', 'Kassala', 'Blue Nile', 'North Darfur', 'South Darfur', 'West Darfur', 'Gedaref', 'Aj Jazirah', 'Khartoum'}

#STATES = { state.split("--")[0] for state in MASTER_MAPPING}


In [7]:
# PROVINCE NAMES

# PROVINCES = defaultdict(list)

# for line in MASTER_MAPPING:
#   state, locality = line.split("--")

#   PROVINCES["state"].append(locality)


PROVINCES = {'Aj Jazirah': ['Al Hasahisa', 'Al Kamlin', 'Al Manaqil', 'Al Qurashi', 'Janub Al Jazirah', 'Medani Al Kubra', 'Um Algura', 'Sharg Al Jazirah'],
          'Blue Nile': ['Baw', 'Ed Damazine', 'Al Kurmuk', 'Ar Rusayris', 'Wad Al Mahi', 'At Tadamon - BN', 'Geisan'],
          'Central Darfur': ['Azum', 'Bendasi', 'Mukjar', 'Gharb Jabal Marrah', 'Shamal Jabal Marrah', 'Wasat Jabal Marrah', 'Um Dukhun', 'Wadi Salih', 'Zalingi'],
          'East Darfur': ['Abu Jabrah', 'Abu Karinka', 'Adila', 'Assalaya', 'Bahr Al Arab', "Ad Du'ayn", 'Al Firdous', "Shia'ria", 'Yassin'],
          'Gedaref': ['Al Fao', 'Al Qureisha', 'Ar Rahad', 'Al Mafaza', 'Al Butanah', 'Al Fashaga', 'Al Galabat Al Gharbyah - Kassab', 'Galabat Ash-Shargiah', 'Basundah', "Gala'a Al Nahal", 'Madeinat Al Gedaref', 'Wasat Al Gedaref'],
          'Kassala': ['Halfa Aj Jadeedah', 'Madeinat Kassala', 'Reifi Aroma', 'Reifi Hamashkureib', 'Reifi Kassla', 'Reifi Gharb Kassala', 'Reifi Khashm Elgirba', 'Reifi Nahr Atbara', 'Reifi Shamal Ad Delta', 'Reifi Telkok', 'Reifi Wad Elhilaiw'],
          'Khartoum': ['Bahri', 'Khartoum', 'Jebel Awlia', 'Karrari', 'Um Bada', 'Um Durman', 'Sharg An Neel'], 
          'River Nile': ['Abu Hamad', 'Al Buhaira', 'Atbara', 'Barbar', 'Ad Damar', 'Al Matama', 'Shendi'], 
          'North Darfur': ['Al Lait', 'Dar As Salam', 'Al Fasher', 'Al Koma', 'Al Malha', 'As Serief', 'At Tawisha', 'At Tina', 'Kelemando', 'Kebkabiya', 'Kernoi', 'Kutum', 'Melit', 'Saraf Omra', 'Tawila', 'Um Baru', 'Um Kadadah'], 
          'Northern': ['Delgo', 'Dongola', 'Al Burgaig', 'Al Golid', 'Ad Dabbah', 'Halfa', 'Merwoe'], 
          'North Kordofan': ['Bara', 'Gharb Bara', 'Gebrat Al Sheikh', 'Um Rawaba', 'Um Dam Haj Ahmed', 'Ar Rahad', 'Sheikan', 'Soudari'], 
          'Red Sea': ['Agig', 'Al Ganab', "Hala'ib", "Jubayt Elma'aadin", 'Haya', 'Dordieb', 'Port Sudan', 'Sawakin', 'Sinkat', 'Tawkar'], 
          'Sennar': ['Abu Hujar', 'Ad Dali', 'Ad Dinder', 'As Suki', 'Sharg Sennar', 'Sennar', 'Sinja'], 
          'South Darfur': ['Al Wihda', 'Beliel', 'Buram', 'Damso', 'Ed Al Fursan', 'Al Radoum', 'As Salam - SD', 'Gereida', 
                           'Kas', 'Kateila', 'Kubum', 'Mershing', 'Nitega', 'Nyala Janoub', 'Nyala Shimal', 'Rehaid Albirdi', 'Sharg Aj Jabal', 'Shattaya', 'As Sunta', 'Tulus', 'Um Dafoug'], 
          'South Kordofan': ['Abu Jubayhah', 'At Tadamon - SK', 'Al Quoz', 'Dilling', 'Habila - SK', 'Delami', 'Ar Rashad', 
                             'Abu Kershola', 'Abassiya', 'Kadugli', 'Ar Reif Ash Shargi', 'Heiban', 'Um Durein', 'Al Buram', 'Talawdi', 'Al Leri', 'Ghadeer'], 
          'West Darfur': ['Beida', 'Ag Geneina', 'Foro Baranga', 'Habila - WD', 'Jebel Moon', 'Kereneik', 'Kulbus', 'Sirba'], 
          'West Kordofan': ['Abu Zabad', 'Al Khiwai', 'Abyei PCA area', 'Abyei', 'Al Meiram', 'Al Dibab', 'An Nuhud', 'As Salam - WK', 'Babanusa', 'Ghubaish', 'Al Idia', 'Keilak', 'Al Lagowa', 'As Sunut', 'Wad Bandah'], 
          'White Nile': ['Ad Diwaim', 'Al Gitaina', 'Aj Jabalain', 'As Salam / Ar Rawat', 'Kosti', 'Guli', 'Um Rimta', 'Rabak', 'Tendalti']}

# assert dict(PROVINCES) == PROVINCES_0

# Processing Fuctions

In [None]:
def tr_geo_terms(s: str):
  """replace any instances of english geographical terms with arabic

  >>> tr_geo_terms("River Atbara")
  'nahr Atbara'

  >>> tr_geo_terms("North Dalta")
  'hamal Dalta'

  >>> tr_geo_terms('Saraf Omra')
  'Saraf Omra'
  """
  
  return " ".join(
      GEO_TERMS.get(fragment.lower(), fragment) for fragment in s.split()
  )

doctest.testmod()

In [9]:
def confidence_score(scores):

  # Set maxSim to the best similarity found
  maxSim = max(scores)

  # Calculate the denominator
  den = sum([math.exp(1 - math.pow(maxSim / score, 3)) for score in scores])

  # Calculate the confidence score for each score
  confidences = [math.exp(1 - math.pow(maxSim / score, 3)) / den for score in scores]

  return confidences

In [14]:
def custom_scorer(s1_raw: str, s2_raw: str) -> int:
  """
  return a measure of the sequences' similarity between 0 and 100, using different algorithms.

  passed as an argument to process.extract* functions in order to define a custom 
  set of criteria for determining string similarity

  steps:
  1. process strings (remove whitespace, non-alpha, etc.) and check for corner cases
  2. replace geo terms if present
  3. compute baseline score using only fuzzy match
  4. compute similarity score using fuzzy on phonetic codes
  5. return a weighted average of phonetics and fuzzy

  areas for improvement: missing words in name,
  differentiating same name in different states

  """

  s1, s2 = tuple(map(utils.full_process, (s1_raw, s2_raw)))

  if not s1 or not s2:
    print(s1_raw, s2_raw)
    return 0

  s1, s2 = tr_geo_terms(s1), tr_geo_terms(s2) # 3. substitution of terms added 5 percent accuracy 

  fuzzy_similarity = fuzz.token_sort_ratio(s1,s2) # 1. added 11 percent accuracy

  phonetic_similarity = fuzz.ratio(
      metaphone(s1), metaphone(s2)
  )
  
  # 2. weighted factor added 5%
  return int(fuzzy_similarity * 0.67 + phonetic_similarity * 0.33)

In [None]:
help(utils.full_process)

In [12]:
def get_prediction(inputs, standards):
  """
  given a list of inputs, returns a tuple-list of predictions and confidence score
  """

  output = []

  for locality in inputs:
    guesses = process.extractBests(locality, standards, scorer=custom_scorer, limit=7)

    confidences = confidence_score(
        list(map(lambda x: x[-1], guesses))
    )

    guess = guesses[0][0]
    confidence = confidences[0]

    #print(f"predicted *{guess}* with confidence of {confidences[0] * 100:.2f}% {'correctly' if guess == labeled_inputs.get(locality) else 'incorrectly'}")

    output.append(
        (guess, confidence)
    )

  return output

# Implementation

In [16]:
import openpyxl
from openpyxl import load_workbook
from openpyxl import styles
from openpyxl.styles import Color, PatternFill, Font, Border
from copy import copy

dataframe = load_workbook(filename=input_file)

for sheet in range(len(dataframe.sheetnames)):
    if dataframe.sheetnames[sheet] == sheetname:
        break

dataframe.active = sheet
sheet = dataframe.active

header = True
data   = []
other  = []
count = 0
for value in sheet.iter_rows(min_col=1, max_col=2, values_only=True):
  if header:
    header = False
    continue
  if value[0] == None or value[1] == None:
    break
  
  province = value[0]
  district = value[1]

  other.append(province)
  data.append(district)

# print(data)
# print(other)
# print(len(data))

sheet.insert_cols(1, 1)
column_number = 1
column = str(chr(64 + column_number))
sheet.column_dimensions[column].width = 30

sheet.cell(row=1,column=1).font = copy(sheet['B1'].font)
sheet.cell(row=1,column=1).fill = copy(sheet['B1'].fill)

sheet['A1'] = 'Mappings'

upper   = get_prediction(other, STATES)

red = PatternFill(start_color='FF0000', end_color='FF0000', fill_type = 'solid')
orange = PatternFill(start_color='FF8000', end_color='FF0000', fill_type = 'solid')

index = 2

for ip in range(len(data)):

  # Province > District
  province = upper[ip]
  district = data[ip]

  result = get_prediction([district], PROVINCES[province[0]])

  sheet[f'A{index}'] = f'{province[0]}--{result[0][0]}'
  
  if result[0][1] < 0.20:
    sheet.cell(row=index,column=1).fill = red
  elif result[0][1] < 0.30:
    sheet.cell(row=index,column=1).fill = orange
  index += 1

dataframe.save(filename=output_file)

ZeroDivisionError: ignored