# Province Gender Available - Entity Linking with Preprocessing

In [1]:
import gzip
import json
import pandas as pd
from fuzzywuzzy import fuzz
from collections import Counter
from string import punctuation



## Read Entities & Create Ids for each Entity

In [2]:
# Load the entity spreadsheet; its column headers are used as the entity names in the next cell.
entity_linking_list = pd.read_excel("entity_linking_list.xlsx")

In [3]:
# Entity names are the column headers of the spreadsheet, in file order.
entities = entity_linking_list.columns.tolist()

In [4]:
# Assign 1-based ids in column order. Despite its name, this dict maps entity name -> id.
ids_to_entities = {entity_name: entity_id for entity_id, entity_name in enumerate(entities, start=1)}
ids_to_entities

{'Ahmet Davutoğlu': 1,
 'Ali Babacan': 2,
 'Ekrem İmamoğlu': 3,
 'Gültekin Uysal': 4,
 'Kemal Kılıçdaroğlu': 5,
 'Mansur Yavaş': 6,
 'Meral Akşener': 7,
 'Recep Tayyip Erdoğan': 8,
 'Selahattin Demirtaş': 9,
 'Temel Karamollaoğlu': 10}

In [5]:
# Canonical entity names, in id order (iterating a dict yields its keys in insertion order).
names = [entity_name for entity_name in ids_to_entities]
names

['Ahmet Davutoğlu',
 'Ali Babacan',
 'Ekrem İmamoğlu',
 'Gültekin Uysal',
 'Kemal Kılıçdaroğlu',
 'Mansur Yavaş',
 'Meral Akşener',
 'Recep Tayyip Erdoğan',
 'Selahattin Demirtaş',
 'Temel Karamollaoğlu']

In [6]:
# One record per entity. Iterate over `names` itself rather than a hard-coded range(10),
# so the cell keeps working when entities are added to or removed from the spreadsheet.
# The only variation recorded for an entity is its canonical form.
entity_linking_list_json = [
    {"entity_id": entity_id, "entity_canonical_form": name, "entity_variations": name}
    for entity_id, name in enumerate(names, start=1)
]
entity_linking_list_json

[{'entity_id': 1,
  'entity_canonical_form': 'Ahmet Davutoğlu',
  'entity_variations': 'Ahmet Davutoğlu'},
 {'entity_id': 2,
  'entity_canonical_form': 'Ali Babacan',
  'entity_variations': 'Ali Babacan'},
 {'entity_id': 3,
  'entity_canonical_form': 'Ekrem İmamoğlu',
  'entity_variations': 'Ekrem İmamoğlu'},
 {'entity_id': 4,
  'entity_canonical_form': 'Gültekin Uysal',
  'entity_variations': 'Gültekin Uysal'},
 {'entity_id': 5,
  'entity_canonical_form': 'Kemal Kılıçdaroğlu',
  'entity_variations': 'Kemal Kılıçdaroğlu'},
 {'entity_id': 6,
  'entity_canonical_form': 'Mansur Yavaş',
  'entity_variations': 'Mansur Yavaş'},
 {'entity_id': 7,
  'entity_canonical_form': 'Meral Akşener',
  'entity_variations': 'Meral Akşener'},
 {'entity_id': 8,
  'entity_canonical_form': 'Recep Tayyip Erdoğan',
  'entity_variations': 'Recep Tayyip Erdoğan'},
 {'entity_id': 9,
  'entity_canonical_form': 'Selahattin Demirtaş',
  'entity_variations': 'Selahattin Demirtaş'},
 {'entity_id': 10,
  'entity_canoni

In [7]:
# Persist the entity records as a single JSON array.
serialized_entities = json.dumps(entity_linking_list_json)
with open("entity_linking_list_json.json", "w") as entity_file:
    entity_file.write(serialized_entities)

## Read Tweets with Entities

In [8]:
# Read the gzipped file into memory as a list of raw byte lines (one element per line).
with gzip.open("all_tweets_unique_id_txt-220614_wEntities.json.gz", "rb") as tweets_file:
    all_tweets_ent = list(tweets_file)

In [9]:
# Parse every raw line as JSON, replacing it in the same list slot.
for i, raw_line in enumerate(all_tweets_ent):
    all_tweets_ent[i] = json.loads(raw_line)

In [10]:
# Number of tweets loaded (one list element per line of the input file).
len(all_tweets_ent)

4615920

## Preprocessing

In [11]:
def preprocessing(x):
    """
    Normalize a name or entity string before fuzzy matching.

    Steps, in order:
    - Lowercase Turkish capitals explicitly (Ç, Ğ, I, İ, Ö, Ş, Ü), so that
      "I" becomes "ı" and "İ" becomes "i"
    - Lowercase all remaining characters
    - Remove leading and trailing punctuation, including typographic single quotes
    - Remove everything from the first apostrophe (' or ’) onwards
    - Remove every character that is neither alphabetic nor a space

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        Normalized text. Spaces are kept as-is and are not collapsed or trimmed.
    """

    # Must run before str.lower(), which would map "I" to "i" and "İ" to "i" + combining dot.
    turkish_lowercase = str.maketrans("ÇĞIİÖŞÜ", "çğıiöşü")
    x = x.translate(turkish_lowercase).lower()

    # string.punctuation is ASCII-only. Without the typographic quotes, a leading "’"
    # would survive the strip and the apostrophe split below would return "".
    x = x.strip(punctuation + "‘’")

    # Drop suffixes attached with an apostrophe, e.g. "erdoğan'ın" -> "erdoğan".
    for apostrophe in ("'", "’"):
        x = x.split(apostrophe)[0]

    return "".join(letter for letter in x if letter.isalpha() or letter == " ")

In [12]:
names_processed = [preprocessing(name) for name in names]
names_processed

['ahmet davutoğlu',
 'ali babacan',
 'ekrem imamoğlu',
 'gültekin uysal',
 'kemal kılıçdaroğlu',
 'mansur yavaş',
 'meral akşener',
 'recep tayyip erdoğan',
 'selahattin demirtaş',
 'temel karamollaoğlu']

## Preprocessing & Matching Entities with Names

- Similarity measure: `fuzz.ratio` from *fuzzywuzzy*; an entity is linked to a name when the ratio is at least **80** (out of 100)

In [13]:
# Minimum fuzz.ratio score for an entity to be linked to a name
SIMILARITY_THRESHOLD = 80
total_tweets = len(all_tweets_ent)

# NOTE: this cell appends ids to the entity lists in place, so re-running it
# without reloading the tweets appends the ids a second time.
# For each tweet
for i, tweet in enumerate(all_tweets_ent):
    if i % 250_000 == 0:
        print(f"{i}/{total_tweets} | {(i+1)/total_tweets*100:.2f}%")
    # For each entity in a tweet
    for entity in tweet["entities"]:
        # Preprocess the entity text once instead of once per candidate name
        entity_text = preprocessing(entity[1])
        # For each (preprocessed) name; ids are 1-based positions in names_processed
        for name_id, name in enumerate(names_processed, start=1):
            # If the similarity between name & entity reaches the threshold
            if fuzz.ratio(name, entity_text) >= SIMILARITY_THRESHOLD:
                # Append the id of related name
                entity.append(name_id)
        # If there is no third element in the entity list, then append 0
        if len(entity) < 3:
            entity.append(0)
# Final progress line; written without a division so an empty tweet list does not raise
print(f"{total_tweets}/{total_tweets} | 100.00%")

0/4615920 | 0.00%
250000/4615920 | 5.42%
500000/4615920 | 10.83%
750000/4615920 | 16.25%
1000000/4615920 | 21.66%
1250000/4615920 | 27.08%
1500000/4615920 | 32.50%
1750000/4615920 | 37.91%
2000000/4615920 | 43.33%
2250000/4615920 | 48.74%
2500000/4615920 | 54.16%
2750000/4615920 | 59.58%
3000000/4615920 | 64.99%
3250000/4615920 | 70.41%
3500000/4615920 | 75.82%
3750000/4615920 | 81.24%
4000000/4615920 | 86.66%
4250000/4615920 | 92.07%
4500000/4615920 | 97.49%
4615920/4615920 | 100.00%


In [14]:
# Write one JSON document per line. Mode "wb" rather than "ab": the cell writes the
# complete dataset, so re-running it must replace the file, not append a duplicate copy.
with gzip.open("all_tweets_unique_id_txt-220614_wEntityLinking.json.gz", "wb") as f:
    for tweet in all_tweets_ent:
        f.write(f"{json.dumps(tweet)}\n".encode('utf-8'))