In [None]:
from __future__ import annotations
import json
from typing import Literal, TypedDict
from thefuzz import fuzz
from pandas import Series, DataFrame
from constructor import DataframeConstructor

from interfaces import InnerPdData, LangNameData, OriginalCiaLanguageData
import pandas as pd


from FuzzySearcher import FuzzySearcher, FuzzySearcherData


from typing import Any, TypedDict




# Shared DataframeConstructor instance; the loading statements below call its
# create_* methods to build the notebook's DataFrames.
df_constructor = DataframeConstructor()


def init_pd_settings():
    """Set the pandas display options for rows, columns, width and column width to ``None``.

    With ``None`` pandas stops truncating rows, columns and cell contents when
    a DataFrame is rendered.
    """
    display_options = (
        "display.max_rows",
        "display.max_columns",
        "display.width",
        "display.max_colwidth",
    )
    for option_name in display_options:
        pd.set_option(option_name, None)
# Apply the display options before any DataFrame is rendered.
init_pd_settings()

# NOTE(review): create_data_for_base_df, getLangFiles, create_data_for_new_cia_df
# and get_wiki_files are not defined or imported in the visible cells. Confirm
# where they come from; on a fresh kernel these calls raise NameError unless
# they are defined elsewhere.

# create_base_df returns a mapping that holds two frames: "base_df" and "df".
result = df_constructor.create_base_df(create_data_for_base_df())
base_df = result["base_df"]
df = result["df"]

# Language-name table; later cells read its "common" and "official" columns.
lang_name_df = df_constructor.create_rest_api_language_name_dataframe(getLangFiles())

# Language tables whose country keys are cross-checked at the end of the notebook.
new_cia_language_df = df_constructor.create_new_cia_language_dataframe(create_data_for_new_cia_df())
original_wiki_language_df = DataFrame(get_wiki_files())

# Cleaning

In [None]:
# Keyword arguments for Series.replace: the listed regex patterns (the words
# "the", "of", "The" and any comma) are replaced with the empty string.
default_args = dict(
    to_replace=[r"\bthe\b", r"\bof\b", r"\bThe\b", ","],
    value="",
    regex=True,
)

# Keep the names from base_df as identifiers before the display names are cleaned.
df["wiki_country_id"] = base_df["wiki_country"]
df["cia_country_id"] = base_df["cia_country"]

# Apply the same replacement to every country-name column, in place.
columns_to_clean = (
    (df, "cia_country"),
    (df, "wiki_country"),
    (lang_name_df, "common"),
    (lang_name_df, "official"),
    (new_cia_language_df, "country"),
    (original_wiki_language_df, "Country/Region"),
)
for frame, column in columns_to_clean:
    frame[column] = frame[column].replace(**default_args)

df

# Join CIA and wiki data

- Filter the data with the base ratio strategy first, then apply the token set ratio strategy
- Join the data afterwards

In [None]:
# Re-evaluate the base "ratio" algorithm for every CIA/wiki country pair.

def _base_ratio(row):
    """Score one row's CIA country name against its wiki country name with the "ratio" strategy."""
    return FuzzySearcher("ratio").run(row["cia_country"], row["wiki_country"])


df["similarity"] = df.apply(_base_ratio, axis=1)
df

In [None]:
# Given how languages have many unique words, token set ratio is unlikely to yield a false positive. All countries that pass are most likely equivalent.


def filter_token_set_less_than_100(row: pd.Series):
    """Return a copy of ``row`` whose ``similarity`` is the token-set ratio score.

    Despite its name, this function does not filter anything: it scores
    ``row["cia_country"]`` against ``row["wiki_country"]`` with the
    ``"token_set_ratio"`` strategy and stores the result under
    ``"similarity"``, replacing whatever value was there before.

    Args:
        row: One row of the CIA/wiki comparison frame; must provide the
            ``cia_country`` and ``wiki_country`` entries.

    Returns:
        A new Series equal to ``row`` except for the ``similarity`` entry.
    """
    ratio = FuzzySearcher("token_set_ratio").run(row["cia_country"], row["wiki_country"])

    # pandas does not support UDFs that mutate the object handed to them by
    # DataFrame.apply, so write the score into a copy instead of ``row``.
    rescored = row.copy()
    rescored["similarity"] = ratio
    return rescored


df = df.apply(filter_token_set_less_than_100, axis="columns")
df.sort_values(by="similarity")

In [None]:
# Add exceptions after manual review

# Index labels of the rows accepted as exceptions after manual review.
# NOTE(review): these are labels of df's index, so they only point at the
# intended countries while the upstream row order stays the same. Confirm
# after any data refresh.
exceptions = [117, 187, 213, 182, 35]

# Vectorised membership test on the index. It marks the same rows as checking
# `row.name in exceptions` row by row, and still yields a boolean column when
# df is empty.
df["is_exception"] = df.index.isin(exceptions)
df

In [None]:
# Policy: a CIA country has a wiki equivalent when it was flagged as a manual
# exception or when its similarity score is exactly 100.
# Computed column-wise rather than with a row-wise apply; the element-wise
# result is the same.
df['has_wiki_country_equivalent'] = df["is_exception"].eq(True) | df["similarity"].eq(100)
df

# Join previous result with the language-name table

- Same general strategy as the first join

In [None]:
# Display the language-name table before it is matched against the CIA countries.
lang_name_df

In [None]:

# initialize dependencies
def get_max_country(row):
    """Pick the country name that belongs to the highest of the four fuzzy scores.

    Args:
        row: Mapping-like record with the ``common_score``,
            ``common_score_token_set``, ``official_score`` and
            ``official_score_token_set`` scores plus the ``common_country``
            and ``official_country`` names.

    Returns:
        ``row['common_country']`` when one of the two common scores equals the
        overall maximum (ties go to the common name), otherwise
        ``row['official_country']``.
    """
    common_scores = (row['common_score'], row['common_score_token_set'])
    official_scores = (row['official_score'], row['official_score_token_set'])
    best = max(*common_scores, *official_scores)

    common_wins = any(score == best for score in common_scores)
    return row['common_country'] if common_wins else row['official_country']


data_list:list[InnerPdData] = []


# Score every CIA country against every row of the language-name table.
# Each pair gets four scores: the "ratio" and "token_set_ratio" strategies,
# each run against the common and the official name.

for _, cia_row in df.iterrows():

    cia_name: str = cia_row["cia_country"]
    cia_id = cia_row["cia_country_id"]

    data_list.extend(
        {
            "common_score": FuzzySearcher("ratio").run(cia_name, lang_row["common"]),
            "common_country": lang_row["common"],
            "common_score_token_set": FuzzySearcher("token_set_ratio").run(cia_name, lang_row["common"]),
            "compared_country": cia_name,
            "official_score": FuzzySearcher("ratio").run(cia_name, lang_row["official"]),
            "official_score_token_set": FuzzySearcher("token_set_ratio").run(cia_name, lang_row["official"]),
            "official_country": lang_row["official"],
            "compared_country_id": cia_id,
        }
        for _, lang_row in lang_name_df.iterrows()
    )


fuzzy_score_df = DataFrame(data_list)

# Label each pair with its best score and the name that produced it.
score_columns = ['common_score', 'common_score_token_set', 'official_score', 'official_score_token_set']
fuzzy_score_df['max_country'] = fuzzy_score_df.apply(get_max_country, axis=1)
fuzzy_score_df['max_score'] = fuzzy_score_df[score_columns].max(axis=1)
fuzzy_score_df



In [None]:
# Aggregate view: keep, for every compared_country_id, the single row with the
# highest max_score (the first such row when several tie).
#
# idxmax + .loc replaces groupby(...).apply(lambda x: x.nlargest(1, ...)).
# That form operates on the grouping column, which pandas deprecated in 2.2.
# Once grouping columns are excluded from the applied frame,
# reset_index(drop=True) would discard compared_country_id, and the merge in
# the join cell depends on that column.
# NOTE(review): relies on fuzzy_score_df having unique index labels and
# non-missing max_score values. Both hold for the frame built in the cell above.
best_row_labels = fuzzy_score_df.groupby("compared_country_id")["max_score"].idxmax()
top_df = fuzzy_score_df.loc[best_row_labels].reset_index(drop=True)
top_df

In [None]:
# Countries whose best score is anything other than a perfect 100, best first.

top_df.loc[lambda frame: frame["max_score"] != 100].sort_values(
    by="max_score",
    ascending=False,
)

In [None]:
# Policy: a best score at or above MATCH_THRESHOLD counts as a match.
MATCH_THRESHOLD = 75

# Column-wise comparison in place of a row-wise apply; same element-wise result.
top_df["is_match"] = top_df["max_score"] >= MATCH_THRESHOLD
top_df

In [None]:
# Outer-join the best language-name match per country (top_df) onto the
# CIA/wiki comparison frame (df), keyed on the CIA country id.

joined_df = top_df.merge(
    df,
    how="outer",
    left_on="compared_country_id",
    right_on="cia_country_id",
)
joined_df

In [None]:
# Sanity check: the country names in new_cia_language_df went through the same
# cleaning as joined_df["cia_country"], so each should find a near-identical
# key there. Low top_similarity values point at keys that no longer line up.

def apply_fuzzy_search(row: Series, choices=None):
    """Return a copy of ``row`` annotated with its closest CIA country key.

    Args:
        row: One row of ``new_cia_language_df``; ``row["country"]`` is the
            name that gets searched.
        choices: Candidate names to search against. Defaults to
            ``joined_df["cia_country"].tolist()`` when omitted, so existing
            one-argument calls keep working.

    Returns:
        A new Series equal to ``row`` plus ``top_similarity`` and
        ``top_country``, taken from the top result of a ``"ratio"`` search.
    """
    if choices is None:
        choices = joined_df["cia_country"].tolist()

    results = FuzzySearcher("ratio").run_against_multiple(row["country"], choices)

    # pandas does not support UDFs that mutate the object handed to them by
    # DataFrame.apply, so annotate a copy instead of ``row``.
    annotated = row.copy()
    annotated["top_similarity"] = results.top_result.similarity
    annotated["top_country"] = results.top_result.right
    return annotated


# Build the candidate list once instead of once per row.
cia_country_choices = joined_df["cia_country"].tolist()

new_cia_language_df = new_cia_language_df.apply(apply_fuzzy_search, axis=1, choices=cia_country_choices)
new_cia_language_df.sort_values(by="top_similarity")

In [None]:
# Sanity check: the country names in original_wiki_language_df went through
# the same cleaning as joined_df["wiki_country"], so each should find a
# near-identical key there. Low top_similarity values point at keys that no
# longer line up and would break the join.
#
# NOTE(review): this rebinds the name apply_fuzzy_search, which the previous
# cell also defines with a different lookup column. Consider distinct names.

def apply_fuzzy_search(row: Series, choices=None):
    """Return a copy of ``row`` annotated with its closest wiki country key.

    Args:
        row: One row of ``original_wiki_language_df``. The searched value is
            taken positionally from ``row.iloc[1]``.
        choices: Candidate names to search against. Defaults to
            ``joined_df["wiki_country"].tolist()`` when omitted, so existing
            one-argument calls keep working.

    Returns:
        A new Series equal to ``row`` plus ``top_similarity`` and
        ``top_country``, taken from the top result of a ``"ratio"`` search.
    """
    if choices is None:
        choices = joined_df["wiki_country"].tolist()

    # NOTE(review): iloc[1] is presumably the "Country/Region" column cleaned
    # earlier. Confirm, and prefer the label over the position.
    results = FuzzySearcher("ratio").run_against_multiple(row.iloc[1], choices)

    # pandas does not support UDFs that mutate the object handed to them by
    # DataFrame.apply, so annotate a copy instead of ``row``.
    annotated = row.copy()
    annotated["top_similarity"] = results.top_result.similarity
    annotated["top_country"] = results.top_result.right
    return annotated


# Build the candidate list once instead of once per row.
wiki_country_choices = joined_df["wiki_country"].tolist()

original_wiki_language_df = original_wiki_language_df.apply(apply_fuzzy_search, axis=1, choices=wiki_country_choices)
original_wiki_language_df.sort_values(by="top_similarity")
