# Surgeo Code Walkthrough with Additional Steps
<b> Emilio Ramos Monzalvo - 09/01/2021 </b>

## Config

In [1]:
import pathlib
import string
import sys
from tqdm import tqdm

import numpy as np
import pandas as pd

In [2]:
import geopandas

from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

import folium
from folium.plugins import FastMarkerCluster
from folium.utilities import if_pandas_df_convert_to_numpy, validate_location

ModuleNotFoundError: No module named 'geopandas'

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # cosine matching

from fuzzywuzzy import fuzz
import fuzzywuzzy.process as fuzz_process

## Global Variables

In [None]:
# Race/ethnicity category labels. compute_BIFSG_probs selects these columns
# from every probability table, so each table must expose all of them.
RACE_COLS = ['white', 'black', 'api', 'native', 'multiple', 'hispanic']

## Read in Data

### Probability of First Name Given Race

In [None]:
# Load the first-name table, indexed by the 'name' column.
# keep_default_na=False together with na_values=[''] means only an empty
# cell is read as missing, so names spelled like "NA" or "NAN" stay strings.
fn_g_r = pd.read_csv(
            'Data/prob_first_name_given_race_harvard.csv',
            index_col='name',
            na_values=[''],
            keep_default_na=False,
        )
# Quick sanity check of the table size, then preview the first rows.
print('Number of Rows: ', len(fn_g_r))
fn_g_r.head()

### Probability of Race Given Surname

In [None]:
# Load the surname table, indexed by the 'name' column.
# keep_default_na=False together with na_values=[''] means only an empty
# cell is read as missing, so surnames spelled like "NA" or "NULL" stay strings.
r_g_sn = pd.read_csv(
            'Data/prob_race_given_surname_2010.csv',
            index_col='name',
            na_values=[''],
            keep_default_na=False,
        )
# Quick sanity check of the table size, then preview the first rows.
print('Number of Rows: ', len(r_g_sn))
r_g_sn.head()

### Probability of ZCTA Given Race

In [None]:
# Load the ZCTA table, indexed by the 'zcta5' column.
# Only an empty cell is read as missing (keep_default_na=False, na_values=['']).
# NOTE(review): the index dtype depends on how pandas parses 'zcta5'; it is
# normalized to zero-padded strings later by preprocess_zctas.
zcta_g_race = pd.read_csv(
            'Data/prob_zcta_given_race_2010.csv',
            index_col='zcta5',
            na_values=[''],
            keep_default_na=False,
        )
# Quick sanity check of the table size, then preview the first rows.
print('Number of Rows: ', len(zcta_g_race))
zcta_g_race.head()

## PreProcessing

### Names (First and Last Names)

<b> Here we would need to add a way to approximate how similar a name is to another if it does not exist in the table. </b>
* Use fuzzy matching to get the closest name to an already existing name.
    - Inefficient computation time (How many names will we be processing? I.e., is computation time important?)
    - Does not take the roots of the names into consideration; it only finds the closest related name instead.
* Use bigrams or other methods to include the roots of the names for better matching.
    - Cosine matching using bigrams:
    - https://towardsdatascience.com/surprisingly-effective-way-to-name-matching-in-python-1a67328e670e

In [None]:
def preprocess_names(names: pd.Series) -> pd.Series:
    """Normalize a Series of personal names for table lookups.

    Missing values become empty strings; digits, punctuation and whitespace
    are deleted; the text is upper-cased; and finally a trailing JR, SR, III
    or IV is removed, in that order.

    Args:
        names: Raw names. Non-string values are converted with ``str``.

    Returns:
        A new Series named ``'name'`` carrying the index of ``names``.
    """
    # A single translation table deletes every unwanted character in one pass.
    drop_chars = string.digits + string.punctuation + string.whitespace
    deletion_table = str.maketrans('', '', drop_chars)

    # Generational suffixes, stripped one after another from the end.
    # NOTE(review): whitespace and punctuation are already gone at this point,
    # so a name that merely ends in these letters is shortened as well.
    suffix_patterns = (
        r'\s?J\.*?R\.*\s*?$',
        r'\s?S\.*?R\.*\s*?$',
        r'\s?III\s*?$',
        r'\s?IV\s*?$',
    )

    # Only true missing values are blanked; a literal "NAN" is a valid name.
    cleaned = names.fillna('').astype(str)
    cleaned = cleaned.str.translate(deletion_table).str.upper()
    for pattern in suffix_patterns:
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    cleaned.name = 'name'
    return cleaned

In [None]:
# Pre Process / Normalize Names
# Work on copies so the raw tables stay untouched, and replace each index
# with its normalized form so lookups use the same spelling rules as the
# names passed in later.
# NOTE(review): normalization can map distinct raw names to the same key;
# confirm the resulting indexes have no duplicates before merging on them.
fn_g_r_norm = fn_g_r.copy()
fn_g_r_norm.index = preprocess_names(fn_g_r_norm.index.to_series())

r_g_sn_norm = r_g_sn.copy()
r_g_sn_norm.index = preprocess_names(r_g_sn_norm.index.to_series())

### ZCTAS

<b> What if the address provided does not have a ZIP Code? Or what if its ZIP Code does not appear in the data used to create the population summary?</b>

In [None]:
def preprocess_zctas(zcta: pd.Series) -> pd.Series:
    """Convert ZCTA codes to stripped, zero-padded 5-character strings.

    Args:
        zcta: Codes as strings or numbers.

    Returns:
        A new Series named ``'zcta5'``. It is rebuilt from ``zcta.values``,
        so it has a fresh positional index rather than the index of ``zcta``.
    """
    as_text = pd.Series(zcta.values, dtype=str)
    # Left-pad so codes that lost their leading zeros are five characters again.
    padded = as_text.str.strip().str.zfill(5)
    return padded.rename('zcta5')

In [None]:
# Pre Process / Normalize Zip Codes
# Copy the raw table and replace its index with zero-padded string ZCTAs so
# it can be matched against the normalized codes built from addresses.
zcta_g_race_norm = zcta_g_race.copy()
zcta_g_race_norm.index = preprocess_zctas(zcta_g_race_norm.index.to_series())

## Get Probabilities Based on Names

### Get the Most Similar Name Already Known

In [None]:
def preprocess_names_tfidf(new_names: pd.Series, old_names: pd.Series, NGRAMS=2):
    """Vectorize names as TF-IDF weights over character n-grams.

    The vectorizer is fitted on ``old_names`` only, so ``new_names`` are
    encoded with the vocabulary learned from the known names.

    Args:
        new_names: Names that need a match.
        old_names: Known names used to fit the vocabulary.
        NGRAMS: Length of the character n-grams (2 = bigrams).

    Returns:
        Tuple ``(tf_idf_matrix_new, tf_idf_matrix_old)``; note the new-name
        matrix comes first.
    """
    vectorizer = TfidfVectorizer(
        analyzer='char',
        ngram_range=(NGRAMS, NGRAMS),
        # Document-frequency cut-offs applied while fitting on old_names.
        min_df=3,
        max_df=0.3,
        lowercase=False,
    )
    old_matrix = vectorizer.fit_transform(old_names)
    new_matrix = vectorizer.transform(new_names)

    return new_matrix, old_matrix

def cosine_similarity_topn(A: csr_matrix, B: csr_matrix, topn: int = 10, lower_bound: int=0) -> csr_matrix:
    """Multiply ``A`` by ``B``, keeping at most ``topn`` results per row of ``A``.

    The work is delegated to ``ct.sparse_dot_topn``, which fills the
    pre-allocated output buffers passed to it.

    Args:
        A: Left matrix; converted to CSR if it is not already.
        B: Right matrix; converted to CSR if it is not already.
        topn: Number of result slots reserved per row of ``A``.
        lower_bound: Forwarded unchanged to ``ct.sparse_dot_topn``
            (presumably the minimum product value to keep; verify against
            the sparse_dot_topn documentation).

    Returns:
        CSR matrix of shape ``(A.shape[0], B.shape[1])``.
        NOTE(review): the products are cosine similarities only if the rows
        being compared are L2-normalized; confirm for the inputs used.
    """
    index_type = np.int32

    # tocsr() is free when the input is already CSR.
    left = A.tocsr()
    right = B.tocsr()
    n_rows, _ = left.shape
    _, n_cols = right.shape

    # Output buffers: room for topn entries per row.
    capacity = n_rows * topn
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        topn,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))
    
def get_matches_df(sparse_matrix: csr_matrix, name_vector_a: pd.Series, 
                   name_vector_b: pd.Series , topn: int=-1,
                  max_match=100) -> pd.DataFrame:
    """Unpack the non-zero entries of a similarity matrix into a DataFrame.

    Row ``i`` of ``sparse_matrix`` corresponds to the ``i``-th element of
    ``name_vector_a`` and column ``j`` to the ``j``-th element of
    ``name_vector_b``. Lookups are positional, so the index labels of the
    name vectors do not matter.

    Args:
        sparse_matrix: Similarity scores, rows = names A, columns = names B.
        name_vector_a: Names for the matrix rows (reported as ``match``).
        name_vector_b: Names for the matrix columns (reported as ``name``).
        topn: If positive, unpack only the first ``topn`` non-zero entries
            (capped at the number available); otherwise unpack all of them.
        max_match: Keep only rows whose score is ``<= max_match``; pass a
            value below 100 to drop exact matches.

    Returns:
        DataFrame with columns ``name``, ``match`` and ``match_score``
        (similarity * 100, floored), sorted by name and then by descending
        score.
    """
    # Take coordinates and values from the same COO view so they cannot
    # drift apart if the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows, cols, scores = coo.row[keep], coo.col[keep], coo.data[keep]

    if topn > 0:
        # Slicing caps the request at the number of entries that exist.
        rows, cols, scores = rows[:topn], cols[:topn], scores[:topn]

    names_a = np.asarray(name_vector_a, dtype=object)
    names_b = np.asarray(name_vector_b, dtype=object)

    matches = pd.DataFrame({
        'name': names_b[cols],
        'match': names_a[rows],
        'match_score': np.floor(scores * 100).astype(float),
    })

    return (
        matches.loc[matches['match_score'] <= max_match]
        .sort_values(['name', 'match_score'], ascending=[True, False])
        .reset_index(drop=True)
    )

def get_fuzzy_matches(old_names: pd.Series, new_names: pd.Series, cosine_match: pd.DataFrame, topn: int = 1, min_score:int = 0) -> pd.DataFrame:
    """Return the best fuzzy matches among the known names for each new name.

    For every new name the candidates are the cosine matches scoring above
    50, provided there are at least 15 of them; otherwise the name is
    compared against all of ``old_names``.

    Args:
        old_names: Existing (known) names.
        new_names: New names that need matching.
        cosine_match: Frame with columns ``name``, ``match`` and
            ``match_score``, used to shortlist candidates.
        topn: Number of results kept for each new name.
        min_score: Only matches scoring strictly above this value are kept.

    Returns:
        DataFrame with columns ``name``, ``match`` and ``match_score``, sorted
        by name and descending score, at most ``topn`` rows per name.
    """
    # Build the shortlist lookup once instead of re-filtering the whole
    # cosine frame for every new name.
    strong = cosine_match.loc[cosine_match['match_score'] > 50]
    shortlist_by_name = {
        name: group.values
        for name, group in strong.groupby('name')['match']
    }
    all_old_names = old_names.values

    # Collect plain records and build the frame once; growing a DataFrame
    # row by row is slow, and a manual row counter is easy to get wrong.
    records = []
    for new_name in tqdm(new_names.values, desc="Computing Fuzzy Similarities..."):
        shortlist = shortlist_by_name.get(new_name, ())

        # Too small a shortlist: fall back to comparing against every old name.
        choices = shortlist if len(shortlist) >= 15 else all_old_names
        fuzz_matches = fuzz_process.extract(new_name, choices, scorer=fuzz.token_sort_ratio)

        if fuzz_matches:
            records.extend(
                {'name': new_name, 'match': match[0], 'match_score': match[1]}
                for match in fuzz_matches
            )
        else:
            # Placeholder row for a name without any candidate.
            records.append({'name': new_name, 'match': '', 'match_score': 0})

    fuzzy_matches = pd.DataFrame(records, columns=['name', 'match', 'match_score'])

    return (
        fuzzy_matches.loc[fuzzy_matches['match_score'] > min_score]
        .sort_values(['name', 'match_score'], ascending=[True, False])
        .groupby(['name'])
        .head(topn)
    )


def get_most_similar_name(new_names: pd.Series, old_names: pd.Series) -> pd.DataFrame:
    """Find, for each new name, the closest name among the known names.

    Two matchers are combined: TF-IDF/bigram similarity (scaled to 0-100) and
    fuzzy string matching. The candidate with the higher score wins, and the
    fuzzy match wins ties.

    Args:
        new_names: Normalized names to look up (any iterable of strings).
        old_names: Normalized known names (any iterable of strings).

    Returns:
        DataFrame indexed by ``name`` with columns ``match_cosine``,
        ``match_score_cosine``, ``match_fuzzy``, ``match_score_fuzzy`` and
        ``closest_name``. A name found by only one matcher is still included,
        with NaN in the other matcher's columns.
    """
    # Plain lists make every later lookup positional, so inputs with a
    # non-default index (or plain lists) behave the same way.
    new_list = list(new_names)
    old_list = list(old_names)

    # Vectorize the names for the similarity computation
    tf_idf_matrix_new, tf_idf_matrix_old = preprocess_names_tfidf(new_names=new_list, old_names=old_list)

    # Rows = old names, columns = new names
    similarity = cosine_similarity_topn(A=tf_idf_matrix_old,
                                        B=tf_idf_matrix_new.transpose(),
                                        topn=50, lower_bound=0)

    # Unpack every stored similarity into (name, match, match_score) rows
    cosine_matches = get_matches_df(sparse_matrix=similarity,
                                    name_vector_a=old_list,
                                    name_vector_b=new_list,
                                    topn=-1)

    # Fuzzy matches, shortlisted with the cosine results
    fuzzy_match = get_fuzzy_matches(old_names=pd.Series(old_list), new_names=pd.Series(new_list),
                                    cosine_match=cosine_matches, topn=1, min_score=0)

    # Keep the single best row per name from each matcher. The outer merge
    # retains names that only one of the two matchers could resolve.
    best_cosine = cosine_matches.groupby(['name']).head(1)
    best_fuzzy = fuzzy_match.groupby(['name']).head(1)
    matches_df = best_cosine.merge(best_fuzzy, on=['name'],
                                   suffixes=('_cosine', '_fuzzy'), how='outer')
    matches_df = matches_df.set_index('name')

    # Choose by column name rather than by column position.
    cosine_score = pd.to_numeric(matches_df['match_score_cosine'])
    fuzzy_score = pd.to_numeric(matches_df['match_score_fuzzy'])
    use_cosine = (cosine_score > fuzzy_score) | fuzzy_score.isna()
    matches_df['closest_name'] = matches_df['match_cosine'].where(use_cosine, matches_df['match_fuzzy'])

    return matches_df

## Test
# Smoke test: look up the closest known first name for a few sample names.
get_most_similar_name(new_names=pd.Series(['EMILIO', 'ASHLEIGH', 'JAVIER', 'MARUGEL', 'HECTOR']), old_names=pd.Series(fn_g_r_norm.index.tolist()))

### Get Probability for Names

In [None]:
def get_fn_probs(fn: pd.Series, fn_probs:pd.DataFrame=fn_g_r_norm) -> pd.DataFrame:
    """Return the probability of first name given race for each first name given.

    Each name is normalized, mapped to the closest name in the index of
    ``fn_probs``, and joined with that name's row of probabilities.

    Args:
        fn: Raw first names.
        fn_probs: Probability table indexed by normalized first name.

    Returns:
        DataFrame indexed by normalized name, one row per entry of ``fn`` in
        the same order, holding ``closest_name`` plus the columns of
        ``fn_probs``.

    Raises:
        KeyError: If a normalized name has no row in the match table.
    """
    # Clean Names
    fn_norm = preprocess_names(names=fn)

    # Get Closest Name. The candidates come from the table passed in, so the
    # closest name always refers to the same table used in the merge below.
    fn_matches = get_most_similar_name(new_names=fn_norm, old_names=fn_probs.index.tolist())

    # Find Entries
    first_name_probs = fn_matches[['closest_name']].merge(
        fn_probs,
        left_on='closest_name',
        right_index=True,
        how='left',
    )

    # Re-expand to the input order; repeated names yield repeated rows.
    return first_name_probs.loc[fn_norm]

## Test
# Smoke test: first-name probabilities for a few raw (un-normalized) names.
get_fn_probs(fn=pd.Series(['Emilio', 'Ashleigh', 'Hector', 'Kiara', 'Nicole', 'Larry']))

In [None]:
def get_surn_probs(sn: pd.Series, sn_probs:pd.DataFrame=r_g_sn_norm) -> pd.DataFrame:
    """Return the probability of race given surname for each surname given.

    Each surname is normalized, mapped to the closest name in the index of
    ``sn_probs``, and joined with that name's row of probabilities.

    Args:
        sn: Raw surnames.
        sn_probs: Probability table indexed by normalized surname.

    Returns:
        DataFrame indexed by normalized name, one row per entry of ``sn`` in
        the same order, holding ``closest_name`` plus the columns of
        ``sn_probs``.

    Raises:
        KeyError: If a normalized surname has no row in the match table.
    """
    # Clean Names
    sn_norm = preprocess_names(names=sn)

    # Get Closest Name. The candidates come from the table passed in, so the
    # closest name always refers to the same table used in the merge below.
    sn_matches = get_most_similar_name(new_names=sn_norm, old_names=sn_probs.index.tolist())

    # Find Entries
    surname_probs = sn_matches[['closest_name']].merge(
        sn_probs,
        left_on='closest_name',
        right_index=True,
        how='left',
    )

    # Re-expand to the input order; repeated names yield repeated rows.
    return surname_probs.loc[sn_norm]

## Test
# Smoke test: surname probabilities for a few raw (un-normalized) surnames.
get_surn_probs(sn=pd.Series(['Ramos', 'Brock', 'Monzalvo', 'Berk', 'Rubenstein', 'Powell', 'Engel']))

## Get Geo Race Probability Based on ZipCode

In [None]:
def get_zipcode_from_addr(addr: pd.Series) -> pd.DataFrame:
    """Placeholder for resolving free-form addresses to ZIP codes / ZCTAs.

    Args:
        addr: Addresses to resolve.

    Raises:
        NotImplementedError: Always; the lookup has not been written yet.
    """
    # TODO: implement the address -> ZCTA lookup. Until then, fail with an
    # explicit error rather than a NameError on an undefined variable.
    raise NotImplementedError(
        'get_zipcode_from_addr is not implemented yet; pass ZIP codes '
        'directly to get_geo_probs instead.'
    )

In [None]:
def get_geo_probs(addr: pd.Series, ztca_probs:pd.DataFrame=zcta_g_race_norm) -> pd.DataFrame:
    """Look up the ZCTA probability row for every ZIP code given.

    Args:
        addr: ZIP codes / ZCTAs; normalized with ``preprocess_zctas``.
        ztca_probs: Probability table indexed by normalized ZCTA.

    Returns:
        DataFrame with a ``zcta5`` column followed by the columns of
        ``ztca_probs``, one row per input in the same order. Codes missing
        from the table get NaN probabilities (left join).
    """
    # Normalized codes as a one-column frame keyed on 'zcta5'
    zcta_frame = preprocess_zctas(zcta=addr).to_frame()

    return pd.merge(
        zcta_frame,
        ztca_probs,
        how='left',
        left_on='zcta5',
        right_index=True,
    )

# Smoke test: repeated codes and a code with a leading zero.
get_geo_probs(addr=pd.Series(['95123', '98136', '72712', '95123', '02109', '02109']))

## Compute Probability using BIFSG

![BIFSG](imgs/bayesianformula.png)

In [None]:
def compute_BIFSG_probs( first_name: pd.Series, 
                    surname: pd.Series,
                    addr: pd.Series,
                    RACE_COLS=RACE_COLS) -> pd.DataFrame:
    """Performs the BIFSG calculation.

    For every person the first-name, surname and geography probabilities are
    multiplied per race column and the products are normalized so that they
    sum to one across the race columns.

    Args:
        first_name: First names, one per person.
        surname: Surnames, aligned by position with ``first_name``.
        addr: ZIP codes / ZCTAs, aligned by position with ``first_name``.
        RACE_COLS: Race column labels present in all three probability tables.

    Returns:
        DataFrame with ``first_name``, ``closest_first_name``, ``surname``,
        ``closest_surname``, ``zcta5``, one probability column per race and
        ``predicted_race`` (the race with the highest probability, or NaN
        when no probability could be computed for that row).

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    # The three lookups are combined by row position, so unequal lengths
    # would silently pair values belonging to different people.
    if not (len(first_name) == len(surname) == len(addr)):
        raise ValueError('first_name, surname and addr must have the same length')

    race_cols = list(RACE_COLS)

    # Get Probabilities (reset_index gives all three the same positional index)
    first_name_probs = get_fn_probs(fn=first_name).reset_index(drop=False)
    sur_probs = get_surn_probs(sn=surname).reset_index(drop=False)
    geo_probs = get_geo_probs(addr=addr).reset_index(drop=False)

    # Calculate each of the numerators
    bifsg_numer = (
        first_name_probs.loc[:, race_cols] *
        sur_probs.loc[:, race_cols] *
        geo_probs.loc[:, race_cols]
    )

    # Calculate the denominator
    bifsg_denom = bifsg_numer.sum(axis=1)

    # Calculate the bifsg probabilities (each num / denom)
    bifsg_probs = bifsg_numer.div(bifsg_denom, axis=0)

    # Build frame from zctas, first names, surnames, and probabilities
    bifsg_data = pd.concat([
        first_name_probs
            .rename(columns={'name': 'first_name', 'closest_name':'closest_first_name'})[['first_name', 'closest_first_name']],
        sur_probs
            .rename(columns={'name': 'surname', 'closest_name':'closest_surname'})[['surname', 'closest_surname']],
        geo_probs['zcta5'].to_frame(),
        bifsg_probs
    ], axis=1)

    # Rows without a single probability (for example an unknown ZCTA) get
    # NaN instead of being passed to idxmax, which rejects all-NaN rows in
    # recent pandas versions.
    probs = bifsg_data[race_cols]
    has_estimate = probs.notna().any(axis=1)
    predicted_race = pd.Series(np.nan, index=bifsg_data.index, dtype=object)
    if has_estimate.any():
        predicted_race.loc[has_estimate] = probs.loc[has_estimate].idxmax(axis=1)
    bifsg_data['predicted_race'] = predicted_race

    return bifsg_data

In [None]:
# Sample people for an end-to-end run: one list per column, aligned by position.
test_df = pd.DataFrame({
    'first_name': ['Emilio', 'Hector', 'Ashleigh', 'Kiara', 'Bryn', 'Nicole', 'Larry', 'Marugel'],
    'surname': ['Ramos', 'Monzalvo', 'Brock', 'Sanchez', 'Kirkland', 'Rubenstein', 'Berk', 'Ramos'],
    'addr': ['95123', '72712', '98136', '72712', '78701', '02109', '02109', '72712'],
})
test_df

In [None]:
# End-to-end run of the BIFSG estimate on the sample people above.
compute_BIFSG_probs(
    first_name=test_df.first_name,
    surname=test_df.surname,
    addr=test_df.addr
)