### Data Quality - Creating Alias Network

__Objective:__  
In the real world, human error or inconsistancies causes our data quality to drop. There are scenarios where brands or names of companies are spelled incorrectly or differently. Can we come up with a way to consolidate these data points and create an alias network?

In [167]:
from fuzzywuzzy import fuzz
import re
import pandas as pd
import itertools

# Example of brands
brands = ['Hersheys', 'hersheys', 'Hersheys Corp', 'Hershies',
          'Coca Cola', 'coca-cola', 'Coca-Cola',' CocaCola.',
          'Pepsi','Pepsi Cola','Pepsi-Cola',
          'Kit-Kat', 'Kit Kat', 'kitkat',' kit-kat']

brands = pd.DataFrame(brands, columns = ['Brand'])

# Data Cleanup
# Reduce everything to lowercase
brands_clean = [i.lower() for i in brands.Brand]

# Remove all special characters except white spaces
brands_clean = [re.sub('[^a-zA-Z0-9]+', ' ', i) for i in brands_clean]

# Storing back into dataframe
brands_clean = pd.DataFrame(brands_clean, columns = ['Brand_Clean'])
brands = pd.concat([brands, brands_clean], axis=1)

# Display
brands

Unnamed: 0,Brand,Brand_Clean
0,Hersheys,hersheys
1,hersheys,hersheys
2,Hersheys Corp,hersheys corp
3,Hershies,hershies
4,Coca Cola,coca cola
5,coca-cola,coca cola
6,Coca-Cola,coca cola
7,CocaCola.,cocacola
8,Pepsi,pepsi
9,Pepsi Cola,pepsi cola


### Creating Alias Network

In [168]:
# Init alias network
alias_network = []

for i in brands.Brand_Clean:
    
    iter_matches = []
    
    for j in brands.Brand_Clean:
        
        # Compting Distances
        ratio = fuzz.ratio(i,j)
        partial_ratio = fuzz.partial_ratio(i,j)
        token_sort_ratio = fuzz.token_sort_ratio(i,j)
        
        ensembled_score = (ratio + partial_ratio + token_sort_ratio) / 3
        
        # If the score is 70 or above, we consider this a match
        if ensembled_score >= 70:
            iter_matches.append(j)
          
    # Appending to the final list
    alias_network.append(iter_matches)

# Removing duplicates in alias network
alias_network.sort()
alias_network = list(alias_network for alias_network,_ in itertools.groupby(alias_network))

# Creating cluster names
clusters = []
for i in range(len(alias_network)):
    clusters.append('Cluster ' + str(i))
    
alias_network = dict(zip(clusters, alias_network))
alias_network

{'Cluster 0': ['coca cola', 'coca cola', 'coca cola', ' cocacola '],
 'Cluster 1': ['hersheys', 'hersheys', 'hersheys corp', 'hershies'],
 'Cluster 2': ['kit kat', 'kit kat', 'kitkat', ' kit kat'],
 'Cluster 3': ['pepsi', 'pepsi cola', 'pepsi cola']}

### Applying Alias Network

In [169]:
# Assingn the clusters
assign_clusters = []
for b in brands.Brand_Clean:
    for c in list(alias_network.keys()):
        if b in alias_network[c]:
            assign_clusters.append(c)
        else:
            pass

brands['Cluster'] = assign_clusters
brands

Unnamed: 0,Brand,Brand_Clean,Cluster
0,Hersheys,hersheys,Cluster 1
1,hersheys,hersheys,Cluster 1
2,Hersheys Corp,hersheys corp,Cluster 1
3,Hershies,hershies,Cluster 1
4,Coca Cola,coca cola,Cluster 0
5,coca-cola,coca cola,Cluster 0
6,Coca-Cola,coca cola,Cluster 0
7,CocaCola.,cocacola,Cluster 0
8,Pepsi,pepsi,Cluster 3
9,Pepsi Cola,pepsi cola,Cluster 3
