In [3]:
!pip install fuzzywuzzy



In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [None]:
# TO company names: keep only the name column, as a one-column DataFrame.
# Duplicates are not removed from this side.
companies_to = pd.read_csv(
    "../cleaned_data/TO_Previews_Companies.csv",
    lineterminator='\n',
).loc[:, ["CompanyName"]]

# Handshake employer names: one row per distinct employer_name.
companies_hs = (
    pd.read_csv("../cleaned_data/handshake_jobs.csv", lineterminator='\n')
    .loc[:, ['employer_name']]
    .drop_duplicates()
)

In [6]:
# Preview the one-column frame of TO company names loaded above.
companies_to

Unnamed: 0,CompanyName
0,Oltmans Construction Co.
1,Humanetics
2,Together Labs
3,The Conair Group
4,FortunaPIX
...,...
42026,AjnaLens
42027,nCore Games
42028,HashCube
42029,99 Ecommerce Experts


In [7]:
# Preview the de-duplicated Handshake employer names loaded above.
companies_hs

Unnamed: 0,employer_name
0,Manifold
2,JPMorgan Chase & Co.
3,IBM
8,Hudson River Trading
9,Five Rings
...,...
37792,Mastery Schools
37795,Intercommunity Action
37799,Camp Olympia
37801,100KIDEAS


In [66]:
# Exact-name matches: inner join (the pandas default) between Handshake
# employer names and TO company names on string equality.
match_df = companies_hs.merge(
    companies_to,
    left_on="employer_name",
    right_on="CompanyName",
)
match_df

Unnamed: 0,employer_name,CompanyName
0,Manifold,Manifold
1,Uncountable,Uncountable
2,Instabase,Instabase
3,Glean,Glean
4,PreVeil,PreVeil
...,...,...
411,Accion,Accion
412,WellPower,WellPower
413,GiveDirectly,GiveDirectly
414,Dutch,Dutch


In [9]:
# Left-join Handshake employers onto TO names; employers with no exact match
# get a missing CompanyName, which is turned into "" so it can be tested with ==.
hs_left_joined = companies_hs.merge(
    companies_to,
    how="left",
    left_on="employer_name",
    right_on="CompanyName",
).fillna("")
has_no_exact_match = hs_left_joined["CompanyName"] == ""
# Keep only the employer names that found no exact counterpart.
unmatched_hs = hs_left_joined.loc[has_no_exact_match, ["employer_name"]]
unmatched_hs

Unnamed: 0,employer_name
1,JPMorgan Chase & Co.
2,IBM
3,Hudson River Trading
4,Five Rings
5,Bank of America
...,...
6185,Bryn Mawr College
6186,Mastery Schools
6188,Camp Olympia
6189,100KIDEAS


In [20]:
# Build every (employer_name, CompanyName) pair as a Cartesian product so each
# pair can be scored for string similarity.
# NOTE(review): the product is built from companies_hs (all employers), not from
# the exact-match leftovers, and it rebinds the name unmatched_hs. The displayed
# output includes employers that already matched exactly -- confirm this is intended.
unmatched_hs = companies_hs.merge(companies_to, how="cross")
unmatched_hs

Unnamed: 0,employer_name,CompanyName
0,Manifold,Oltmans Construction Co.
1,Manifold,Humanetics
2,Manifold,Together Labs
3,Manifold,The Conair Group
4,Manifold,FortunaPIX
...,...,...
260171885,"Robinson Aviation (RVA), Inc.",AjnaLens
260171886,"Robinson Aviation (RVA), Inc.",nCore Games
260171887,"Robinson Aviation (RVA), Inc.",HashCube
260171888,"Robinson Aviation (RVA), Inc.",99 Ecommerce Experts


In [21]:
# Score every (employer_name, CompanyName) pair with fuzz.ratio and store the
# result in a new 'ratio' column.
# The two columns are walked in lockstep instead of using DataFrame.apply(axis=1):
# apply materialises a Series object for each row, which is prohibitively slow on
# a cross product of this size, while the scores produced are the same.
unmatched_hs['ratio'] = [
    fuzz.ratio(hs_name, to_name)
    for hs_name, to_name in zip(unmatched_hs['employer_name'], unmatched_hs['CompanyName'])
]

In [22]:
# Inspect the pairwise frame now that it carries the 'ratio' column.
unmatched_hs

Unnamed: 0,employer_name,CompanyName,ratio
0,Manifold,Oltmans Construction Co.,25
1,Manifold,Humanetics,33
2,Manifold,Together Labs,10
3,Manifold,The Conair Group,25
4,Manifold,FortunaPIX,11
...,...,...,...
260171885,"Robinson Aviation (RVA), Inc.",AjnaLens,11
260171886,"Robinson Aviation (RVA), Inc.",nCore Games,10
260171887,"Robinson Aviation (RVA), Inc.",HashCube,5
260171888,"Robinson Aviation (RVA), Inc.",99 Ecommerce Experts,8


In [23]:
# Persist every scored pair to CSV in the working directory. With to_csv's
# defaults the frame's index is written as an unnamed first column.
unmatched_hs.to_csv("unmatched_hs_all_ratios.csv")

In [48]:
# First 100,000 scored pairs, by position.
# NOTE(review): `mini` is not referenced by any later cell visible in this
# notebook -- presumably a leftover sample for quick experiments; confirm before removing.
mini = unmatched_hs.iloc[0:100000]

In [50]:
# Best similarity score per Handshake employer. The named groupby reduction is
# used instead of passing the Python builtin `max` to agg, which newer pandas
# versions deprecate with a FutureWarning.
max_ratio = unmatched_hs.groupby("employer_name", as_index=False)["ratio"].max()
# Join back on (employer_name, ratio) to recover which CompanyName achieved that
# score. When several candidates tie for the top score, every tied row is kept,
# so an employer can appear more than once.
max_ratio = pd.merge(max_ratio, unmatched_hs, on=["employer_name", "ratio"])
max_ratio

Unnamed: 0,employer_name,ratio,CompanyName
0,1 Atelier LLC,64,Advenser LLC
1,100KIDEAS,57,10xDS
2,1047 Games,100,1047 Games
3,121 Concepts NYC,67,GO Concepts
4,137 Ventures,100,137 Ventures
...,...,...,...
8145,tutor@air-class.com,55,iTutor.com
8146,wikiHow,100,wikiHow
8147,www.teachenglishinkorea.org,47,Tango Publishing
8148,“A Platform that Shares!”- TripleE,51,Platform Ventures


In [52]:
# Save the best-scoring candidate(s) per employer to CSV in the working directory
# (the index is written as an unnamed first column by default).
max_ratio.to_csv("max_ratio_matches_companies.csv")

In [56]:
# Candidates for manual review: best score strictly above 90 and strictly below
# 100, i.e. near matches that are not already perfect scores.
above_threshold = max_ratio["ratio"] > 90
below_perfect = max_ratio["ratio"] < 100
fuzzy_matches = max_ratio[above_threshold & below_perfect]
fuzzy_matches.to_csv("max_ratio_matches_90_99.csv")

In [None]:
# From here, I manually evaluated the fuzzy matches -- many were not true matches

In [64]:
# Finally, create the full table of matching TO and HS company names
# Load the manually reviewed fuzzy matches.
# NOTE(review): presumably this file is the hand-filtered version of
# max_ratio_matches_90_99.csv; the displayed output shows it still carries a
# leftover "Unnamed: 0" index column -- confirm provenance.
fuzzy_matches_manual = pd.read_csv("../cleaned_data/fuzzy_matches_manual.csv")
fuzzy_matches_manual

Unnamed: 0.1,Unnamed: 0,employer_name,CompanyName,ratio
0,477,"Analytical Mechanics Associates, Inc",Analytical Mechanics Associates,93
1,824,Baptist Children's Home of North Carolina,Baptist Children's Homes of North Carolina,99
2,835,"Barton & Loguidice, D.P.C.","Barton & Loguidice, P.C.",96
3,1163,C16 Biosciences,c16 Biosciences,93
4,1412,"Carney, Sandoe & Associates",Carney Sandoe & Associates,98
5,1755,Coding with Kids,Coding With Kids,94
6,2074,DMC Inc.,"DMC, Inc.",94
7,2547,"Estuate, Inc","Estuate, Inc.",96
8,2643,"Farallon Capital Management, LLC",Farallon Capital Management,92
9,3032,Goliath Technologies LP,Goliath Technologies,93


In [71]:
# Combine the manually approved fuzzy matches with the exact matches into one
# HS-name -> TO-name conversion table.
# Only the two name columns are stacked. The previous frame-wide fillna(100)
# was only needed for a `ratio` column that was discarded immediately, and it
# would have replaced any missing company name with the number 100.
name_columns = {'employer_name': 'CompanyNameHS', 'CompanyName': 'CompanyNameTO'}
full_matches = pd.concat(
    [frame[list(name_columns)] for frame in (fuzzy_matches_manual, match_df)]
).rename(columns=name_columns)
# The original row labels of both inputs are kept (no ignore_index), so the
# index is not unique across the two parts.
full_matches



Unnamed: 0,CompanyNameHS,CompanyNameTO
0,"Analytical Mechanics Associates, Inc",Analytical Mechanics Associates
1,Baptist Children's Home of North Carolina,Baptist Children's Homes of North Carolina
2,"Barton & Loguidice, D.P.C.","Barton & Loguidice, P.C."
3,C16 Biosciences,c16 Biosciences
4,"Carney, Sandoe & Associates",Carney Sandoe & Associates
...,...,...
411,Accion,Accion
412,WellPower,WellPower
413,GiveDirectly,GiveDirectly
414,Dutch,Dutch


In [72]:
# Write the HS/TO name conversion table. The index is written as an unnamed
# first column, which the later read of this file drops as "Unnamed: 0".
full_matches.to_csv("../cleaned_data/HS_TO_company_names_conversion.csv")

## Merge and fix TO company names to match handshake ones



In [13]:
# Reload the HS/TO name conversion table; "Unnamed: 0" is the index column that
# was written alongside the data when the table was saved.
full_matches = pd.read_csv(
    "../cleaned_data/HS_TO_company_names_conversion.csv", lineterminator='\n'
).drop(columns=["Unnamed: 0"])
# TO company details, minus two leftover index columns.
company_to = pd.read_csv("../cleaned_data/to_companies.csv", lineterminator='\n').drop(
    columns=["Unnamed: 0", "Unnamed: 0.1"]
)
# Inner join (the pandas default): only TO companies that have an entry in the
# conversion table are kept.
company_to = pd.merge(company_to, full_matches, left_on="CompanyName", right_on="CompanyNameTO")
# Replace the TO spelling with the Handshake spelling, falling back to the
# existing name when the Handshake name is missing. Missing values in pandas are
# NaN, which the previous `is not None` test never detected; fillna handles them
# and avoids a row-by-row apply.
company_to["CompanyName"] = company_to["CompanyNameHS"].fillna(company_to["CompanyName"])
company_to.to_csv("../cleaned_data/to_companies_names_fixed.csv")
company_to


Unnamed: 0,CompanyName,twitterUrl,linkedInUrl,facebookUrl,websiteUrl,city,country,locationString,employeeSizeRange,description,state,CompanyNameHS,CompanyNameTO
0,Bowery Farming,https://twitter.com/BoweryFarming,https://www.linkedin.com/company/bowery-farming,https://facebook.com/boweryfarming,http://boweryfarming.com,New York,United States,"New York, New York, United States",200-500,Bowery's headquarters are based in Manhattan. ...,,Bowery Farming,Bowery Farming
1,FiscalNote,https://twitter.com/FiscalNote,http://www.linkedin.com/company/fiscalnote,https://facebook.com/fiscalnote,http://www.fiscalnote.com,Washington,United States,"Washington, District of Columbia, United States",200-500,FiscalNote is the premier information services...,,FiscalNote,FiscalNote
2,Allbirds,https://twitter.com/allbirds,https://linkedin.com/company/allbirds/,https://facebook.com/weareallbirds,https://www.allbirds.com/,San Francisco,United States,"San Francisco, California, United States",200-500,Allbirds is a San Francisco-based startup aime...,,Allbirds,Allbirds
3,"GDS Associates, Inc.",https://twitter.com/GDSassociates,https://www.linkedin.com/company/gds-associate...,https://www.facebook.com/gdsenergyconsultants,https://www.gdsassociates.com/,Marietta,United States,"Marietta, Georgia, United States",200-500,"GDS Associates, Inc. Engineers and Consultants...",,"GDS Associates, Inc.","GDS Associates, Inc."
4,Cooper Carry,https://twitter.com/CooperCarry,https://www.linkedin.com/company/cooper-carry,https://www.facebook.com/pages/Cooper-Carry-Th...,https://www.coopercarry.com/,Atlanta,United States,"Atlanta, Georgia, United States",200-500,"Cooper Carry is one of America's most dynamic,...",,Cooper Carry,Cooper Carry
...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,Aldrich Capital Partners,https://twitter.com/AldrichCapital,https://linkedin.com/company/aldrich-capital-p...,https://facebook.com/1469198223358728,http://aldrichcap.com,Vienna,United States,"Vienna, Virginia, United States",10-50,Aldrich Capital Partners (“ACP”) is a middle-m...,,Aldrich Capital Partners,Aldrich Capital Partners
438,Dutch,https://twitter.com/DutchPetHealth,https://www.linkedin.com/company/dutchpet,http://facebook.com/dutchpet/,https://www.dutch.com,Oakland,United States,"Oakland, California, United States",10-50,Dutch is revolutionizing the American veterina...,,Dutch,Dutch
439,Dreamhaven,https://twitter.com/DreamhavenEnt,https://www.linkedin.com/company/dreamhaven/,,https://www.dreamhaven.com/,Irvine,United States,"Irvine, California, United States",10-50,Dreamhaven is a new video game developer and p...,,Dreamhaven,Dreamhaven
440,1047 Games,https://twitter.com/Splitgate,https://www.linkedin.com/company/1047games/,,http://1047games.com,Zephyr Cove,United States,"Zephyr Cove, Nevada, United States",10-50,1047 Games is an independent development studi...,,1047 Games,1047 Games
