In [1]:
import nltk
from textblob import TextBlob
import pandas as pd
import numpy as np
from nltk import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import NMF
import re
from scipy.spatial.distance import pdist, squareform, cdist
import matplotlib.pyplot as plt
from pattern.en import tag
from sklearn.neighbors import KDTree, NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import normalize
import pickle
%matplotlib inline

# Load Business TFIDF Matrix and Measure Similarity to Other Businesses

In [157]:
def cos_cdist(matrix, vector):
    """
    Return the cosine distance from `vector` to every row of `matrix`.

    matrix: 2-D array with one observation per row
    vector: array holding a single observation; it is reshaped into one row
    Returns a 1-D array with one distance per row of `matrix`.
    """
    # cdist compares two 2-D collections, so present the vector as a one-row matrix
    query_row = vector.reshape(1, -1)
    pairwise = cdist(matrix, query_row, metric='cosine')
    # pairwise has a single column; flatten it to one value per matrix row
    return pairwise.reshape(-1)

def get_sim_biz(biz_arr, df_biz, biz_dict, k, biz_name):
    """
    Return a dataframe with the top k matches based on smallest cosine distance between a business
    vector and an array of features for other businesses, ordered from nearest to farthest.
    biz_arr: can be tfidf matrix or topic model features for each business
    df_biz: a dataframe with additional business data like name, overview, etc.
    biz_dict: a dictionary that serves as a crosswalk between a business name and its index in the feature array
    k: the number of closest results to return as matches for a business
    biz_name: the name of the business to serve as the basis for finding similar businesses

    Returns the matching rows of df_biz with an added 'distance' column.
    Raises KeyError if biz_name is not a key of biz_dict.

    Note: the query business is itself a row of biz_arr, so it is compared against itself and is
    normally returned as the closest match (distance ~0, up to floating point error).
    """
    cosine_distance = cos_cdist(biz_arr, biz_arr[biz_dict[biz_name]])
    # Row positions of the k smallest distances, nearest first
    indices = np.argsort(cosine_distance)[:k]
    distance = cosine_distance[indices]
    df = pd.DataFrame(distance, index=indices, columns=['distance'])
    # The merge matches feature-array row numbers against df_biz index labels (inner join, so
    # row numbers without a matching label are dropped).
    # NOTE(review): this assumes df_biz keeps a default 0..n-1 index aligned with biz_arr - confirm.
    df_nn = df_biz.merge(df, left_index = True, right_index = True)
    # The row order produced by an index-on-index merge is not something to rely on, so sort
    # explicitly to guarantee nearest-first output.
    return df_nn.sort_values(by = 'distance')

In [153]:
#Load business data for those that have overview text >30 characters, are in a US State
#and are identified as a 'Company'
# NOTE(review): the absolute /Users/... paths in this cell only resolve on the original author's
# machine; consider pointing them at a configurable data directory.
biz = pd.read_csv(
    '/Users/sarah/ds/metis/projects/kojak/USCompanyOverviewDataState.csv', encoding = 'latin-1')

def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code - only use it on files from a trusted source.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)

#Load the business TFIDF matrix (the pickled object is converted to a dense array via .toarray())
biztfidf_arr = _load_pickle("/Users/sarah/ds/metis/projects/kojak/biztfidf.p").toarray()
#Load the NMF business topic matrix
biz_arr = _load_pickle("/Users/sarah/ds/metis/projects/kojak/biztopic_matrix.p")
#Load the dictionary that maps company names to index values
biz_dict = _load_pickle("/Users/sarah/ds/metis/projects/kojak/bizname_index_crosswalk.p")

In [197]:
#Run the calculation to return the companies most similar to FanDuel (by TFIDF cosine distance)
get_sim_biz(biz_arr=biztfidf_arr, df_biz=biz, biz_dict=biz_dict, k=15, biz_name='FanDuel')

Unnamed: 0,company_id,company_name,company_name_normalized,company_overview,distance
24760,c:2640,FanDuel,fanduel,"FanDuel, launched in July 2009, is now the lea...",-2.220446e-16
1339,c:1240,DraftMix,draftmix,On DraftMix fantasy sports fans can sign up an...,0.2496057
44340,c:6996,Sport Interactiva,sport interactiva,Creates fantasy sports games for sports like s...,0.3070522
18649,c:22575,PaperSports,papersports,At PaperSports fantasy sports players can pick...,0.3200894
46887,c:78857,RotoBiz,rotobiz,RotoBiz covers the business of fantasy sports ...,0.322809
15660,c:19841,Fantazzle Fantasy Sports Games,fantazzle fantasy sports games,Fantazzle provides weekly fantasy sports games...,0.3276019
18317,c:22320,BetAgainstMe.com,betagainstme,BetAgainstMe.com is a fantasy sports betting w...,0.3310632
19446,c:231882,Star Fantasy Leagues,star fantasy leagues,"Star Fantasy Leagues provides a daily, weekly,...",0.342968
1448,c:1263,Screaming Sports,screaming sports,Screaming Sports is a social networking servic...,0.3432409
19741,c:233817,Bignoggins Productions,bignoggins productions,Bignoggins Productions was founded by Jerry Sh...,0.3512864


# Create Investor Topics and Measure Similarity to Businesses

In [161]:
def get_sim_entity(entity_arr, biz_arr, df_entity, biz_dict, k, biz_name):
    """
    Return a dataframe with the top k matches based on smallest cosine distance between a business
    vector and an array of the same set of features for investor companies/individuals/other entities,
    ordered from nearest to farthest.
    entity_arr: can be tfidf matrix or topic model features for each entity
    biz_arr: can be tfidf matrix or topic model features for each business
    df_entity: a dataframe with additional entity data like name, overview, etc.
    biz_dict: a dictionary that serves as a crosswalk between a business name and its index in the biz feature array
    k: the number of closest results to return as entity matches for a business
    biz_name: the name of the business to serve as the basis for finding similar entities

    Returns the matching rows of df_entity with an added 'distance' column.
    Raises KeyError if biz_name is not a key of biz_dict.
    """
    cosine_distance = cos_cdist(entity_arr, biz_arr[biz_dict[biz_name]])
    # Row positions of the k smallest distances, nearest first
    indices = np.argsort(cosine_distance)[:k]
    distance = cosine_distance[indices]
    df = pd.DataFrame(distance, index=indices, columns=['distance'])
    # The merge matches entity_arr row numbers against df_entity index labels (inner join, so
    # row numbers without a matching label are dropped).
    # NOTE(review): this assumes row i of entity_arr describes the entity at index label i of
    # df_entity - confirm against how entity_arr was built.
    df_nn = df_entity.merge(df, left_index = True, right_index = True)
    # The row order produced by an index-on-index merge is not something to rely on, so sort
    # explicitly to guarantee nearest-first output.
    return df_nn.sort_values(by = 'distance')

## Use Averaging Across Topics to Get Investor Similarity
Note: investor-business similarity is scored on topic features rather than raw word/n-gram TF-IDF scores in order to get a more generalized match. The assumption is that you want investors that invest in industries or company types similar to the company of interest, as opposed to those who have already made investments in that exact company or in very similar companies. Also, given that only ~20% of the companies have a qualifying investor, a more generalized approach seems better here for evaluating similarity.

In [102]:
#Investor candidate profiles; only investors with 5+ investments in our candidate companies are included
df_inv_prof = pd.read_csv(
    '/Users/sarah/ds/metis/projects/kojak/InvestorProfileData5Inv.csv',
    encoding='latin-1',
)
#Investor <-> business crosswalk; used to pull in the business tfidf and/or topic scores
#that get averaged for each investor
inv = pd.read_csv(
    '/Users/sarah/ds/metis/projects/kojak/USCompanyInvestorCrosswalk5Inv.csv',
    encoding='latin-1',
)

In [106]:
#Preview the first rows of the investor-company crosswalk
inv.head(n=5)

Unnamed: 0,investor_id,company_id
0,f:1,c:1088
1,f:1,c:1101
2,f:1,c:1102
3,f:1,c:11042
4,f:1,c:11391


In [107]:
#Attach each company's topic scores to every investor-company pair in the crosswalk.
#The join pairs biz rows with biz_arr rows by index before merging on company_id.
# NOTE(review): this assumes row i of biz_arr belongs to index label i of biz - confirm.
df_inv = inv.merge(
    biz[['company_id']].join(pd.DataFrame(biz_arr)),
    on='company_id',
)

In [108]:
#Preview the investor-company pairs with the company topic scores attached
df_inv.head(n=5)

Unnamed: 0,investor_id,company_id,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,f:1,c:1088,0.029734,0,0.000949,0,0,0.003229,0,0.007803,...,6e-06,0.000121,0,0,0,0,0,0.000192,0.01764,0
1,f:17,c:1088,0.029734,0,0.000949,0,0,0.003229,0,0.007803,...,6e-06,0.000121,0,0,0,0,0,0.000192,0.01764,0
2,f:34,c:1088,0.029734,0,0.000949,0,0,0.003229,0,0.007803,...,6e-06,0.000121,0,0,0,0,0,0.000192,0.01764,0
3,f:45,c:1088,0.029734,0,0.000949,0,0,0.003229,0,0.007803,...,6e-06,0.000121,0,0,0,0,0,0.000192,0.01764,0
4,f:4689,c:1088,0.029734,0,0.000949,0,0,0.003229,0,0.007803,...,6e-06,0.000121,0,0,0,0,0,0.000192,0.01764,0


In [110]:
#Average topic scores by investor across all businesses an investor has funded.
#company_id is a non-numeric label, so drop it explicitly before averaging: older pandas silently
#discarded such columns in GroupBy.mean(), while newer versions raise a TypeError instead.
df_inv_avg = df_inv.drop('company_id', axis = 1).groupby('investor_id', as_index = False).mean()

In [114]:
#Preview the per-investor average topic scores
df_inv_avg.head(n=5)

Unnamed: 0,investor_id,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,f:1,0.009466,0.001367,0.007716,0.004005,0.000625,0.000566,2.6e-05,0.001698467,0.000278,...,0.001606,0.00066,0.002595,0.000272,0.003327,0.001824,0.000842,0.004356,0.003218,0.001524
1,f:10,0.007431,0.000849,0.002287,0.005946,0.00037,0.002154,4e-06,0.004287758,5.8e-05,...,0.001059,1.8e-05,0.00179,0.001006,0.004873,0.002511,0.000544,0.003217,0.000501,0.000137
2,f:10031,0.003206,0.000656,0.012529,0.010512,2e-05,0.000467,2e-05,9.698595e-06,1.1e-05,...,0.003647,0.002306,0.0,0.000426,0.000211,0.002945,8.1e-05,0.002332,0.000652,8.2e-05
3,f:10050,0.00281,0.002517,0.0,7.1e-05,1.4e-05,0.000629,0.0,2.600111e-05,0.0,...,0.000663,0.0,0.004937,0.0,0.006947,0.0,0.0,0.007888,0.001321,0.0
4,f:1007,0.006196,0.000106,0.000869,0.00997,0.002614,1e-06,3.2e-05,6.429232e-07,0.002904,...,0.000411,0.000162,0.000182,2.2e-05,1.8e-05,0.000268,0.002062,0.000149,0.005364,3e-06


In [135]:
#Drop the leading investor_id column and keep only the averaged topic scores as a plain array.
#DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray in old and new versions.
# NOTE(review): downstream matching is by row position, so this assumes df_inv_avg rows are in the
# same order as the rows of df_inv_prof - confirm.
inv_avg_arr = df_inv_avg.iloc[:,1:].values

In [162]:
#Run the calculation to return the most similar investors (i.e. investors who have invested in the most similar companies)
get_sim_entity(
    entity_arr=inv_avg_arr,
    biz_arr=biz_arr,
    df_entity=df_inv_prof,
    biz_dict=biz_dict,
    k=10,
    biz_name='FanDuel',
)

Unnamed: 0,investor_id,entity_type,investor_name,investor_name_normalized,overview,twitter_username,domain,city,state_code,region,country_code,num_investments,comp_investments,distance
1363,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.15363
1488,p:81839,Person,Anonymous Investor,anonymous investor,,,,,,unknown,,7,6,0.168943
8,f:10144,FinancialOrg,Softbank Ventures Korea,softbank ventures korea,SoftBank Ventures Korea (SBVK) is a venture ca...,softbankkorea,softbank.co.kr,Seoul,,Seoul,KOR,6,6,0.171041
210,f:1550,FinancialOrg,SK Telecom Ventures,sk telecom ventures,SK Telecom Ventures is the investment arm of [...,,sktvc.com,Sunnyvale,CA,SF Bay,USA,16,11,0.188934
1435,p:63410,Person,Paul Bricault,paul bricault,Paul Bricault is a Venture Partner at Greycrof...,pbricault,,,,unknown,,6,5,0.191321
1184,f:9558,FinancialOrg,Signia Venture Partners,signia venture,Signia Venture Partners is an early stage fund...,SigniaVC,signiaventurepartners.com,Menlo Park,CA,SF Bay,USA,25,24,0.195719
1411,p:53919,Person,Christina Brodbeck,christina brodbeck,"Christina Brodbeck is an angel investor, user ...",,,,,unknown,,5,5,0.197099
1388,p:441,Person,Jarl Mohn,jarl mohn,Jarl Mohn brings extensive media experience to...,,,,,unknown,,16,10,0.220651
208,f:1540,FinancialOrg,TWJ Capital,twj,TWJ Capital makes growth equity investments in...,,twjcapital.com,Stamford,CT,New York,USA,8,6,0.236844
32,f:1056,FinancialOrg,TriplePoint Capital,triplepoint,TriplePoint Capital offers leases and loans to...,,triplepointcapital.com,Menlo Park,CA,SF Bay,USA,33,30,0.241637


In [163]:
# Sanity check the recommendations: list every company each recommended investor has funded,
# with the closest investors first
(
    get_sim_entity(inv_avg_arr, biz_arr, df_inv_prof, biz_dict, k=10, biz_name='FanDuel')
    .merge(inv, on='investor_id')
    .merge(biz, on='company_id')
    .sort_values(by='distance')
)

Unnamed: 0,investor_id,entity_type,investor_name,investor_name_normalized,overview,twitter_username,domain,city,state_code,region,country_code,num_investments,comp_investments,distance,company_id,company_name,company_name_normalized,company_overview
0,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:10252,Mob.ly,mob ly,"Mob.ly, formerly Goodrec, is a mobile and onl..."
1,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:1399,GameLayers,gamelayers,GameLayers Inc was a small game design company...
2,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:163875,Tuition.io,tuition io,Tuition.io is a revolutionary new tool for man...
3,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:2553,EVO Media Group,evo media group,"The EVO Media Group, Inc. (""EVO"") is a venture..."
4,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:3576,Intent,intent,[Intent](http://www.intent.com) is a start-up ...
5,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:453,Kongregate,kongregate,Kongregate is a casual gaming social network w...
6,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:65608,Playdek,playdek,"Playdek is a mobile portal, developer and plat..."
9,p:3562,Person,Richard Wolpert,richard wolpert,Richard Wolpert is leading the [new consortium...,,,,,unknown,,8,8,0.153630,c:8467,OGPlanet,ogplanet,OGPlanet is an MMO (Massive Multiplayer Online...
15,p:81839,Person,Anonymous Investor,anonymous investor,,,,,,unknown,,7,6,0.168943,c:80400,Matatena Games,matatena games,"Matatena LLC, based in Houston TX, specializes..."
14,p:81839,Person,Anonymous Investor,anonymous investor,,,,,,unknown,,7,6,0.168943,c:71120,Triples Media,triples media,We are a digital media company that primarily ...
