# Brainstorm

In [2]:
import ast
import json
from collections import defaultdict, Counter

import numpy as np
import pandas as pd

# Load the cleaned funding data. The first CSV column has no header (pandas
# would expose it as "Unnamed: 0") and just repeats the row number, so it is
# read as the index instead of being kept as a stray data column.
funding = pd.read_csv('clean_data.csv', index_col=0)
# Preview the first rows only rather than rendering the whole frame.
funding.head(10)

Unnamed: 0,region,coordinates,fundedorg_name,fundedorg_description,fundedorg_short_description,fundedorg_categories,fundedorg_category_groups,fundedorg_permalink,announced_on,funding_type,series,money_raised_usd,n_investors,investor_name,investor_type,investor_investor_type,investor_description,investor_short_description,uuid
0,"Amsterdam, Noord-Holland, The Netherlands","(52.36999, 4.8919)",By Rocket Box,By Rocket Box is a startup that creates Welcom...,Coming Soon,[],[],rocket-box-group,2018-10-28,private_equity,,,1,,,,,,007229b41a197ce1f42bc0e481a85861
1,"Ghent, Oost-Vlaanderen, Belgium","(51.05, 3.71667)",Wetime,Wetime is a social platform that engages coupl...,Wetime is the bridge builder for couples and f...,"['Social Network', 'Dating', 'B2C', 'Communiti...","['Internet Services', 'Community and Lifestyle...",wetime,2018-10-20,seed,,,0,,,,,,ebe752d1cc32497faf208f52e82a208d
2,"Nairobi, Nairobi Area, Kenya","(-1.28333, 36.81667)",ImpalaCoin,We are Building a trade finance Crypto bank o...,Digital Currency Kenya,[],[],impalacoin,2018-10-20,initial_coin_offering,,,0,,,,,,c9994563a56e4b9fb3204f1a0b5db7c3
3,"Boston, Massachusetts, United States","(42.35843, -71.05977)",MTonomy,Future of digital media: \n\n- Scalable decent...,"Blockchain infrastructure to license, distribu...","['Blockchain', 'Media and Entertainment', 'Int...","['Media and Entertainment', 'Internet Services']",mtonomy,2018-10-20,undisclosed,,,1,,,,,,7d89c23af1a04235870605141a0d751e
4,"Philadelphia, Pennsylvania, United States","(39.95233, -75.16379)",Simply Good Jars,,Eat well | Feel Good | Waste Less,['Organic Food'],['Food and Beverage'],simply-good-jars,2018-10-20,pre_seed,,350000.0,2,Investor's Circle,Organization,angel_group,"Investors' Circle is the oldest, largest and m...","Investors' Circle is the oldest, largest and m...",19e3767d62d34c6487c72901dfcb2780
5,"Denver, Colorado, United States","(39.73915, -104.9847)",Sustain,,Sustain rewards and incentivize employees to c...,[],[],sustain-2,2018-10-18,pre_seed,,,0,,,,,,1db16dc217ea481d9d6101f32fa49660
6,"Tokyo, Tokyo, Japan","(35.6895, 139.69171)",Meltin MMI,Meltin MMI is a cyborg-technology company that...,Meltin MMI is a cyborg-technology company that...,"['Robotics', 'Health Care', 'Biotechnology']","['Hardware', 'Science and Engineering', 'Softw...",meltin-mmi,2018-10-17,venture,B,17974941.0,3,Dainippon Sumitomo Pharma,Organization,,"""Green Prism"", the symbol of Dainippon Sumitom...",Dainippon Sumitomo Pharma is a pharmaceutical ...,f0877e36d40c4d628c35c6769cec12e3
7,"Tel Aviv, Tel Aviv, Israel","(32.08088, 34.78057)",Cognata,The fast lane to Autonomous Driving.\n\nAutono...,Cognata provides driving validation platform f...,"['Autonomous Vehicles', 'Automotive', 'Software']","['Transportation', 'Transportation', 'Software']",cognata,2018-10-17,venture,B,18500000.0,5,Scale Venture Partners,Organization,venture_capital,Scale Venture Partners invests in software com...,Scale Venture Partners is a VC firm funding th...,ee565296d58944eaaf23c7342100041b
8,"Jakarta, Jakarta Raya, Indonesia","(-6.21462, 106.84513)",Crowde,Crowde is a crowd-investing platform that enab...,Crowde is a crowd-investing platform that enab...,"['AgTech', 'Social Entrepreneurship', 'Crowdfu...","['Agriculture and Farming', 'Community and Lif...",crowde,2018-10-17,seed,,,2,GREE Ventures,Organization,venture_capital,GREE Ventures focuses on investing in early st...,GREE Ventures invests in early stage (Seed to ...,e8c430e2b8d7453c9f2708d27b4efcab
9,"New York, New York, United States","(40.71427, -74.00597)",WhiteSource,WhiteSource becomes part of your software deve...,WhiteSource empowers businesses to develop bet...,"['Open Source', 'Developer Tools', 'Enterprise...","['Software', 'Software', 'Software']",white-source,2018-10-17,venture,C,35000000.0,3,Susquehanna Growth Equity,Organization,private_equity_firm,"Susquehanna Growth Equity, LLC is a private eq...",SGE invests in growth stage technology compani...,e873cf7cc388495b8ac7c80306fb9e05


In [167]:
# Used only by the fallback path of clean_category_list: tokens that are still
# wrapped in double quotes (because the name contains an apostrophe) are mapped
# to their bare form.
CATEGORY_CONVERSIONS = {
    '"Women\'s"': 'Women\'s',
    '"Men\'s"': 'Men\'s',
}


def clean_category_list(string):
    """Turn a stringified category list into a list of category names.

    The cell is expected to look like a Python list literal, e.g.
    ``"['Robotics', 'Health Care']"``. It is parsed with ``ast.literal_eval``
    so that every quoting style is handled (names containing an apostrophe are
    written with double quotes) as well as names that contain ``", "``.

    Args:
        string: The raw cell value. ``None`` or a float (pandas uses NaN for
            an empty cell) is treated as "no categories".

    Returns:
        A list of category names with empty names removed; ``[]`` when there
        are none.
    """
    # A missing cell has nothing to parse.
    if string is None or isinstance(string, float):
        return []

    try:
        parsed = ast.literal_eval(string.strip())
    except (ValueError, SyntaxError, TypeError):
        # Not a valid literal (e.g. a truncated cell): use the fallback below.
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [name for name in parsed if name != '']

    # Fallback: split on ", " by hand and strip the quotes, un-quoting the
    # known apostrophe names through CATEGORY_CONVERSIONS.
    categories = []
    for token in string.lstrip('[').rstrip(']').split(', '):
        if token == '':
            continue
        token = CATEGORY_CONVERSIONS.get(token, token)
        categories.append(token.strip('\''))
    return categories

def get_category_counts(df, country="United States", min_count=0):
    """Count how often each category is listed by rows located in ``country``.

    Args:
        df: DataFrame with a ``region`` column (text that names the country)
            and a ``fundedorg_categories`` column holding stringified
            category lists, as accepted by ``clean_category_list``.
        country: Text that must occur in ``region`` for a row to be counted.
        min_count: Drop categories listed fewer than this many times. The
            default of 0 keeps every category.

    Returns:
        A ``defaultdict(int)`` mapping category name to the number of times it
        is listed by the matching rows.
    """
    counts = defaultdict(int)
    for region, raw_categories in zip(df['region'], df['fundedorg_categories']):
        # A missing region is read as NaN, which does not support `in`.
        if pd.isnull(region) or country not in region:
            continue
        for category in clean_category_list(raw_categories):
            counts[category] += 1

    if min_count > 0:
        counts = defaultdict(
            int,
            ((name, total) for name, total in counts.items() if total >= min_count),
        )
    return counts

def get_categories(df):
    """Return the category names that ``get_category_counts`` finds in ``df``."""
    return get_category_counts(df).keys()

In [168]:
# Names of the categories counted by get_category_counts.
# NOTE(review): later cells rebind `categories` inside their loops, so this
# value looks unused downstream — confirm before relying on it.
categories = get_categories(funding)

In [169]:
# Per-category counts; the cells below use them both as the set of known
# categories and as the numerator of each category's overall share.
counts = get_category_counts(funding)

In [170]:
def none_in_dictionary(l, d):
    """Return True when no element of ``l`` is a key of ``d``.

    An empty ``l`` also gives True.
    """
    return not any(item in d for item in l)

# Smallest number of qualifying rows a city needs to stay in the analysis.
MIN_COMPANIES_PER_CITY = 20

# Tally, per US region string, the rows that list at least one category
# present in `counts`.
cities = defaultdict(int)
for city, raw_categories in zip(funding['region'], funding['fundedorg_categories']):
    # A missing region is read as NaN, which does not support `in`.
    if pd.isnull(city) or "United States" not in city:
        continue
    categories = clean_category_list(raw_categories)
    if none_in_dictionary(categories, counts):
        continue
    cities[city] += 1

# Keep only the well-represented cities. `.items()` runs on both Python 2 and
# Python 3, whereas `.iteritems()` exists on Python 2 only.
cities = dict((name, total) for name, total in cities.items()
              if total >= MIN_COMPANIES_PER_CITY)

total_companies = sum(cities.values())

In [171]:
# Smallest number of listings a category needs inside one city before a ratio
# is reported for it.
MIN_CATEGORY_COUNT_IN_CITY = 10

# For every kept city, collect one entry per row and per known category that
# the row lists. One pass over the frame gives the same per-city tallies as
# scanning the whole frame again for each category.
city_lists = defaultdict(list)
for city, raw_categories in zip(funding['region'], funding['fundedorg_categories']):
    if city not in cities:
        continue
    seen = set()
    for c in clean_category_list(raw_categories):
        # Count a category at most once per row, and only if it is known.
        if c in seen or c not in counts:
            continue
        seen.add(c)
        city_lists[city].append(c)

# Ratio > 1 means the category is over-represented in the city compared with
# its overall share; ratio < 1 means it is under-represented.
category_ratios = defaultdict(lambda: defaultdict(float))
for city, listed_categories in city_lists.items():
    for category, count in Counter(listed_categories).most_common():
        if count < MIN_CATEGORY_COUNT_IN_CITY:
            continue
        percent_in_city = float(count) / cities[city]
        # NOTE(review): `counts` covers every US row while `total_companies`
        # only sums the cities that passed the size filter, so numerator and
        # denominator are drawn from different populations — confirm intent.
        total_percent = float(counts[category]) / total_companies
        category_ratios[city][category] = percent_in_city / total_percent

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(category_ratios)

defaultdict(<function <lambda> at 0x10d128398>, {'New York, New York, United States': defaultdict(<type 'float'>, {'Pharmaceutical': 0.7131332492236024, 'Commercial Real Estate': 2.3665458937198065, 'Travel': 1.4964107524691985, 'Blockchain': 1.3639698008138788, 'Sports': 1.5874966243586282, 'Enterprise Software': 0.7740206494497113, 'Consumer Goods': 1.4383681535855448, 'Insurance': 1.1807499768239547, 'Internet': 0.8298277809147375, 'Hospitality': 1.9777562111801241, 'Risk Management': 2.2538532321141016, 'Medical Device': 0.35796492510047495, 'Social Media': 1.3277244494635798, 'Transportation': 0.7355982526789981, 'Finance': 1.3268925419137907, 'Beauty': 2.098415078175198, 'E-Commerce': 1.54901185770751, 'Marketing': 1.5806243445994999, 'Media and Entertainment': 1.2170807453416148, 'SaaS': 0.9856639839034205, 'Digital Media': 1.2170807453416148, 'Retail Technology': 2.15524715320911, 'Information Services': 0.9736645962732919, 'Wellness': 1.217080745341615, 'Internet of Things': 0

In [172]:
# How many of the highest-ratio categories to report per city.
TOP_CATEGORY_COUNT = 5

json_list = []
for city, category_ratio in category_ratios.items():
    if "United States" not in city:
        continue
    max_ratios = Counter(category_ratio).most_common()[:TOP_CATEGORY_COUNT]
    top_categories = [name for name, _ratio in max_ratios]
    # %-formatting prints "<city> <list>" identically on Python 2 and 3.
    print("%s %s" % (city, top_categories))

    # Disabled single-category export, kept for reference until it is adapted
    # to the top-N list above:
    #   number_of_companies_in_category = Counter(city_lists[city])[top_category]
    #   # Perform one last filter on our data. If there are fewer than 7
    #   # companies in the top category for a city, eliminate it.
    #   if number_of_companies_in_category < 7:
    #       continue
    #   entry = {"city": city, "category": top_category,
    #            "numberOfCompanies": number_of_companies_in_category}
    #   json_list.append(entry)

# json_list stays empty while the export above is disabled, so this prints 0.
print(len(json_list))

#with open('data.json', 'w') as outfile:
    #json.dump(json_list, outfile)

New York, New York, United States ['Property Management', 'Fashion', 'Commercial Real Estate', 'Risk Management', 'Retail Technology']
San Diego, California, United States ['Therapeutics', 'Biotechnology', 'Medical', 'Health Care', 'SaaS']
Denver, Colorado, United States ['Information Technology', 'Software', 'Health Care']
Philadelphia, Pennsylvania, United States ['Health Care']
Atlanta, Georgia, United States ['Health Care']
Minneapolis, Minnesota, United States ['Medical Device', 'Health Care', 'Biotechnology']
San Francisco, California, United States ['PaaS', 'Computer Vision', '3D Technology', 'Autonomous Vehicles', 'Virtual Reality']
Boston, Massachusetts, United States ['Therapeutics', 'Health Diagnostics', 'Medical Device', 'Life Science', 'Biotechnology']
Chicago, Illinois, United States ['Internet', 'Enterprise Software', 'Information Technology', 'Financial Services', 'SaaS']
Seattle, Washington, United States ['Computer', 'E-Commerce', 'Internet', 'Machine Learning', 'Ente