In [28]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt 
import time

from collections import Counter
from collections import defaultdict
from wordcloud import WordCloud

## Read Crunchbase Data

In [29]:
df = pd.read_csv('crunchbase_data/crunchbase_blockchain_companies.csv', encoding='utf-8')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50) # Show full text
df

Unnamed: 0,Organization Name,Categories,Description,Founded Date,Estimated Revenue Range,Operating Status,Exit Date,Closed Date,Full Description,Category Groups,Website
0,Kraken,"Asset Management, Blockchain, Cryptocurrency, ...",Kraken is a cryptocurrency exchange that provi...,"Jul 28, 2011",$50M to $100M,Active,—,—,Our mission is to accelerate the adoption of c...,"Financial Services, Lending and Investments, P...",www.kraken.com/
1,Circle,"Banking, Blockchain, Cryptocurrency, Finance, ...","Circle is a global internet finance company, b...",2013,$10M to $50M,Active,—,—,Circle provides an online platform that enable...,"Financial Services, Lending and Investments, P...",www.circle.com/en/
2,Ripple,"Blockchain, Cryptocurrency, Financial Services...",Ripple provides one frictionless experience to...,2012,$10M to $50M,Active,—,—,Ripple provides one frictionless experience to...,"Financial Services, Internet Services, Payment...",ripple.com
3,High Fidelity,"Blockchain, Enterprise Software, Hardware, Sof...",High Fidelity is an open source software for s...,"Apr 1, 2013",$1M to $10M,Active,—,—,They believe that both the hardware and the in...,"Hardware, Software",highfidelity.com/
4,Elliptic,"Bitcoin, Blockchain, Finance",Elliptic makes cryptocurrency transaction acti...,"Oct 1, 2013",$1M to $10M,Active,—,—,Elliptic makes cryptocurrency transaction acti...,"Financial Services, Payments, Software",www.elliptic.co/
...,...,...,...,...,...,...,...,...,...,...,...
5438,Apāto,"Blockchain, Real Estate, Software",Decentralised real-estate ownership through di...,2019,—,Active,—,—,—,"Real Estate, Software",apato.company
5439,NadiFin,"Artificial Intelligence, Blockchain, FinTech, ...",NadiFin is a FinTech accelerator program for c...,2019,—,Active,—,—,NadiFin is a FinTech accelerator program for c...,"Artificial Intelligence, Data and Analytics, F...",www.nadifin.com/
5440,Anique,"Art, Blockchain",Anique is a service that utilizes blockchain t...,"Mar 12, 2019",—,Active,—,—,—,Media and Entertainment,anique.jp/
5441,.nanobile,"Artificial Intelligence, Blockchain, Communica...","Internet of Things, Artificial Intelligence an...","Mar 21, 2019",—,Active,—,—,"nanobile is the Internet of things (IoT), Arti...","Artificial Intelligence, Data and Analytics, H...",nanobile.com


### Validate the dataset

In [30]:
# Note: about 100 points were omitted due to API call results being inconsistent. Check that the data has important companies:
names_in_df = [name.lower().strip() for name in df['Organization Name'].tolist()]

names_to_find = ['cryptokitties', 'consensys', 'pixelplex', 'limechain']
for name in names_to_find:
    if name in names_in_df:
        print(name + ' found!')
    else:
        print(name + ' not found...')

# Check startups on the Master List
masterlist_startups = None
with open('masterlist_data/blockchain_startup_names.txt', 'r') as f:
    masterlist_startups = [line.lower().strip() for line in f]
    
found = []
for name in names_in_df:
    if name in masterlist_startups:
        found.append(name)
        
print('\n%d out of %d companies found in master list:' % (len(found), len(masterlist_startups)))
print(found)

cryptokitties found!
consensys not found...
pixelplex not found...
limechain found!

41 out of 111 companies found in master list:
['circle', 'ripple', 'coinbase', 'veem', 'provenance', 'omisego', 'bigchaindb', 'sensay', 'hive blockchain', 'crowdz', 'epiphyte', 'monax', 'global blockchain', 'cashaa', 'propy', 'parity technologies', 'bloq', 'aeternity', 'decent', 'dfinity', 'lisk', 'humaniq', 'hyperledger', 'chain of things', 'backfeed', 'trusttoken', 'shipchain', 'latoken', 'minthealth', 'connectjob', 'vechain', 'icobox', 'luna', 'blockmedx', 'herosphere', 'horizon state', 'decent', 'colony', 'cypherium', 'bitse', 'openchain']


### Keep only the relevant columns

In [31]:
# Filter columns
df = df[['Organization Name','Categories', 'Description', 'Founded Date', 'Estimated Revenue Range', 'Operating Status', 'Exit Date', 'Closed Date', 'Full Description', 'Category Groups', 'Website']]
df

Unnamed: 0,Organization Name,Categories,Description,Founded Date,Estimated Revenue Range,Operating Status,Exit Date,Closed Date,Full Description,Category Groups,Website
0,Kraken,"Asset Management, Blockchain, Cryptocurrency, ...",Kraken is a cryptocurrency exchange that provi...,"Jul 28, 2011",$50M to $100M,Active,—,—,Our mission is to accelerate the adoption of c...,"Financial Services, Lending and Investments, P...",www.kraken.com/
1,Circle,"Banking, Blockchain, Cryptocurrency, Finance, ...","Circle is a global internet finance company, b...",2013,$10M to $50M,Active,—,—,Circle provides an online platform that enable...,"Financial Services, Lending and Investments, P...",www.circle.com/en/
2,Ripple,"Blockchain, Cryptocurrency, Financial Services...",Ripple provides one frictionless experience to...,2012,$10M to $50M,Active,—,—,Ripple provides one frictionless experience to...,"Financial Services, Internet Services, Payment...",ripple.com
3,High Fidelity,"Blockchain, Enterprise Software, Hardware, Sof...",High Fidelity is an open source software for s...,"Apr 1, 2013",$1M to $10M,Active,—,—,They believe that both the hardware and the in...,"Hardware, Software",highfidelity.com/
4,Elliptic,"Bitcoin, Blockchain, Finance",Elliptic makes cryptocurrency transaction acti...,"Oct 1, 2013",$1M to $10M,Active,—,—,Elliptic makes cryptocurrency transaction acti...,"Financial Services, Payments, Software",www.elliptic.co/
...,...,...,...,...,...,...,...,...,...,...,...
5438,Apāto,"Blockchain, Real Estate, Software",Decentralised real-estate ownership through di...,2019,—,Active,—,—,—,"Real Estate, Software",apato.company
5439,NadiFin,"Artificial Intelligence, Blockchain, FinTech, ...",NadiFin is a FinTech accelerator program for c...,2019,—,Active,—,—,NadiFin is a FinTech accelerator program for c...,"Artificial Intelligence, Data and Analytics, F...",www.nadifin.com/
5440,Anique,"Art, Blockchain",Anique is a service that utilizes blockchain t...,"Mar 12, 2019",—,Active,—,—,—,Media and Entertainment,anique.jp/
5441,.nanobile,"Artificial Intelligence, Blockchain, Communica...","Internet of Things, Artificial Intelligence an...","Mar 21, 2019",—,Active,—,—,"nanobile is the Internet of things (IoT), Arti...","Artificial Intelligence, Data and Analytics, H...",nanobile.com


# Data Exploration

### See Datapoints based on Founded Year

In [32]:
# Create separate dfs by "Founded Year"
pd.options.mode.chained_assignment = None  # default='warn'
founded_years = []
for i in range(len(df)):
    year = df.loc[i, 'Founded Date'].split(' ')[-1].strip()
    if year != '—':
        assert(year[:2] == '20' or year[:2] == '19' or year[:2] == '18')
        assert(len(year) == 4)
        founded_years.append(year)
    else:
        founded_years.append(None)
        
df['Founded Year'] = founded_years

dfs_by_year = []
unique_years = sorted(list(set(filter(None, founded_years))))

# Only count companies founded after 2008 (blockchain startups not possible before then)
unique_years = list(filter(lambda year: int(year) >= 2008, unique_years))

for year in unique_years:
    year_df = df[df['Founded Year'] == year]
    
    dfs_by_year.append(year_df)
    
#df = df.drop(columns=['Founded Year'])
    
print('%d years found in data are >= 2008' % len(dfs_by_year))
print('Number of companies in each year:')
for d in dfs_by_year:
    print('%d ' % len(d), end='')

13 years found in data are >= 2008
Number of companies in each year:
25 34 47 56 67 149 228 282 520 1504 1295 196 9 

In [33]:
pd.set_option('display.max_colwidth', None) # Show full text
dfs_by_year[0]

Unnamed: 0,Organization Name,Categories,Description,Founded Date,Estimated Revenue Range,Operating Status,Exit Date,Closed Date,Full Description,Category Groups,Website,Founded Year
96,CloudCover,"Blockchain, Cyber Security, Insurance",AI-SOAR cyber security orchestration automation risk platform providing sub-second protection that enables cyber data insurance on-demand.,"Jan 11, 2008",Less than $1M,Active,—,—,"CloudCover is a full-spectrum cyber security platform utilizing AI machine learning to deliver real time risk aware protection and data security control. Our platform employs network risk analytics including risk score that enables underwriting cybersecurity data insurance on-demand, incrementally.\n\nCloudCover deploys as an AI-enhanced Security Orchestration Automation Risk Response (SOAR) network solution operating in microsecond speed at over nine-nines (99.9999999%) accuracy.\n\nOur CC/B1 extends onto a customer’s network as real-time sensors and therein utilizing AI/ML is capable of detecting-anticipating known, unseen and previously unknown threats at the network computing edge. The CC/B1 is module-based and may be custom configured into practically any network enterprise, telco and/or edge computing security device.\n\nSince inception, our CyberSafety Platform represents the most advance cybersecurity solution in the market today. CloudCover’s technology portfolio has integrated artificial intelligent systems and methods including blockchain technology that will revolutionize the unique protection that represent our cyber safety platform. We are transforming traditional data security and privacy methods into real time insured cyber safe, compliant ecosystems.","Financial Services, Information Technology, Privacy and Security",cloudcover.cc,2008
134,CloseCross,"Blockchain, Internet, Software",CloseCross is the first fully decentralized derivatives platform.,2008,—,Active,—,—,CloseCross opens up the massive financial derivatives market to the public giving them easy access to peer-to-peer derivative trading platform without any need for deep financial knowledge.,"Internet Services, Software",closecross.com/,2008
150,InXero,"Blockchain, Digital Marketing, Information Technology, Marketing, SaaS, Sales Automation, Social Media Marketing",InXero is industry’s first Sales Enablement and Go-to-Market platform for Solution Providers,2008,—,Active,—,—,"75% of world trade runs through solution providers / resellers. Yet taking new products and solutions to market remains massively complicated, highly manual, and risk-prone. There has been no simple, efficient and integrated way for resellers to market and sell their solutions. Until now.\n\nAt InXero, our mission is to simplify the entire go-to-market process so that resellers can move fast and sell solutions effortlessly - all with the latest industry intelligence, vendor content and digital reach. That’s why we have delivered the InXero platform - the industry’s first Sales Enablement and Go-to-Market platform for marketing and selling of single- and multi-vendor solutions for unique buyer needs. With InXero, your team will have all the benefits of close collaboration with your buyer, vendor and distributor partners while gaining the speed and agility needed to win in this fast-paced, dynamic world.","Information Technology, Sales and Marketing, Software",www.inxero.com,2008
215,SIX Swiss Exchange,"Blockchain, Payments",SIX Swiss Exchange is a stock exchange company.,2008,$10M to $50M,Active,—,—,—,"Financial Services, Payments",www.six-swiss-exchange.com/index.html,2008
221,Bitcoin.com,"Bitcoin, Blockchain, Cryptocurrency, Finance, Mining, Payments, Virtual Currency",Bitcoin.com is the premier source for everything Bitcoin related.,2008,—,Active,—,—,"Bitcoin.com is your premier source for everything Bitcoin related. They can help you buy Bitcoin and open a secure Bitcoin wallet. You can also read the latest news or engage with the community on the Bitcoin Forum. Please keep in mind that this is a commercial website that lists wallets, exchanges and other Bitcoin related companies and information.","Financial Services, Natural Resources, Payments, Software",bitcoin.com,2008
233,ArcTouch,"Android, Apps, Augmented Reality, Blockchain, Digital Marketing, Internet of Things, iOS, Smart Home, Software",We design lovable apps and develop technology for the connected world.,2008,—,Active,"Jan 27, 2016",—,"We're the app developers and designers behind well known products from hundreds of Fortune 500 companies, world-class brands, and influential startups. We specialize in app design and app development for iOS, Android, and Xamarin. We also help companies with their Alexa skills, web sites, back-end APIs, blockchain development, augmented reality experiences, and IoT smart products.","Apps, Consumer Electronics, Hardware, Internet Services, Mobile, Platforms, Real Estate, Sales and Marketing, Software",arctouch.com,2008
235,LivaRava,"Blockchain, Knowledge Management, PaaS",Probably is the best tool for knowledge management and sharing.,2008,Less than $1M,Active,—,—,"LivaRava is a web-based knowledge management system that allows its users to systematize and organize information such as text, images, audios, videos, and others. It also enables its users to share information with their friends.\n\nLivaRava was launched in 2008 and is operated from Nevada, United States.",Software,www.livarava.com,2008
264,NEOPLY,"Angel Investment, Blockchain, Consulting, Finance, Incubators, Information Technology, Internet, Venture Capital",NEOPLY is a startup accelerator based in South Korea.,"Apr 1, 2008",—,Active,—,—,"NEOPLY, a Seoul-based startup accelerator, is focused on investing in South Korean startups. \nNEOPLY provides mentoring, angel-funding and workspace for startups . NEOPLY has incubated more than 40 startups since 2008.\nNEOPLY's first exit was with SundayToz, set to be listed on the KOSDAQ in 2013.","Financial Services, Information Technology, Internet Services, Lending and Investments",www.neoply.com,2008
266,Bitcoin Group SE,"Blockchain, Financial Services, Information Technology",Bitcoin Group SE is a holding company focused on innovative and disruptive business models and technologies.,2008,—,Active,—,—,"Bitcoin Group SE is a holding company focused on innovative and disruptive business models and technologies in the areas of cryptocurrency and blockchain. Bitcoin Group SE owns 100% of the shares in Bitcoin Deutschland AG, which operates Germany's only authorized trading center for the digital currency Bitcoin under Bitcoin.de .","Financial Services, Information Technology",www.bitcoingroup.com/,2008
269,Digital Vega,"Banking, Blockchain, Financial Services, Marketplace, Training",Digital Vega is a privately held company operating the Medusa FX Option trading platform.,2008,$1M to $10M,Active,—,—,"Digital Vega is a privately held and independent company operating the pioneering Medusa FX Options trading platform. Our client driven focus and unparalleled liquidity has made us a market leader in the OTC FX Options marketplace.\n\nAs more OTC and derivatives markets migrate to electronic trading models, driven either by regulations or client demand, we will be at the forefront of those new developments. We will continue to add new financial products and support new asset classes and bring new market innovations to our award winning platform.","Commerce and Shopping, Education, Financial Services, Lending and Investments",www.digitalvega.com,2008


### Create histograms for different years

In [34]:
def get_column_words(df, column_name, year=None):
    if year:
        df = df[df['Founded Year'] == year]
    words_list = []
    for words in df[column_name].tolist():
        words = list(map(str.strip, words.split(',')))
        words = list(filter(lambda x: x != '—', words))
        words_list.extend(words)
    return words_list

def get_column_words_unique(df, column_name, year=None):
    words_list = get_column_words(df, column_name, year=None)
    unique_words_list = sorted(list(set(words_list)))
    return unique_words_list

def list_to_dict(list_item, vocab=None):
    if vocab:
        words_count_dict = dict.fromkeys(vocab,0)
        for word in list_item:
            words_count_dict[word] += 1
        return words_count_dict
    return Counter(list_item)

def show_word_cloud(words_list, title=''):
    words_count_dict = list_to_dict(words_list)
    wordcloud = WordCloud(width = 800, height = 800, 
                    background_color ='white',
                    min_font_size = 10)
    wordcloud.generate_from_frequencies(words_count_dict)
                    
    plt.figure(figsize = (3, 3), facecolor = None)
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
    if title != '':
        plt.title(title)
    plt.show()
    
def create_histogram(words_list, vocab, title='', savepath='', max_y=None, show=True):
    words_count_dict = get_hist_data(words_list, vocab)
        
    plt.ioff()
    fig = plt.figure(figsize = (10, 10), facecolor = None);
    if title != '':
        plt.title(title);
    plt.bar(list(words_count_dict.keys()), words_count_dict.values(), color='g');
    plt.xticks(rotation='vertical');
    if max_y:
        plt.ylim((0, max_y));
    plt.tight_layout();
    if savepath != '':
        plt.savefig(savepath);
        plt.close(fig)
        print('histogram saved: %s' % savepath)
    if show:
        plt.show()
        
    
def get_hist_data(words_list, vocab):
    words_count_dict = list_to_dict(words_list, vocab)
    words_count_dict = dict(sorted(words_count_dict.items(), key=lambda x: x[0]))
    return words_count_dict

In [35]:
hist_data = []

categories_vocab = get_column_words_unique(df, 'Category Groups')
for year in unique_years:
    categories = get_column_words(df, 'Category Groups', year=year)
    create_histogram(categories, categories_vocab, title=year, max_y=700, show=False, savepath='images/Category_Groups_%s.png' % year)

histogram saved: images/Category_Groups_2008.png
histogram saved: images/Category_Groups_2009.png
histogram saved: images/Category_Groups_2010.png
histogram saved: images/Category_Groups_2011.png
histogram saved: images/Category_Groups_2012.png
histogram saved: images/Category_Groups_2013.png
histogram saved: images/Category_Groups_2014.png
histogram saved: images/Category_Groups_2015.png
histogram saved: images/Category_Groups_2016.png
histogram saved: images/Category_Groups_2017.png
histogram saved: images/Category_Groups_2018.png
histogram saved: images/Category_Groups_2019.png
histogram saved: images/Category_Groups_2020.png
