# Analyzing Startup Fundraising Deals from Crunchbase


The data set of investments we'll be exploring is current as of October 2013.

In [1]:
import numpy as np
import pandas as pd

first_five = pd.read_csv('crunchbase-investments.csv', encoding='Latin-1').head()
first_five

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012.0,2000000.0
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012.0,20000.0
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012.0,20000.0
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012.0,20000.0
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011.0,20000.0


In [2]:
# Chunking 5000 rows

chunking = pd.read_csv('crunchbase-investments.csv', encoding='Latin-1', chunksize=5000)
chunk_mem = []
for chunk in chunking:
    chunk_mem.append(chunk.memory_usage(deep=True).sum() / 2**20)
    
chunk_mem

[5.579195022583008,
 5.528186798095703,
 5.535004615783691,
 5.528162956237793,
 5.5243072509765625,
 5.553412437438965,
 5.531391143798828,
 5.509613037109375,
 5.396090507507324,
 4.63945198059082,
 2.663668632507324]

In [3]:
# Each column's missing value counts

chunk_iter = pd.read_csv('crunchbase-investments.csv', encoding='Latin-1', chunksize=5000)
missing = []
for chunk in chunk_iter:
    # floats = chunk.select_dtypes(include=['float'])
    missing.append(chunk.apply(pd.isnull).sum())
    
combined_missing = pd.concat(missing)
combined_missing.groupby(combined_missing.index).sum().sort_values()

company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

In [4]:
# Total chunk footprint and each column's memory footprint
chunk_iter = pd.read_csv('crunchbase-investments.csv', encoding='Latin-1', chunksize=5000)
chunk_mem = []
for chunk in chunk_iter:
    chunk_mem.append(chunk.memory_usage(deep=True) / 2**20)

combined_mem = pd.concat(chunk_mem)
print("Total memory footprints: {}".format(combined_mem.sum()))
combined_mem.groupby(combined_mem.index).sum().sort_values()

Total memory footprints: 56.988484382629395


Index                     0.000877
funded_year               0.403366
raised_amount_usd         0.403366
investor_category_code    0.593590
investor_state_code       2.361876
investor_country_code     2.524654
investor_city             2.751430
company_state_code        2.962161
company_country_code      3.025223
funded_quarter            3.226837
funded_month              3.226837
investor_region           3.238946
funding_round_type        3.252704
company_region            3.253541
company_category_code     3.262619
company_city              3.343512
funded_at                 3.378091
company_name              3.424955
investor_name             3.734270
company_permalink         3.869808
investor_permalink        4.749821
dtype: float64

From observation, columns which we can drop because of lack of usefulness for analysis are: company_permalink, investor_permalink, funded_month, funded_quarter, and funded_year.

In [5]:
# convert to category
convert_col_dtypes = {
    "company_category_code": "category", "investor_category_code": "category", 
    "company_country_code": "category", "investor_country_code": "category", "investor_state_code": "category",
    "company_state_code": "category", "company_city": "category", "investor_city": "category"
}

# useful columns
useful_col = ['company_name', 'company_category_code', 'company_country_code', 'company_state_code', 'company_region', 'company_city', 'investor_name', 'investor_category_code', 'investor_country_code', 'investor_state_code', 'investor_region', 'investor_city', 'funding_round_type', 'funded_at', 'raised_amount_usd']

In [10]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', encoding='Latin-1', chunksize=5000, dtype=convert_col_dtypes, parse_dates=["funded_at"])
mv_counts = {}
chunk_mem = []
for chunk in chunk_iter:
    chunk = chunk[useful_col]
    chunk = chunk.dropna(how='all')
    float_cols = chunk.select_dtypes(include=['float'])
    for col in float_cols.columns:
        missing_values = len(chunk) - chunk[col].count()
        if col in mv_counts:
            mv_counts[col] = mv_counts[col] + missing_values
        else:
            mv_counts[col] = missing_values
    chunk_mem.append(chunk.memory_usage(deep=True).sum() / 2**20)
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')
    

print(chunk.dtypes)
print(sum(chunk_mem))

company_name                      object
company_category_code           category
company_country_code            category
company_state_code              category
company_region                    object
company_city                    category
investor_name                     object
investor_category_code          category
investor_country_code           category
investor_state_code             category
investor_region                   object
investor_city                   category
funding_round_type                object
funded_at                 datetime64[ns]
raised_amount_usd                float32
dtype: object
19.48626708984375


The dtype of each column is already optimized. We can reduce the file to 19.49 MB. It is huge improvement for dataframe optimization, at around 66% optimization.

Now, we need to expand the processing code to export each chunk to a new table in the new SQLite database.

In [11]:
import sqlite3
conn = sqlite3.connect('investments.db')

chunk_iter = pd.read_csv('crunchbase-investments.csv', encoding='Latin-1', chunksize=5000, dtype=convert_col_dtypes, parse_dates=["funded_at"])

chunk_mem = []
for chunk in chunk_iter:
    chunk = chunk[useful_col]
    chunk = chunk.dropna(how='all')
    chunk_mem.append(chunk.memory_usage(deep=True).sum() / 2**20)
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')
    chunk.to_sql('investments', conn, if_exists='append', index=False)

In [12]:
# Now we can query the code

pd.read_sql('select * from investments', conn)

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,raised_amount_usd
0,AdverCar,advertising,USA,CA,SF Bay,San Francisco,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30 00:00:00,2000000.0
1,LaunchGram,news,USA,CA,SF Bay,Mountain View,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23 00:00:00,20000.0
2,uTaP,messaging,USA,,United States - Other,,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01 00:00:00,20000.0
3,ZoopShop,software,USA,OH,Columbus,columbus,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15 00:00:00,20000.0
4,eFuneral,web,USA,OH,Cleveland,Cleveland,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08 00:00:00,20000.0
5,Tackk,web,USA,OH,Cleveland,Cleveland,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-02-01 00:00:00,20000.0
6,Acclaimd,analytics,USA,OH,Columbus,Columbus,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-06-01 00:00:00,20000.0
7,Acclaimd,analytics,USA,OH,Columbus,Columbus,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-08-07 00:00:00,70000.0
8,ToVieFor,ecommerce,USA,NY,New York,New York,2010 NYU Stern Business Plan Competition,,,,unknown,,angel,2010-04-01 00:00:00,75000.0
9,OHK Labs,sports,USA,FL,Palm Beach,Boca Raton,22Hundred Group,,,,unknown,,angel,2011-09-01 00:00:00,100000.0


In [15]:
pd.read_sql('PRAGMA table_info(investments)', conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,company_name,TEXT,0,,0
1,1,company_category_code,TEXT,0,,0
2,2,company_country_code,TEXT,0,,0
3,3,company_state_code,TEXT,0,,0
4,4,company_region,TEXT,0,,0
5,5,company_city,TEXT,0,,0
6,6,investor_name,TEXT,0,,0
7,7,investor_category_code,TEXT,0,,0
8,8,investor_country_code,TEXT,0,,0
9,9,investor_state_code,TEXT,0,,0


In [None]:
!wc