<h1 style="text-align:center">Analyzing CrunchBase Dataset</h1>

In [26]:
!ls -al

total 10120
drwxrwxrwx 3 dq root     4096 Nov 13 02:32 .
drwxr-xr-x 1 dq dq       4096 Nov 13 03:12 ..
-rwxrwxrwx 1 dq root      605 Nov 13 02:32 Basics.ipynb
-rwxrwxrwx 1 dq root 10339663 Jan  1  1970 crunchbase-investments.csv
drwxr-xr-x 2 dq root     4096 Nov 13 02:32 .ipynb_checkpoints


## Lets Chunk 5000 rows and analyze the data

In [27]:
import pandas as pd
pd.options.display.max_columns = 99
first_five_rows = pd.read_csv('crunchbase-investments.csv', nrows=5)
first_five_rows.head()

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012,2000000
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012,20000
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012,20000
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012,20000
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011,20000


## Ran into an issue reading the file as UTF-8 Encoding 

In [28]:
import chardet

with open('crunchbase-investments.csv', mode='rb') as f:
    raw_bytes = f.read(100000)
    
encoding_name = chardet.detect(raw_bytes)
print(encoding_name)

{'encoding': 'Windows-1252', 'confidence': 0.7298014120085498, 'language': ''}


In [29]:
first_5000_rows = pd.read_csv('crunchbase-investments.csv', nrows=5000, encoding='ISO8859-1')
first_5000_rows.head()

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012,2000000.0
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012,20000.0
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012,20000.0
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012,20000.0
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011,20000.0


## Lets calculate the missing value counts

In [52]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO8859-1')
missing_value_counts=[]
num_rows = 0
total_memory = 0
col_memory = []

for chunk in chunk_iter:
    num_rows += len(chunk.index)
    missing_value_counts.append(chunk.isna().sum())
    
    # lets calculate the total & column memory
    temp_col_memory = chunk.memory_usage(deep=True)
    col_memory.append(temp_col_memory)
    total_memory += temp_col_memory.sum()
    
# total memeory in MB
total_memory = total_memory/(1024 ** 2)
    
# figure out missing value counts
combined_df = pd.concat(missing_value_counts)
final_df = pd.DataFrame({'Missing_value_counts':combined_df.groupby(combined_df.index).sum()})
#print(final_df['missing_val_counts'].sort_values())

# figure out col memory
combined_df = pd.concat(col_memory)
final_df['col_memory_in_mb'] = round(combined_df.groupby(combined_df.index).sum()/(1024**2),2)
print(final_df)

print('\nNumber of columns: {}'.format(len(final_df.index)))
print('Number of records: {}'.format(num_rows))
print('Total Memory in MB: {}'. format(round(total_memory,2)))
print('Average Col Memory Size in MB: {}'.format(final_df['col_memory_in_mb'].mean()))

                        Missing_value_counts  col_memory_in_mb
company_category_code                    643              3.26
company_city                             533              3.34
company_country_code                       1              3.03
company_name                               1              3.42
company_permalink                          1              3.87
company_region                             1              3.25
company_state_code                       492              2.96
funded_at                                  3              3.38
funded_month                               3              3.23
funded_quarter                             3              3.23
funded_year                                3              0.40
funding_round_type                         3              3.25
investor_category_code                 50427              0.59
investor_city                          12480              2.75
investor_country_code                  12001           

### We can ignore company_permalink, investor_permalink, funded_month,funded_quarter and funded_year

In [58]:
keep_cols = set(chunk.columns) - set(['company_permalink', 'investor_permalink'\
                                   , 'funded_month','funded_quarter', 'funded_year'])
keep_cols

{'company_category_code',
 'company_city',
 'company_country_code',
 'company_name',
 'company_region',
 'company_state_code',
 'funded_at',
 'funding_round_type',
 'investor_category_code',
 'investor_city',
 'investor_country_code',
 'investor_name',
 'investor_region',
 'investor_state_code',
 'raised_amount_usd'}

In [61]:
# find difference between two lists
def diff_bn_lists(l1, l2):
    return list(set(l1).symmetric_difference(l2))

In [63]:
# initialize
numeric_cols, string_cols = [],[]
nums_in_strings = {}
num_rows = 0
date_cols = ['funded_at']


# lets chunk the file for every 3225 rows <5MB size
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, usecols=keep_cols\
                         , encoding='ISO8859-1')


# lets determine the numeric and string columns first
for chunk in chunk_iter:        
    # lets handle date columns
    for col in date_cols:
        chunk[col] = pd.to_datetime(chunk[col], format='%Y-%m-%d')
        
    # lets figure out the objects and numericals
    temp_numeric_cols = list(chunk.select_dtypes(include=['int','float']).columns)
    temp_string_cols = list(chunk.select_dtypes(include=['object']).columns)
    
    # lets handle string columns
    if len(diff_bn_lists(string_cols, temp_string_cols)) > 0:
        string_cols = list(set(string_cols).union(set(temp_string_cols)))
        
    # lets find any numeric columns in string_cols
    temp_nums_in_string = (set(temp_numeric_cols).union(set(numeric_cols)))\
    .intersection(set(string_cols))
    
    # if we find any numbers in string cols remove them and document the value counts
    if len(temp_nums_in_string) > 0:
        
        # lets remove them from numeric_cols
        numeric_cols = list(set(numeric_cols) - temp_nums_in_string)
        temp_numeric_cols = list(set(temp_numeric_cols) - temp_nums_in_string)
        
       
        # lets add them to the nums_in_strings
        for col in temp_nums_in_string:
            # lets find the outliers
            mask = pd.to_numeric(chunk[col], errors='coerce').isna()
            temp_chunk_col = chunk[col][mask]
            # remove the nulls
            temp_chunk_col = temp_chunk_col[~temp_chunk_col.isna()]
            
            # now add them to dictionary
            if not col in nums_in_strings:
                nums_in_strings[col] = [temp_chunk_col.value_counts()]
            else:
                nums_in_strings[col].append(temp_chunk_col.value_counts())
     
    # lets handle numeric columns
    if len(diff_bn_lists(numeric_cols, temp_numeric_cols)) > 0:
        numeric_cols = list(set(numeric_cols).union(set(temp_numeric_cols)))
        
    # lets calculate the total memory
    total_memory += chunk.memory_usage(deep=True).sum()
    
    # lets add the chunk rows to total rows
    num_rows += len(chunk)
        
# lets groupby the value counts and find the sums
for col in nums_in_strings:
    temp_val_col = pd.concat(nums_in_strings[col])
    nums_in_strings[col] = temp_val_col.groupby(temp_val_col.index).sum()
    
# total memeory in MB
total_memory = total_memory/(1024 ** 2)

print('Number of String Columns: {0} & Number of Numeric Columns: {1}'.\
      format(len(string_cols), len(numeric_cols)))
print('Total Memory in MB: {}'. format(round(total_memory,2)))

Number of String Columns: 13 & Number of Numeric Columns: 1
Total Memory in MB: 38.54


In [64]:
nums_in_strings

{'investor_category_code': Series([], Name: investor_category_code, dtype: int64),
 'investor_city': Series([], Name: investor_city, dtype: int64),
 'investor_country_code': Series([], Name: investor_country_code, dtype: int64),
 'investor_state_code': Series([], Name: investor_state_code, dtype: int64)}

In [65]:
print(string_cols, numeric_cols)

['investor_category_code', 'investor_region', 'investor_city', 'funding_round_type', 'company_category_code', 'investor_country_code', 'company_state_code', 'company_country_code', 'company_region', 'company_city', 'company_name', 'investor_state_code', 'investor_name'] ['raised_amount_usd']


In [70]:
# initialize
unique_string_cols = {}
category_cand_cols = []
float_no_missing_values = {}
integer_cand_cols = []
has_next = True

## lets chunk the file for every 3225 rows <5MB size
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, usecols=keep_cols\
                         , encoding='ISO8859-1')


# lets determine the numeric and string columns first
for chunk in chunk_iter:         
    for col in string_cols:
        if not col in unique_string_cols:
            unique_string_cols[col] = [pd.Series(chunk[col].unique())]
        else:
            unique_string_cols[col].append(pd.Series(chunk[col].unique()))
            
    for col in numeric_cols:
        if not col in float_no_missing_values:
            float_no_missing_values[col] = [pd.Series(chunk[col].isna().sum())]
        else:
            float_no_missing_values[col].append(pd.Series(chunk[col].isna().sum()))

# figure out the string columns which are candidate for categories
for col in unique_string_cols:
    unique_string_cols[col] = pd.concat(unique_string_cols[col]).unique()
    
    if len(unique_string_cols[col]) <= 100:
        category_cand_cols.append(col)
        
# figure out the numerical columns which are candidate for integers
for col in float_no_missing_values:
    float_no_missing_values[col] = pd.concat(float_no_missing_values[col]).sum()
    
    if float_no_missing_values[col] == 0:
        integer_cand_cols.append(col)
    
print(category_cand_cols)

['company_state_code', 'company_country_code', 'investor_category_code', 'funding_round_type', 'investor_state_code', 'company_category_code', 'investor_country_code']


In [71]:
print(integer_cand_cols)

[]


In [72]:
for col in unique_string_cols:
    print("Number of Unique Values for '{}': {}".format(col, len(unique_string_cols[col])))

Number of Unique Values for 'company_state_code': 51
Number of Unique Values for 'company_country_code': 3
Number of Unique Values for 'company_region': 547
Number of Unique Values for 'investor_category_code': 34
Number of Unique Values for 'investor_region': 586
Number of Unique Values for 'investor_city': 991
Number of Unique Values for 'funding_round_type': 10
Number of Unique Values for 'company_city': 1230
Number of Unique Values for 'company_name': 11574
Number of Unique Values for 'investor_state_code': 51
Number of Unique Values for 'company_category_code': 44
Number of Unique Values for 'investor_name': 10466
Number of Unique Values for 'investor_country_code': 73


In [73]:
for col in float_no_missing_values:
    print("Number of Null Values for '{}': {}".format(col, float_no_missing_values[col]))

Number of Null Values for 'raised_amount_usd': 3599


In [75]:
# initialize
total_memory = 0
num_rows = 0

## lets chunk the file for every 3225 rows <5MB size
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, usecols=keep_cols\
                         , encoding='ISO8859-1')


# lets determine the numeric and string columns first
for chunk in chunk_iter:
        
    # lets handle date columns
    for col in date_cols:
        chunk[col] = pd.to_datetime(chunk[col], format='%Y-%m-%d')
        
    for col in category_cand_cols:
        chunk[col] = chunk[col].astype('category')
    
    for col in integer_cand_cols:
        chunk[col] = chunk[col].astype('int')
        chunk[col] = pd.to_numeric(chunk[col], downcast='integer')
        
    for col in set(numeric_cols)- set(integer_cand_cols):
        chunk[col] = chunk[col].astype('float')
        chunk[col] = pd.to_numeric(chunk[col], downcast='float')
        
    # lets calculate the total memory
    total_memory += chunk.memory_usage(deep=True).sum()
    
    # lets add the chunk rows to total rows
    num_rows += len(chunk)
        
# total memeory in MB
total_memory = total_memory/(1024 ** 2)

print('Number of rows: {} & Total Memory in MB: {}'. format(num_rows, round(total_memory,2)))


Number of rows: 52870 & Total Memory in MB: 20.86


In [77]:
import sqlite3
conn = sqlite3.connect('crunch.db')
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, usecols=keep_cols\
                         , encoding='ISO8859-1')


# lets determine the numeric and string columns first
for chunk in chunk_iter:
        
    # lets handle date columns
    for col in date_cols:
        chunk[col] = pd.to_datetime(chunk[col], format='%Y-%m-%d')
        
    for col in category_cand_cols:
        chunk[col] = chunk[col].astype('category')
    
    for col in integer_cand_cols:
        chunk[col] = chunk[col].astype('int')
        chunk[col] = pd.to_numeric(chunk[col], downcast='integer')
        
    for col in set(numeric_cols)- set(integer_cand_cols):
        chunk[col] = chunk[col].astype('float')
        chunk[col] = pd.to_numeric(chunk[col], downcast='float')

    # load into investments table
    chunk.to_sql("investments", conn, if_exists='append', index=False)

In [80]:
chunk.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 50000 to 52869
Data columns (total 15 columns):
company_name              2870 non-null object
company_category_code     2860 non-null category
company_country_code      2870 non-null category
company_state_code        2841 non-null category
company_region            2870 non-null object
company_city              2825 non-null object
investor_name             2870 non-null object
investor_category_code    0 non-null category
investor_country_code     0 non-null category
investor_state_code       0 non-null category
investor_region           2870 non-null object
investor_city             0 non-null float64
funding_round_type        2870 non-null category
funded_at                 2870 non-null datetime64[ns]
raised_amount_usd         2580 non-null float32
dtypes: category(7), datetime64[ns](1), float32(1), float64(1), object(5)
memory usage: 1011.8 KB


In [82]:
cur = conn.cursor()
cur.execute('pragma table_info(investments)')
results_df = cur.fetchall()
results_df

[(0, 'company_name', 'TEXT', 0, None, 0),
 (1, 'company_category_code', 'TEXT', 0, None, 0),
 (2, 'company_country_code', 'TEXT', 0, None, 0),
 (3, 'company_state_code', 'TEXT', 0, None, 0),
 (4, 'company_region', 'TEXT', 0, None, 0),
 (5, 'company_city', 'TEXT', 0, None, 0),
 (6, 'investor_name', 'TEXT', 0, None, 0),
 (7, 'investor_category_code', 'TEXT', 0, None, 0),
 (8, 'investor_country_code', 'TEXT', 0, None, 0),
 (9, 'investor_state_code', 'TEXT', 0, None, 0),
 (10, 'investor_region', 'TEXT', 0, None, 0),
 (11, 'investor_city', 'TEXT', 0, None, 0),
 (12, 'funding_round_type', 'TEXT', 0, None, 0),
 (13, 'funded_at', 'TIMESTAMP', 0, None, 0),
 (14, 'raised_amount_usd', 'REAL', 0, None, 0)]

In [84]:
cur.execute('select count(*) from investments')
cur.fetchall()

[(52870,)]

In [88]:
investments = pd.read_sql('select * from investments limit 10',conn)
investments.head(5)

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,raised_amount_usd
0,AdverCar,advertising,USA,CA,SF Bay,San Francisco,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30 00:00:00,2000000.0
1,LaunchGram,news,USA,CA,SF Bay,Mountain View,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23 00:00:00,20000.0
2,uTaP,messaging,USA,,United States - Other,,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01 00:00:00,20000.0
3,ZoopShop,software,USA,OH,Columbus,columbus,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15 00:00:00,20000.0
4,eFuneral,web,USA,OH,Cleveland,Cleveland,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08 00:00:00,20000.0
