In [228]:
import numpy as np
import pandas as pd

# import companies using encoding = "ISO-8859-1"
companies = pd.read_csv("C:/Users/ranji/Desktop/ics/companies.txt", sep="\t", encoding = "ISO-8859-1")

In [229]:
#inspecting companies dataframe
print(companies.describe())
print(companies.isnull().sum())

#finding % of nulls in companies
print(100*round(companies.isnull().sum()/len(companies.index),2))

                   permalink   name                homepage_url category_list  \
count                  66368  66367                       61310         63220   
unique                 66368  66102                       61191         27296   
top     /Organization/Sofive  Blink  http://www.askforoffer.com      Software   
freq                       1      4                           5          3995   

           status country_code state_code       region           city  \
count       66368        59410      57821        58338          58340   
unique          4          137        311         1092           5111   
top     operating          USA         CA  SF Bay Area  San Francisco   
freq        53034        37601      12900         8804           3526   

        founded_at  
count        51147  
unique        3978  
top     01-01-2012  
freq          2730  
permalink            0
name                 1
homepage_url      5058
category_list     3148
status               0
country_

In [230]:
#dropping columns which will not influence analysis
companies.drop(["homepage_url","state_code","region","city","founded_at"],axis=1,inplace=True)

#inspecting companies dataframeafter cleaning
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 5 columns):
permalink        66368 non-null object
name             66367 non-null object
category_list    63220 non-null object
status           66368 non-null object
country_code     59410 non-null object
dtypes: object(5)
memory usage: 2.5+ MB


In [231]:
# import rounds2 using encoding = "ISO-8859-1"
rounds2 = pd.read_csv("C:/Users/ranji/Desktop/ics/rounds2.csv", sep=",", encoding = "ISO-8859-1")

In [232]:
#inspecting rounds2 dataframe
print(rounds2.describe())
print(rounds2.isnull().sum())

#null % in columns of rounds2
print(100*round(rounds2.isnull().sum()/len(rounds2.index),2))

       raised_amount_usd
count       9.495900e+04
mean        1.042687e+07
std         1.148212e+08
min         0.000000e+00
25%         3.225000e+05
50%         1.680511e+06
75%         7.000000e+06
max         2.127194e+10
company_permalink              0
funding_round_permalink        0
funding_round_type             0
funding_round_code         83809
funded_at                      0
raised_amount_usd          19990
dtype: int64
company_permalink           0.0
funding_round_permalink     0.0
funding_round_type          0.0
funding_round_code         73.0
funded_at                   0.0
raised_amount_usd          17.0
dtype: float64


In [233]:
#handling special characters in companies_permalink and permalink column of rounds2 and companies
rounds2['company_permalink']=rounds2['company_permalink'].str.encode('ISO-8859-1').str.decode('ascii','ignore')
companies['permalink']=companies['permalink'].str.encode('ISO-8859-1').str.decode('ascii','ignore')

#converting permalink column of two dataframes to lowercase
rounds2['company_permalink']=rounds2['company_permalink'].str.lower()
companies['permalink']=companies['permalink'].str.lower()

#finding no of unique values in  rounds2 and companies in two different ways
print(rounds2.groupby(['company_permalink']).size().shape[0])
print(companies['permalink'].nunique())

66368
66368


In [234]:
#comparing for unique values in each column and no of entries to decide on id column:- permalink seems to be the ideal 
# key for companies dataframe
print(companies.describe())

                            permalink   name category_list     status  \
count                           66368  66367         63220      66368   
unique                          66368  66102         27296          4   
top     /organization/flip-flop-shops  Blink      Software  operating   
freq                                1      4          3995      53034   

       country_code  
count         59410  
unique          137  
top             USA  
freq          37601  


In [235]:
#creating master_frame after merging rounds2 with companies using left join on company_permalink to permalink to 
#ensure no value gets neglected in rounds2.indicator creates a new column _merge which indicates whether a value in 
#left dataframe is present in right dataframe (both) or not(left_only)

master_frame=rounds2.merge(companies,left_on='company_permalink',right_on='permalink',how='left',indicator=True)

In [236]:
#if entry in _merge column as left_only is 0 then all columns in rounds2 are present in companies 
print(master_frame.loc[master_frame._merge =='left_only'].shape[0])

0


In [237]:
#remove extra column permalink created in master_frame due to merge
master_frame.set_index('company_permalink')
master_frame.drop(['permalink','_merge'],axis=1,inplace=True)
print(master_frame.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114949 entries, 0 to 114948
Data columns (total 10 columns):
company_permalink          114949 non-null object
funding_round_permalink    114949 non-null object
funding_round_type         114949 non-null object
funding_round_code         31140 non-null object
funded_at                  114949 non-null object
raised_amount_usd          94959 non-null float64
name                       114948 non-null object
category_list              111539 non-null object
status                     114949 non-null object
country_code               106271 non-null object
dtypes: float64(1), object(9)
memory usage: 9.6+ MB
None


In [238]:
#dropping null values in raised_amount_usd column as this is the basis for analysis and cannot be null
master_frame = master_frame[~np.isnan(master_frame['raised_amount_usd'])]

In [239]:
#export merged master_frame for tableau analysis
master_frame.to_csv('C:/Users/ranji/Desktop/ics/master.txt', sep="\t")

In [240]:
#identifying most ideal funding type with average between 5 million to 15 million usd
#creating master_group to store groupby object of master_frame
master_group=master_frame.groupby('funding_round_type').raised_amount_usd.mean().sort_values()
master_group[master_group<=15000000].sort_values(ascending=False).index[0]

'venture'

In [241]:
#filtering for funding type as venture
master_frame = master_frame.loc[master_frame.funding_round_type=='venture',:]

In [242]:
#inspecting filtered master_frame
master_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50228 entries, 0 to 114941
Data columns (total 10 columns):
company_permalink          50228 non-null object
funding_round_permalink    50228 non-null object
funding_round_type         50228 non-null object
funding_round_code         28043 non-null object
funded_at                  50228 non-null object
raised_amount_usd          50228 non-null float64
name                       50228 non-null object
category_list              49726 non-null object
status                     50228 non-null object
country_code               48111 non-null object
dtypes: float64(1), object(9)
memory usage: 4.2+ MB


In [243]:
print(100*round(master_frame.isnull().sum()/len(master_frame.index),2))

#neglect rows with null values in country_code. This has to be done because no satisfactory imputation can be done
master_frame = master_frame[~pd.isnull(master_frame['country_code'])]

#finding raised amount of country as % of total raised amount
master_frame.groupby('country_code').raised_amount_usd.sum().sort_values(ascending=False)/master_frame['raised_amount_usd'].sum()*100

company_permalink           0.0
funding_round_permalink     0.0
funding_round_type          0.0
funding_round_code         44.0
funded_at                   0.0
raised_amount_usd           0.0
name                        0.0
category_list               1.0
status                      0.0
country_code                4.0
dtype: float64


country_code
USA    74.830995
CHN     7.055260
GBR     3.585708
IND     2.548945
CAN     1.697306
FRA     1.285738
ISR     1.223392
DEU     1.124112
JPN     0.595742
SWE     0.576485
NLD     0.520598
CHE     0.500790
SGP     0.494831
ESP     0.325145
BRA     0.316317
IRL     0.296860
RUS     0.278139
AUS     0.234305
DNK     0.217546
BEL     0.189604
FIN     0.184761
NOR     0.169351
KOR     0.166482
MYS     0.156399
HKG     0.138370
AUT     0.111749
TWN     0.110513
TUR     0.099022
ITA     0.086481
NZL     0.079401
         ...    
KWT     0.002480
LIE     0.002319
MNE     0.002161
SVN     0.002128
BGR     0.002001
KAZ     0.001948
GRC     0.001903
BAH     0.001576
TTO     0.001505
SVK     0.001460
BGD     0.001240
LBN     0.001143
GGY     0.000701
TUN     0.000694
SEN     0.000507
HRV     0.000466
UGA     0.000443
PER     0.000437
BWA     0.000398
PAN     0.000372
LAO     0.000372
MAR     0.000283
MUS     0.000266
PRI     0.000255
ECU     0.000171
MCO     0.000116
SAU     0.000089
C

In [244]:
#create groupby object countryWise_TotInvest which outputs list of top 9 countries in terms of total funding amount 
#raised to a list object top9List. This is used for creating top9 dataframe.

countryWise_TotInvest=master_frame.groupby("country_code").raised_amount_usd.sum().sort_values(ascending=False).head(9)
top9List=countryWise_TotInvest.index.get_level_values('country_code').tolist()
top9=master_frame[master_frame["country_code"].isin(top9List)]

print(top9)

                                        company_permalink  \
0                                     /organization/-fame   
4                                   /organization/0-6-com   
8                     /organization/0ndine-biomedical-inc   
10                                   /organization/0xdata   
11                                   /organization/0xdata   
12                                   /organization/0xdata   
22                             /organization/1-mainstream   
28                          /organization/10-minutes-with   
34                             /organization/1000memories   
38                          /organization/1000museums-com   
39                          /organization/1000museums-com   
41                          /organization/1000museums-com   
44                          /organization/1000museums-com   
46                               /organization/1001-menus   
47                               /organization/1001-menus   
49                      

In [245]:
#engCountries is list of country codes of english speaking countries from
#Countries_where_English_is_an_official_language.pdf.Select top 3 english speaking countries from top9 dataframe
engCountries=['BWA', 'CMR', 'ETH', 'ERI', 'GMB', 'GHA', 'KEN', 'LSO', 'LBR', 'MWI', 'MUS', 'NAM', 'NGA', 'RWA', 'SYC', 'SLE', 'ZAF', 'SSD', 'SDN', 'SWZ', 'TAN', 'UGA', 'ZMB', 'ZWE', 'ATG', 'BAH', 'BRB', 'BLZ', 'CAN', 'DMA', 'GRD', 'GUY', 'JAM', 'KNA', 'LCA', 'VCT', 'TTO', 'USA', 'AUS', 'FJI', 'KIR', 'MHL', 'FSM', 'NRU', 'NZL', 'PLW', 'PNG', 'WSM', 'SLB', 'TON', 'TUV', 'VUT', 'IND', 'PAK', 'PHL', 'SGP', 'IRL', 'MLT', 'GBR']
print(top9[top9['country_code'].isin(engCountries)].groupby('country_code').raised_amount_usd.sum().sort_values(ascending=False).head(3))

country_code
USA    4.225108e+11
GBR    2.024563e+10
IND    1.439186e+10
Name: raised_amount_usd, dtype: float64


In [246]:
#Assign random value BlankVals to category_list where null values are present.This does not affect the analysis
master_frame.loc[pd.isnull(master_frame['category_list']),'category_list']='BlankVals'

#Split the category_list values on |and assign first category to new column primary_cat
master_frame['primary_cat']=master_frame['category_list'].apply(lambda x:x.split('|')[0])

print(master_frame)

                                        company_permalink  \
0                                     /organization/-fame   
4                                   /organization/0-6-com   
8                     /organization/0ndine-biomedical-inc   
10                                   /organization/0xdata   
11                                   /organization/0xdata   
12                                   /organization/0xdata   
22                             /organization/1-mainstream   
28                          /organization/10-minutes-with   
34                             /organization/1000memories   
38                          /organization/1000museums-com   
39                          /organization/1000museums-com   
41                          /organization/1000museums-com   
44                          /organization/1000museums-com   
46                               /organization/1001-menus   
47                               /organization/1001-menus   
49                      

In [247]:
#import mapping file to map main sector to sub-sectors
mapping = pd.read_csv("C:/Users/ranji/Desktop/ics/mapping.csv", sep=",", encoding = "ISO-8859-1")

#convert the wide format file to long format using melt function of dataframe and filter for values=1
mapping=pd.melt(mapping,id_vars="category_list")
mapping=mapping[mapping['value']==1]

#handling blanks and dirty data in mapping file by replacing 'na' for 0  and Changing Enterprise 2.na to Enterprise 2.0 
mapping['category_list'].fillna(value="BlankVal",inplace=True)
mapping.replace(to_replace='0',value='na',regex=True,inplace=True)
mapping.loc[mapping['category_list']=='Enterprise 2.na']='Enterprise 2.0'

#renaming columns of mapping file and dropping extra 'value' column which contains only 1
mapping.rename(columns={'category_list':'primary_cat','variable':'sector'},inplace=True)
mapping=mapping.drop('value',axis=1)

In [248]:
#merging master_frame and mapping on primary_catcolumn with left join
master_frame=master_frame.merge(mapping,on='primary_cat',how='left')
master_frame.info

#exporting updated master_frame for tableau input
master_frame.to_csv('C:/Users/ranji/Desktop/ics/mapped_master.txt', sep="\t")

In [249]:
#Create D_group which is a sorted series object which returns 3 highest funded english speaking countries 
D_group=master_frame[master_frame['country_code'].isin(engCountries)].groupby('country_code').raised_amount_usd.sum().sort_values(ascending=False).head(3)

#DList is a list object used to store D_group as a list for filtering master_frame
DList=D_group.index.get_level_values('country_code').tolist()

#D is a dataframe which filters master_frame for fund amount between 5 and 15 million and the countries in D_group 
D=master_frame[master_frame["country_code"].isin(DList)]
D=D.loc[(D['raised_amount_usd'] >= 5000000) & (D['raised_amount_usd'] <= 15000000)]

#D1,D2 and D3 are dataframes for 3 highest funded countries
D1=D.loc[(D['country_code']==D_group.index[0])]
D2=D.loc[(D['country_code']==D_group.index[1])]
D3=D.loc[(D['country_code']==D_group.index[2])]

#use transform function on D1, D2 and D3 to create new columns investment_count and investment_amount which gives
#sector wise total investment counts and sum of investment amounts for each of the 3 dataframes
D1['investment_count']=D1.groupby(['sector'])['raised_amount_usd'].transform('count')
D1['investment_amount']=D1.groupby(['sector'])['raised_amount_usd'].transform('sum')
D2['investment_count']=D2.groupby(['sector'])['raised_amount_usd'].transform('count')
D2['investment_amount']=D2.groupby(['sector'])['raised_amount_usd'].transform('sum')
D3['investment_count']=D3.groupby(['sector'])['raised_amount_usd'].transform('count')
D3['investment_amount']=D3.groupby(['sector'])['raised_amount_usd'].transform('sum')

print(D1.info())
print(D2.info())
print(D3.info())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:14]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:15]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:16]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydat

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12150 entries, 5 to 48108
Data columns (total 14 columns):
company_permalink          12150 non-null object
funding_round_permalink    12150 non-null object
funding_round_type         12150 non-null object
funding_round_code         8671 non-null object
funded_at                  12150 non-null object
raised_amount_usd          12150 non-null float64
name                       12150 non-null object
category_list              12150 non-null object
status                     12150 non-null object
country_code               12150 non-null object
primary_cat                12150 non-null object
sector                     12012 non-null object
investment_count           12012 non-null float64
investment_amount          12012 non-null float64
dtypes: float64(3), object(11)
memory usage: 1.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 628 entries, 153 to 48101
Data columns (total 14 columns):
company_permalink          628 non-n

In [250]:
#Analysis results for D1 which is USA
#total investment count
print(D1.raised_amount_usd.count())

#total investment sum
print(D1.raised_amount_usd.sum())
inv_count_group=D1.groupby('sector').raised_amount_usd.count().sort_values(ascending=False)
inv_amt_group=D1.groupby('sector').raised_amount_usd.sum().sort_values(ascending=False)

#top 3 sectors which respect to investment count
print(inv_count_group.head(3))

#highest funded company in topmost sector according to investment count
print(D1[D1['sector']==inv_count_group.index[0]].groupby('name').raised_amount_usd.sum().sort_values(ascending=False).head(1))

#highest funded company in sector with 2nd highest investment count
print(D1[D1['sector']==inv_count_group.index[1]].groupby('name').raised_amount_usd.sum().sort_values(ascending=False).head(1))

#print(inv_amt_group)


12150
108531347515.0
sector
Others                                     2950
Social, Finance, Analytics, Advertising    2714
Cleantech / Semiconductors                 2300
Name: raised_amount_usd, dtype: int64
name
Virtustream    64300000.0
Name: raised_amount_usd, dtype: float64
name
SST Inc. (Formerly ShotSpotter)    67933006.0
Name: raised_amount_usd, dtype: float64


In [253]:
#Analysis results for D2 which is GBR
#total investment count
print(D2.raised_amount_usd.count())
#total investment sum
print(D2.raised_amount_usd.sum())
inv_count_group2=D2.groupby('sector').raised_amount_usd.count().sort_values(ascending=False)
inv_amt_group2=D2.groupby('sector').raised_amount_usd.sum().sort_values(ascending=False)
#top 3 sectors which respect to investment count
print(inv_count_group2.head(3))
#highest funded company in topmost sector according to investment count
print(D2[D2['sector']==inv_count_group2.index[0]].groupby('name').raised_amount_usd.sum().sort_values(ascending=False).head(1))
#highest funded company in sector with 2nd highest investment count
print(D2[D2['sector']==inv_count_group2.index[1]].groupby('name').raised_amount_usd.sum().sort_values(ascending=False).head(1))

628
5436843539.0
sector
Others                                     147
Social, Finance, Analytics, Advertising    133
Cleantech / Semiconductors                 128
Name: raised_amount_usd, dtype: int64
name
Electric Cloud    37000000.0
Name: raised_amount_usd, dtype: float64
name
Celltick Technologies    37500000.0
Name: raised_amount_usd, dtype: float64


In [254]:
#Analysis results for D3 which is IND
#total investment count
print(D3.raised_amount_usd.count())

#total investment sum
print(D3.raised_amount_usd.sum())
inv_count_group3=D3.groupby('sector').raised_amount_usd.count().sort_values(ascending=False)
inv_amt_group3=D3.groupby('sector').raised_amount_usd.sum().sort_values(ascending=False)

#top 3 sectors which respect to investment count
print(inv_count_group3.head(3)).

#highest funded company in topmost sector according to investment count
print(D3[D3['sector']==inv_count_group.index[0]].groupby('name').raised_amount_usd.sum().sort_values(ascending=False).head(1))

#highest funded company in sector with 2nd highest investment count
print(D3[D3['sector']==inv_count_group.index[1]].groupby('name').raised_amount_usd.sum().sort_values(ascending=False).head(1))

330
2976543602.0
sector
Others                                     110
Social, Finance, Analytics, Advertising     60
News, Search and Messaging                  52
Name: raised_amount_usd, dtype: int64
name
FirstCry.com    39000000.0
Name: raised_amount_usd, dtype: float64
name
Manthan Systems    50700000.0
Name: raised_amount_usd, dtype: float64
