In [50]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import matplotlib.pyplot as plt
import os
import collections
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.offline as py

In [51]:
# Switch plotly's offline module into notebook mode so the py.iplot() calls
# below render inside the notebook. With connected=True plotly.js is expected
# to be loaded from the web rather than embedded -- confirm against the
# installed plotly version.
py.init_notebook_mode(connected=True)
# NOTE(review): this changes the working directory to the directory that is
# already current, so it has no effect on where 'startup_funding.csv' is read from.
os.chdir(os.getcwd())

In [52]:
# Read the raw funding records from the CSV that sits next to the notebook
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='startup_funding.csv')
df.head(5)

Unnamed: 0,SNo,Date,StartupName,IndustryVertical,SubVertical,CityLocation,InvestorsName,InvestmentType,AmountInUSD,Remarks
0,0,01/08/2017,TouchKin,Technology,Predictive Care Platform,Bangalore,Kae Capital,Private Equity,1300000.0,
1,1,02/08/2017,Ethinos,Technology,Digital Marketing Agency,Mumbai,Triton Investment Advisors,Private Equity,,
2,2,02/08/2017,Leverage Edu,Consumer Internet,Online platform for Higher Education Services,New Delhi,"Kashyap Deorah, Anand Sankeshwar, Deepak Jain,...",Seed Funding,,
3,3,02/08/2017,Zepo,Consumer Internet,DIY Ecommerce platform,Mumbai,"Kunal Shah, LetsVenture, Anupam Mittal, Hetal ...",Seed Funding,500000.0,
4,4,02/08/2017,Click2Clinic,Consumer Internet,healthcare service aggregator,Hyderabad,"Narottam Thudi, Shireesh Palle",Seed Funding,850000.0,


In [53]:
df.shape  # (number of rows, number of columns) of the loaded DataFrame

(2372, 10)

In [54]:
df.isnull().sum()  # Number of null values in each column of the DataFrame.

SNo                    0
Date                   0
StartupName            0
IndustryVertical     171
SubVertical          936
CityLocation         179
InvestorsName          8
InvestmentType         1
AmountInUSD          847
Remarks             1953
dtype: int64

In [55]:
# Remarks and SubVertical are null in a large share of the rows (see the
# per-column null counts above), so both columns are discarded.
df = df.drop(['SubVertical', 'Remarks'], axis=1)

In [56]:
# Re-count the nulls per column now that the two sparsest columns are gone.
df.isnull().sum()

SNo                   0
Date                  0
StartupName           0
IndustryVertical    171
CityLocation        179
InvestorsName         8
InvestmentType        1
AmountInUSD         847
dtype: int64

In [57]:
# Replace nulls in the descriptive columns with an explicit placeholder so the
# rows stay in the analysis. A missing city is labelled with the country-level
# value 'India'; missing investors / verticals get an "unknown" marker.
df = df.fillna({
    'CityLocation': 'India',
    'InvestorsName': 'Unknown',
    'IndustryVertical': 'Unknown/Others',
})

In [58]:
# Normalise AmountInUSD to float, impute the gaps, then drop incomplete rows.

def _without_commas(raw_amount):
    """Return the text form of ``raw_amount`` with every ',' removed."""
    return str(raw_amount).replace(',', '')

amount_usd = df['AmountInUSD'].apply(_without_commas).astype('float')
# Missing amounts are replaced by the mean of the known amounts.
# NOTE(review): the per-city, per-startup and per-year charts later in this
# notebook sum this column, so the imputed values are part of those totals.
df['AmountInUSD'] = amount_usd.fillna(amount_usd.mean())
# Drop every row that still has a null in any remaining column.
df = df.dropna()

In [59]:
#Date formatting
def format_Investdate(d):
    """Repair known separator typos in a raw ``dd/mm/yyyy`` date string.

    Two kinds of typo are handled:

    * a '.' used in place of a '/' (e.g. ``'12/05.2015'``)
    * a doubled separator '//' (e.g. ``'22/01//2015'``)

    Both repairs are always applied, in that order, so a string that
    contains both typos -- or a '.' directly next to a '/', which turns
    into '//' after the first repair -- still comes out clean.

    Parameters
    ----------
    d : str
        Raw date text.

    Returns
    -------
    str
        The date text with '.' replaced by '/' and each '//' collapsed to '/'.
    """
    # str.replace is a no-op when the pattern is absent, so no membership
    # checks are needed; running both unconditionally avoids the earlier
    # if/elif behaviour of fixing only one kind of typo per string.
    d = d.replace('.', '/')
    d = d.replace('//', '/')
    return d

# Clean every raw date string first, then parse the whole column as
# day/month/year timestamps.
cleaned_dates = df['Date'].apply(format_Investdate)
df['Date'] = pd.to_datetime(cleaned_dates, format='%d/%m/%Y')

In [60]:
# List the distinct InvestmentType labels; the output below shows that some
# categories appear under more than one spelling / capitalisation.
df['InvestmentType'].unique()

array(['Private Equity', 'Seed Funding', 'Debt Funding', 'SeedFunding',
       'PrivateEquity', 'Crowd funding', 'Crowd Funding'], dtype=object)

In [61]:
# Variant spellings and the canonical label each one is rewritten to.
# The rewrites are applied in the listed order.
_INVESTMENT_TYPE_REWRITES = (
    ('PrivateEquity', 'Private Equity'),
    ('SeedFunding', 'Seed Funding'),
    ('Crowd funding', 'Crowd Funding'),
)


def _canonical_investment_type(label):
    """Apply every rewrite in ``_INVESTMENT_TYPE_REWRITES`` to ``label``."""
    for variant, canonical in _INVESTMENT_TYPE_REWRITES:
        label = label.replace(variant, canonical)
    return label


df['InvestmentType'] = df['InvestmentType'].apply(_canonical_investment_type)

In [62]:
# Re-list the distinct labels to confirm that only the canonical spellings remain.
df['InvestmentType'].unique()

array(['Private Equity', 'Seed Funding', 'Debt Funding', 'Crowd Funding'],
      dtype=object)

In [63]:
def _unify_ecommerce_spelling(vertical):
    """Rewrite each 'ECommerce' substring of ``vertical`` as 'eCommerce'."""
    return vertical.replace('ECommerce', 'eCommerce')


# Use a single spelling for the e-commerce vertical so it is counted as one group.
df['IndustryVertical'] = df['IndustryVertical'].apply(_unify_ecommerce_spelling)

In [64]:
# Name variants and the label each one is rewritten to, applied in order.
# 'Ola Cabs' is first turned into 'OlaCabs', which the second entry then
# shortens to 'Ola'.
_STARTUP_NAME_REWRITES = (
    ('Ola Cabs', 'OlaCabs'),
    ('OlaCabs', 'Ola'),
    ('Olacabs', 'Ola'),
    ('Flipkart.com', 'Flipkart'),
    ('Paytm Marketplace', 'Paytm'),
)


def _canonical_startup_name(name):
    """Apply every rewrite in ``_STARTUP_NAME_REWRITES`` to ``name``."""
    for variant, canonical in _STARTUP_NAME_REWRITES:
        name = name.replace(variant, canonical)
    return name


df['StartupName'] = df['StartupName'].apply(_canonical_startup_name)

### Plotting Investment Type against number of investments

In [65]:
# Count how many funding records fall under each investment type.
type_counts = collections.Counter(df.InvestmentType.values)
type_labels = list(type_counts.keys())
type_totals = list(type_counts.values())

# One bar per investment type; bar colour encodes the count.
bar_marker = dict(
    color=type_totals,
    colorscale='Portland',
    showscale=True,
    reversescale=False,
)
type_bar = go.Bar(
    x=type_labels,
    y=type_totals,
    width=0.5,
    marker=bar_marker,
    opacity=0.6,
)

layout = go.Layout(
    title='Investment Type',
    hovermode='closest',
    yaxis=dict(title='Total', ticklen=1, gridwidth=0.5),
    showlegend=False,
)

fig = go.Figure(data=[type_bar], layout=layout)
py.iplot(fig)

### Plotting Industry Vertical against Investment Count

In [66]:
# Number of funding records per industry vertical, keeping the ten largest.
# Rows whose vertical was missing carry the 'Unknown/Others' placeholder
# assigned earlier in the notebook and are counted as their own group.
vertical_column = df['IndustryVertical']
dt_amo = vertical_column.groupby([vertical_column]).count().nlargest(10)

vertical_labels = list(dt_amo.index)
vertical_totals = list(dt_amo.values)

# One bar per vertical; bar colour encodes the count.
vertical_bar = go.Bar(
    x=vertical_labels,
    y=vertical_totals,
    width=0.5,
    marker=dict(
        color=vertical_totals,
        colorscale='Portland',
        showscale=True,
    ),
)

layout = go.Layout(
    title='Industry Vertical',
    hovermode='closest',
    yaxis=dict(title='Number of Investment'),
)

fig = go.Figure(data=[vertical_bar], layout=layout)
py.iplot(fig)


### Plotting the top 10 cities by number of fundings

In [67]:
# Number of funding records per city, keeping the ten largest.
# Rows whose city was missing carry the 'India' placeholder assigned earlier
# in the notebook and are counted as if 'India' were a city.
city_column = df['CityLocation']
dt_loc = city_column.groupby(city_column).count().nlargest(10)

city_labels = list(dt_loc.index)
city_totals = list(dt_loc.values)

# One bar per city; bar colour encodes the count.
city_bar = go.Bar(
    x=city_labels,
    y=city_totals,
    width=0.5,
    marker=dict(color=city_totals, colorscale='Portland'),
)

layout = go.Layout(
    title='Cities getting maximum number of funding',
    hovermode='closest',
    yaxis=dict(title='Number of Investment'),
)

fig = go.Figure(data=[city_bar], layout=layout)
py.iplot(fig)


### Top 10 cities by total funding amount

In [68]:
# Total AmountInUSD per city, keeping the ten largest totals.
# NOTE(review): missing amounts were filled with the column mean earlier in
# the notebook, so these totals include imputed values; missing cities carry
# the 'India' placeholder and appear as their own group.
dt_cit = df['AmountInUSD'].groupby([df.CityLocation]).agg('sum').nlargest(10)

keytype = list(dt_cit.keys())          # city names (x axis)
populationtype = list(dt_cit.values)   # summed funding per city (y axis)

# One bar per city; bar colour encodes the summed amount.
dataa = [go.Bar(
    y = populationtype,
    x = keytype,
    width = 0.5,
    marker = dict(
        color = populationtype,
        colorscale = 'Portland',
    )
)]

# The bars show summed funding amounts, not a count of investments, so the
# y axis is labelled as an amount in USD.
layout = go.Layout(
    title = 'Cities getting maximum funding amount',
    hovermode = 'closest',
    yaxis = dict(
        title = 'Total Funding Amount (USD)'))

fig = go.Figure(data=dataa, layout=layout)
py.iplot(fig)

### Top 10 Companies by funding

In [69]:
# Total AmountInUSD per startup, keeping the ten largest totals.
# NOTE(review): missing amounts were filled with the column mean earlier in
# the notebook, so these totals include imputed values.
dt_start = df['AmountInUSD'].groupby([df.StartupName]).agg('sum').nlargest(10)

keytype = list(dt_start.keys())          # startup names (x axis)
populationtype = list(dt_start.values)   # summed funding per startup (y axis)

# One bar per startup; bar colour encodes the summed amount.
dataa = [go.Bar(
    y = populationtype,
    x = keytype,
    width = 0.5,
    marker = dict(
        color = populationtype,
        colorscale = 'Portland',
    )
)]

# The bars show summed funding amounts, not a count of investments, so the
# y axis is labelled as an amount in USD.
layout = go.Layout(
    title = 'Top 10 Startup by funding Amount',
    hovermode = 'closest',
    yaxis = dict(
        title = 'Total Funding Amount (USD)'))

fig = go.Figure(data=dataa, layout=layout)
py.iplot(fig)

### Funding on Year Basis

In [70]:
# Total AmountInUSD per calendar year of the investment date.
# NOTE(review): missing amounts were filled with the column mean earlier in
# the notebook, so these totals include imputed values.
dt_year = df['AmountInUSD'].groupby(pd.DatetimeIndex(df['Date']).year).agg('sum')
keytype = list(dt_year.keys())          # years (x axis)
populationtype = list(dt_year.values)   # summed funding per year (y axis)

# One bar per year; bar colour encodes the summed amount.
dataa = [go.Bar(
    y = populationtype,
    x = keytype,
    width = 0.5,
    marker = dict(
        color = populationtype,
        colorscale = 'Portland',
    )
)]

# The bars show summed funding amounts, not a count of investments, so the
# y axis is labelled as an amount in USD.
layout = go.Layout(
    title = 'funding Amount per year basis',
    hovermode = 'closest',
    yaxis = dict(
        title = 'Total Funding Amount (USD)'))

fig = go.Figure(data=dataa, layout=layout)
py.iplot(fig)