# Startup Funding

This dataset contains funding information for Indian startups from January 2015 to August 2017.<br><br>

Feature Details:<br>
SNo - Serial number.<br>
Date - Date of funding in format DD/MM/YYYY.<br>
StartupName - Name of the startup which got funded.<br>
IndustryVertical - Industry to which the startup belongs.<br>
SubVertical - Sub-category of the industry type.<br>
CityLocation - City which the startup is based out of.<br>
InvestorsName - Name of the investors involved in the funding round.<br>
InvestmentType - Either Private Equity or Seed Funding.<br>
AmountInUSD - Funding Amount in USD.<br>
Remarks - Other information, if any.<br>
 
Insights:<br>
1. What types of startups have been funded in the last few years?<br>
2. Who are the important investors?<br>
3. What are the hot fields that get a lot of funding these days?<br>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 

# Load the raw funding data; `startup` keeps the untouched frame and `df` is
# the working copy.
startup = pd.read_csv("startup_funding.csv")
df = startup.copy()

# Strip every space out of the column labels.
columns_wo_spaces = [label.replace(" ", "") for label in df.columns]
df.columns = columns_wo_spaces

# With the spaces gone the date header reads 'Datedd/mm/yyyy'; shorten it.
df = df.rename(columns={'Datedd/mm/yyyy': 'Date'})

In [None]:
# Reload the CSV from disk and work on a copy. This rebinds `df`, so edits made
# to any earlier `df` object are not carried over.
startup = pd.read_csv("startup_funding.csv")
df = startup.copy()
# Preview the first rows.
df.head()

#### Rename 'Date dd/mm/yyyy' column to an easier name

In [None]:
# Give the verbose 'Date dd/mm/yyyy' header the shorter label 'Date'.
df = df.rename(columns={'Date dd/mm/yyyy': 'Date'})

#### Changing Datatype of Date column to datetime type

In [None]:
# Parse the Date strings into datetime values; dayfirst=True makes ambiguous
# values such as 05/01/2015 resolve as day/month/year.
# NOTE(review): no `errors=` policy is given, so this presumably expects every
# entry to be a well-formed date -- confirm against the raw data.
df["Date"] = pd.to_datetime(df["Date"], dayfirst = True)

In [None]:
# Display the dtype of every column (e.g. to check the Date column's type).
df.dtypes

## 1 Number of Fundings

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 

startup = pd.read_csv("startup_funding.csv")
df = startup.copy()

# Shorten the verbose date header to 'Date'.
df = df.rename(columns={'Date dd/mm/yyyy': 'Date'})

# The year is taken as the last four characters of each date string.
years = np.array([date_text[-4:] for date_text in df.Date])

# Repair the malformed year suffix '/015' so it counts as 2015.
years[years == '/015'] = '2015'

# Count the fundings per year and print one "year count" line for each.
year, count = np.unique(years, return_counts=True)
for year_label, n_fundings in zip(year, count):
    print(year_label, n_fundings)

## 2 Top Indian Cities

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 

startup = pd.read_csv("startup_funding.csv")
df = startup.copy()

# Literal text that prefixes some city values: the escaped bytes of a UTF-8
# non-breaking space, stored in the CSV as plain characters.
JUNK_PREFIX = '\\\\xc2\\\\xa0'

# Spelling variants folded onto one canonical name (keys are title-cased).
CITY_ALIASES = {
    'Bengaluru': 'Bangalore',
    'Gurugram': 'Gurgaon',
    'Delhi': 'New Delhi',
}

# Number of cities shown in the ranking and in the pie chart.
TOP_N = 10


def normalize_city(raw_city):
    """Return the canonical city name for one raw CityLocation value.

    Steps, in order:
      1. Missing values are returned unchanged so they stay out of the counts.
      2. 'Indian_city / Foreign_city' keeps only the part before the slash.
      3. The literal junk prefix is removed.
      4. The name is title-cased ('bangalore' -> 'Bangalore').
      5. Known aliases are mapped to their canonical spelling.

    Parameters:
        raw_city: a cell of the CityLocation column (string or NaN).

    Returns:
        The cleaned city name, or the input itself when it is missing.
    """
    if pd.isna(raw_city):
        return raw_city
    city = str(raw_city).split('/')[0].strip()
    if city.startswith(JUNK_PREFIX):
        city = city[len(JUNK_PREFIX):].strip()
    # Title-case before the alias lookup so the cleaned value, not the raw
    # one, is what gets matched.
    city = city.title()
    return CITY_ALIASES.get(city, city)


# Assign the result back: the cleaned names must replace the raw column.
df['CityLocation'] = df['CityLocation'].apply(normalize_city)

# Compute the ranking once; value_counts() already sorts by frequency.
top_cities = df['CityLocation'].value_counts().head(TOP_N)
for city, n_startups in top_cities.items():
    print(city, n_startups)

#store top cities and number of startups there
cities = top_cities.index.tolist()
startups = top_cities.tolist()
count = len(cities)

#plotting charts
total_startups = sum(startups)
# Size `explode` from the data so the chart still works with fewer than
# TOP_N distinct cities.
explode = np.full(len(startups), 0.1)
plt.pie(
    startups,
    labels = cities,
    autopct=lambda p: '{:.2f}%({:.0f})'.format(p, (p/100)*total_startups),
    explode = explode,
    radius=3,
)
plt.axis("equal")
plt.show()

## Funding Amount

In [None]:
# AmountInUSD holds comma-formatted text, so convert it to numbers before
# aggregating; summing the raw strings would not add the amounts.
# Unparseable or missing entries become NaN and are skipped by sum().
amount_usd = pd.to_numeric(
    df['AmountInUSD'].astype(str).str.replace(",", "", regex=False),
    errors='coerce',
)
df2 = amount_usd.groupby(df['CityLocation']).sum()
# Call head() -- the bare attribute `df2.head` does nothing -- and print it,
# because only the last expression of a cell is displayed automatically.
print(df2.head())
df.dtypes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import time

start = time.time()

startup = pd.read_csv("startup_funding.csv")
df = startup.copy()

# Literal text that prefixes some city values: the escaped bytes of a UTF-8
# non-breaking space, stored in the CSV as plain characters.
JUNK_PREFIX = '\\\\xc2\\\\xa0'

# --- Clean the city names (vectorised; missing cities stay NaN) -------------
# 'Indian_city / Foreign_city' -> keep the part before the slash.
city_clean = df['CityLocation'].str.split('/').str[0].str.strip()
# Drop the junk prefix where present.
has_junk = city_clean.str.startswith(JUNK_PREFIX, na=False)
city_clean = city_clean.where(
    ~has_junk, city_clean.str[len(JUNK_PREFIX):].str.strip()
)
# Title-case, then fold spelling variants onto one canonical name.
city_clean = city_clean.str.title().replace(
    {'Bengaluru': 'Bangalore', 'Gurugram': 'Gurgaon', 'Delhi': 'New Delhi'}
)
df['CityLocation'] = city_clean

# --- Parse the amounts -------------------------------------------------------
# Only the amount column is filled: a missing amount counts as 0, while a
# missing city must stay missing so it never becomes a fake city.
df['AmountInUSD'] = pd.to_numeric(
    df['AmountInUSD'].fillna('0').astype(str).str.replace(",", "", regex=False)
)

# --- Total funding per city --------------------------------------------------
# groupby leaves out rows whose city is NaN, so no placeholder row has to be
# removed by position afterwards.
df2 = (
    df.groupby('CityLocation')['AmountInUSD']
    .sum()
    .sort_values(ascending=False)
    .rename_axis('City')
    .reset_index(name='TotalFunding')
)

# Percentages are relative to the combined funding of the top 10 cities.
top_funded = df2.head(10)
total_funding = top_funded.TotalFunding.sum()

for row in top_funded.itertuples(index=False):
    print(row.City, '%.2f' % ((row.TotalFunding / total_funding) * 100))

end = time.time()

# Elapsed wall-clock time in milliseconds.
print((end-start)*1000)

### CN Solution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import time

start = time.time()

df_start = pd.read_csv('startup_funding.csv', encoding='utf-8')

# Drop the rows without a city on the frame itself. Calling dropna(inplace=True)
# on the selected column leaves the frame unchanged, and the later str()
# conversion would then turn the missing values into a city called 'nan'.
df_start = df_start.dropna(subset=['CityLocation']).copy()

# Keep the part before '/', trim whitespace and unify the spelling variants.
# The result is assigned back because in-place edits on a selected column are
# not guaranteed to reach the parent frame.
df_start['CityLocation'] = (
    df_start['CityLocation']
    .astype(str)
    .str.split('/').str[0].str.strip()
    .replace({"Delhi": "New Delhi", "bangalore": "Bangalore"})
)

## Converting "AmountInUSD" into numeric format
# Missing amounts count as 0; thousands separators are removed before parsing.
df_start["AmountInUSD"] = pd.to_numeric(
    df_start["AmountInUSD"].fillna('0').astype(str).str.replace(",", "", regex=False)
).astype(float)

# Ten best-funded cities, largest first.
city_amount = (
    df_start.groupby('CityLocation')['AmountInUSD']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

city = city_amount.index
amountCity = city_amount.values

# Share of each city within the top-10 total, in percent.
perAmount = np.true_divide(amountCity, amountCity.sum())*100

for i in range(len(city)):
    print(city[i], format(perAmount[i], '.2f'))

end = time.time()

# Elapsed wall-clock time in milliseconds.
print((end-start)*1000)

### Investment Type

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('startup_funding.csv',encoding='utf-8')

# Title-case the labels, then restore the space that the run-together
# spellings lose ("PrivateEquity" -> "Privateequity" -> "Private Equity").
# The result is assigned back instead of using inplace=True on a selected
# column, which is not guaranteed to modify the parent frame.
df['InvestmentType'] = df['InvestmentType'].str.title().replace(
    {"Privateequity": "Private Equity", "Seedfunding": "Seed Funding"}
)

# Missing amounts count as 0; thousands separators are removed before parsing.
df['AmountInUSD'] = pd.to_numeric(
    df['AmountInUSD'].fillna('0').astype(str).str.replace(",", "", regex=False)
)

# Total funding per investment type, largest first.
types_df = df.groupby('InvestmentType')['AmountInUSD'].sum().sort_values(ascending = False)

types = types_df.index
amount = types_df.values

# Share of each type in the overall funding, in percent.
pcts = np.true_divide(amount, amount.sum())*100

for investment_type, pct in zip(types, pcts):
    print(investment_type, '%.2f' % pct)

In [None]:
# Bar chart of total funding per investment type, drawn through the explicit
# figure/axes handles.
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(types, amount, color='red')
# Rotate the category labels and enlarge both sets of tick labels.
plt.setp(ax.get_xticklabels(), rotation=45, size=16)
plt.setp(ax.get_yticklabels(), size=16)
ax.set_xlabel('Type--->', size=16)
ax.set_ylabel('Funding--->', size=16)
plt.show()

In [None]:
# Preview the first rows of the current `df`.
df.head()

## Top Industries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("startup_funding.csv")

# Unify the spelling variants of "Ecommerce". The result is assigned back
# instead of using inplace=True on a selected column, which is not guaranteed
# to modify the parent frame.
df['IndustryVertical'] = df['IndustryVertical'].replace(
    ["eCommerce", "ECommerce"], "Ecommerce"
)

# Missing amounts count as 0; thousands separators are removed before parsing.
df['AmountInUSD'] = pd.to_numeric(
    df['AmountInUSD'].fillna('0').astype(str).str.replace(",", "", regex=False)
)

# Five best-funded industry verticals, largest first.
top_industries = (
    df.groupby("IndustryVertical")["AmountInUSD"]
    .sum()
    .sort_values(ascending = False)
    .head(5)
)

industries = top_industries.index
funding = np.array(top_industries.values)

# Share of each industry within the top-5 total, rounded to two decimals.
pct = np.round(np.true_divide(funding, funding.sum())*100, 2)

# Print the computed shares as they are; no manual +/-0.01 adjustment is
# applied to individual rows.
for industry, share in zip(industries, pct):
    print(industry, share)

## Top Startups

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

df = pd.read_csv("startup_funding.csv")

# Spelling variants of the same company mapped to one canonical name
# (exact, whole-value matches only).
STARTUP_ALIASES = {
    "Flipkart.com": "Flipkart",
    "Ola Cabs": "Ola",
    "Olacabs": "Ola",
    "paytm": "Paytm",
    "Paytm Marketplace": "Paytm",
    "Pay tm": "Paytm",
    "Oyo Rooms": "Oyo",
    "OyoRooms": "Oyo",
    "Oyorooms": "Oyo",
    "OYO Rooms": "Oyo",
}

# Assign the result back instead of using inplace=True on a selected column,
# which is not guaranteed to modify the parent frame.
df['StartupName'] = df['StartupName'].replace(STARTUP_ALIASES)

# Missing amounts count as 0; thousands separators are removed before parsing.
df['AmountInUSD'] = pd.to_numeric(
    df['AmountInUSD'].fillna('0').astype(str).str.replace(",", "", regex=False)
)

# Five startups with the highest total funding, largest first.
top = df.groupby("StartupName")["AmountInUSD"].sum().sort_values(ascending = False).head(5)

for name in top.index:
    print(name)

Paytm
Flipkart
Ola
Snapdeal
Oyo


## Funding rounds

In [158]:
import pandas as pd
import time

start = time.time()

df = pd.read_csv("startup_funding.csv")

# (canonical name, substrings that identify it), checked in this order.
BRAND_MARKERS = (
    ("Flipkart", ("Flipkart",)),
    ("Ola", ("Ola",)),
    ("Paytm", ("Paytm",)),
    ("Oyo", ("Oyo", "OYO ")),
)


def canonical_startup(raw_name):
    """Return the first brand whose marker occurs in raw_name, else raw_name."""
    for brand, markers in BRAND_MARKERS:
        if any(marker in raw_name for marker in markers):
            return brand
    return raw_name


df.StartupName = df.StartupName.map(canonical_startup)

# Each row is one funding round, so the value counts are the rounds per startup.
rounds = df.StartupName.value_counts().head(5)

name = rounds.index
number = rounds.values

for startup_name, n_rounds in zip(name, number):
    print(startup_name, n_rounds)

end = time.time()

# Elapsed wall-clock time in milliseconds.
print((end-start)*1000)

Ola 9
Swiggy 7
UrbanClap 6
Paytm 6
Oyo 6
19.351482391357422


In [171]:
import pandas as pd
import time

start = time.time()

df = pd.read_csv("startup_funding.csv")

# Spelling variants of the same company mapped to one canonical name
# (exact, whole-value matches only).
STARTUP_ALIASES = {
    "Flipkart.com": "Flipkart",
    "Ola Cabs": "Ola",
    "Olacabs": "Ola",
    "paytm": "Paytm",
    "Paytm Marketplace": "Paytm",
    "Pay tm": "Paytm",
    "Oyo Rooms": "Oyo",
    "OyoRooms": "Oyo",
    "Oyorooms": "Oyo",
    "OYO Rooms": "Oyo",
}

# Assign the result back instead of using inplace=True on a selected column,
# which is not guaranteed to modify the parent frame.
df['StartupName'] = df['StartupName'].replace(STARTUP_ALIASES)

# Each row is one funding round, so the value counts are the rounds per startup.
rounds = df.StartupName.value_counts().head(5)

name = rounds.index
number = rounds.values

for i in range(len(name)):
    print(name[i], number[i])

end = time.time()

# Elapsed wall-clock time in milliseconds.
print((end-start)*1000)

Ola 9
Swiggy 7
UrbanClap 6
Paytm 6
Oyo 6
25.26068687438965


## Top Investor

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("startup_funding.csv")


def investor_list(inv):
    """Split one InvestorsName cell into a list of clean investor names.

    Parameters:
        inv: a cell of the InvestorsName column -- a comma-separated string of
            investor names, or NaN when no investor is recorded.

    Returns:
        A list of names with surrounding whitespace removed. Missing cells and
        empty fragments (for example after a trailing comma) yield no entries,
        so they are never counted as an investor.
    """
    if pd.isna(inv):
        return []
    # strip() on both sides: a trailing space would otherwise make the same
    # investor appear under two different spellings.
    stripped = (name.strip() for name in str(inv).split(","))
    return [name for name in stripped if name]


# Gather every name first and build the frame once; adding rows one at a time
# to a DataFrame makes the work grow much faster than the number of names.
all_names = [name for cell in df.InvestorsName for name in investor_list(cell)]
investors = pd.DataFrame({"Name": all_names})

# value_counts() is already sorted by frequency, so the first entry is the
# investor that appears in the most funding rounds.
investor_count = investors["Name"].value_counts().head(1)

name = investor_count.index
count = investor_count.values

for investor, n_rounds in zip(name, count):
    print(investor, n_rounds)

Sequoia Capital 64


In [50]:
# Inspect the collected investor names as a NumPy array (one row per name).
investors.values

array([['Kae Capital'],
       ['Triton Investment Advisors'],
       ['Kashyap Deorah'],
       ...,
       ['GrowX Ventures.'],
       ['MakeMyTrip'],
       ['UK based Group of Angel Investors']], dtype=object)

In [48]:
# Only the last expression of a cell is displayed, so this head() call
# produces no visible output here.
df.head()
# Display the row index of `df`.
df.index

RangeIndex(start=0, stop=2372, step=1)

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("startup_funding.csv")

# Flatten the comma-separated InvestorsName cells into one list of names,
# trimming the whitespace around every piece.
investors = [
    piece.strip()
    for names in df.InvestorsName
    for piece in str(names).split(",")
]

# Most frequent name and the number of times it occurs.
inv = pd.Series(investors, name="Name").value_counts().head(1)

name = inv.index
count = inv.values

for investor, n_rounds in zip(name, count):
    print(investor, n_rounds)

Sequoia Capital 64
