In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the startup funding CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    '/kaggle/input/indian-startup-funding-jan-2015-april-2021/indian_startup_funding.csv'
)

# 1. Preprocessing and Grasping Whole Data

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Fill missing values column by column so later string operations and
# group-bys do not drop or choke on NaN.
# NOTE(review): rows without a date get '2015-01-02', so they are counted
# under 2015 in the per-year charts -- confirm this is intended.
df = df.fillna({
    'Date': '2015-01-02',
    'Industry': 'Others',
    'Sub-vertical': 'Others',
    'Location': 'Others',
    'Investors': 'Others',
    # The key must match the real column name ('Investment Type', as used by
    # the later cells). fillna() silently ignores keys that match no column,
    # so a misspelt key leaves the NaNs in place.
    'Investment Type': 'Others',
})

In [None]:
# Show the dtype pandas holds for each column after the missing-value fill.
df.dtypes

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# 1) Time series analysis 2015 to 2021

In [None]:
# Parse the date strings, derive the calendar year of each deal, and chart
# the total funding amount per year.
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
amount_by_year = df.groupby('Year')['Amount in USD'].sum()
amount_by_year.plot.bar(title='Total Investment Amount by Year')

In [None]:
# Number of deals per year; count() tallies only rows with a non-null amount.
deals_by_year = df.groupby('Year')['Amount in USD'].count()
deals_by_year.plot.bar(title='Total Investment Number by Year')

In [None]:
# Mean funding amount per deal for each year.
mean_amount_by_year = df.groupby('Year')['Amount in USD'].mean()
mean_amount_by_year.plot.bar(title='Investment Average Amount by Year')

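# The number of investments decreased from 2015 to 2021, but the average investment amount is highest in 2021; the next-highest average is in 2019. (Note: per the dataset name, 2021 covers only January–April, so its deal count is not directly comparable with full years.)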

In [None]:
# Lower-case the startup names, then strip every space and every hyphen.
# Note that replace() is applied to the whole frame, so it affects every
# string column, not only 'Startup Name'.
df['Startup Name'] = df['Startup Name'].str.lower()
df = (
    df
    .replace(' ', '', regex=True)
    .replace('-', '', regex=True)
)

In [None]:
# Lower-case the remaining categorical text columns so that values differing
# only in capitalisation fall into the same group.
for text_col in ['Industry', 'Sub-vertical', 'Location', 'Investors', 'Investment Type']:
    df[text_col] = df[text_col].str.lower()

# 2) Scatterplotting by 'Year','Amount in USD' and 'Industry'

In [None]:
# Deal amount by year: one point per deal, coloured by industry and sized by
# the amount invested.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(
    data=df,
    x='Year',
    y='Amount in USD',
    hue='Industry',
    size='Amount in USD',
    sizes=(50, 500),
    ax=ax,
)
ax.tick_params(labelsize=10)
ax.tick_params(axis='x', labelrotation=90)
# Build the legend only after the points are drawn. Calling legend() on an
# empty axes just emits a "no artists with labels" warning, and its font size
# is discarded when the legend is rebuilt.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=3, fontsize=10)

# 3) Scatterplotting by 'Location','Amount in USD' and 'Industry'

In [None]:
# Deal amount by location: one point per deal, coloured by industry and sized
# by the amount invested.
fig, ax = plt.subplots(figsize=(18, 12))
sns.scatterplot(
    data=df,
    x='Location',
    y='Amount in USD',
    hue='Industry',
    size='Amount in USD',
    sizes=(50, 500),
    ax=ax,
)
ax.tick_params(labelsize=10)
ax.tick_params(axis='x', labelrotation=90)
# Build the legend only after the points are drawn. Calling legend() on an
# empty axes just emits a "no artists with labels" warning, and its font size
# is discarded when the legend is rebuilt.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=3, fontsize=10)

# 4) Top 30 start ups which took lots of investment

In [None]:
# Total funding per startup, ranked descending; chart the 30 largest.
funding_by_startup = df.groupby('Startup Name')['Amount in USD'].sum()
top_startups = funding_by_startup.sort_values(ascending=False).head(30)
top_startups.plot.bar(figsize=(10, 8))

# 5) Top 30 industries which took lots of investment

In [None]:
# Total funding per industry, ranked descending; chart the 30 largest.
funding_by_industry = df.groupby('Industry')['Amount in USD'].sum()
top_industries = funding_by_industry.sort_values(ascending=False).head(30)
top_industries.plot.bar(figsize=(10, 8))

# 6) Top 30 sub-verticals which took lots of investment

In [None]:
# Total funding per sub-vertical, ranked descending; chart the 30 largest.
funding_by_subvertical = df.groupby('Sub-vertical')['Amount in USD'].sum()
top_subverticals = funding_by_subvertical.sort_values(ascending=False).head(30)
top_subverticals.plot.bar(figsize=(10, 8))

# 7) Top 30 locations which took lots of investment

In [None]:
# Total funding per location, ranked descending; chart the 30 largest.
funding_by_location = df.groupby('Location')['Amount in USD'].sum()
top_locations = funding_by_location.sort_values(ascending=False).head(30)
top_locations.plot.bar(figsize=(10, 8))

# 8) Top 30 investors by total amount invested

In [None]:
# Total amount per distinct 'Investors' value, ranked descending; chart the
# 30 largest.
funding_by_investor = df.groupby('Investors')['Amount in USD'].sum()
top_investors = funding_by_investor.sort_values(ascending=False).head(30)
top_investors.plot.bar(figsize=(10, 8))

# 9) Top 30 investment types which took lots of investment

In [None]:
# Total funding per investment type, ranked descending; chart the 30 largest.
funding_by_type = df.groupby('Investment Type')['Amount in USD'].sum()
top_types = funding_by_type.sort_values(ascending=False).head(30)
top_types.plot.bar(figsize=(10, 8))

# 2. Investment by SoftBank Group

In [None]:
# Keep only the deals whose 'Investors' value contains the literal substring
# 'softbank' (plain substring match, not a regular expression).
softbank_mask = df['Investors'].str.contains('softbank', regex=False)
df_sb = df[softbank_mask]

In [None]:
# Preview the first five SoftBank-related deals.
df_sb.head(n=5)

# 1) Time series analysis 2015 to 2021

In [None]:
# Total amount per year across the SoftBank-related deals.
sb_amount_by_year = df_sb.groupby('Year')['Amount in USD'].sum()
sb_amount_by_year.plot.bar(title='Total Investment Amount by Year')

In [None]:
# Number of SoftBank-related deals per year; count() tallies only rows with a
# non-null amount.
sb_deals_by_year = df_sb.groupby('Year')['Amount in USD'].count()
sb_deals_by_year.plot.bar(title='Total Investment Number by Year')

In [None]:
# Mean amount per SoftBank-related deal for each year.
sb_mean_by_year = df_sb.groupby('Year')['Amount in USD'].mean()
sb_mean_by_year.plot.bar(title='Average Investment Amount by Year')

# 2) Scatterplotting by 'Year','Amount in USD' and 'Industry'

In [None]:
# SoftBank-related deals by year: one point per deal, coloured by industry and
# sized by the amount invested.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(
    data=df_sb,
    x='Year',
    y='Amount in USD',
    hue='Industry',
    size='Amount in USD',
    sizes=(50, 500),
    ax=ax,
)
ax.tick_params(labelsize=10)
ax.tick_params(axis='x', labelrotation=90)
# Build the legend only after the points are drawn. Calling legend() on an
# empty axes just emits a "no artists with labels" warning, and its font size
# is discarded when the legend is rebuilt.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=10)

# 3) Scatterplotting by 'Location','Amount in USD' and 'Industry'

In [None]:
# SoftBank-related deals by location: one point per deal, coloured by industry
# and sized by the amount invested.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(
    data=df_sb,
    x='Location',
    y='Amount in USD',
    hue='Industry',
    size='Amount in USD',
    sizes=(50, 500),
    ax=ax,
)
ax.tick_params(labelsize=10)
ax.tick_params(axis='x', labelrotation=90)
# Build the legend only after the points are drawn. Calling legend() on an
# empty axes just emits a "no artists with labels" warning, and its font size
# is discarded when the legend is rebuilt.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=10)

# 4) Top 30 start ups which SoftBank Group invested

In [None]:
# Total amount per startup across the SoftBank-related deals, ranked
# descending; chart the 30 largest.
sb_funding_by_startup = df_sb.groupby('Startup Name')['Amount in USD'].sum()
sb_top_startups = sb_funding_by_startup.sort_values(ascending=False).head(30)
sb_top_startups.plot.bar(figsize=(10, 8))

# 3. Share of SoftBank Group

In [None]:
# Add an integer indicator column 'SB': 1 when 'Investors' contains the literal
# substring 'softbank', otherwise 0 (the boolean mask is multiplied by 1).
is_softbank = df['Investors'].str.contains('softbank', regex=False)
df['SB'] = is_softbank * 1

In [None]:
# Display the frame to inspect the result, including the 'SB' indicator column
# (1 when 'Investors' contains 'softbank', else 0).
df

# 1) Time series of SoftBank Group's share of investment, by amount

In [None]:
# SoftBank's share (%) of the total amount invested in each year.
# Rows are years; columns 0 and 1 hold the summed amount for non-SoftBank and
# SoftBank deals respectively.
df1 = (
    df.groupby(['Year', 'SB'])['Amount in USD']
    .sum()
    # A year with no deals on one side has no (Year, SB) group. Fill it with 0
    # so the share comes out as 0% or 100% instead of NaN.
    .unstack('SB', fill_value=0)
    # Guarantee both columns exist even if one side has no rows at all.
    .reindex(columns=[0, 1], fill_value=0)
)
# A year whose total amount is 0 still yields NaN here (0 / 0).
df1['SB_ratio'] = df1[1] / (df1[0] + df1[1]) * 100
ax = df1['SB_ratio'].plot.bar(title='SoftBank share of investment amount by year')
ax.set_ylabel('Share of total amount (%)')

# 2) Time series of SoftBank Group's share of investment, by number of deals

In [None]:
# SoftBank's share (%) of the number of deals in each year.
# Rows are years; columns 0 and 1 hold the count of non-SoftBank and SoftBank
# deals respectively (count() tallies only rows with a non-null amount).
df2 = (
    df.groupby(['Year', 'SB'])['Amount in USD']
    .count()
    # A year with no deals on one side has no (Year, SB) group. Fill it with 0
    # so the share comes out as 0% or 100% instead of NaN.
    .unstack('SB', fill_value=0)
    # Guarantee both columns exist even if one side has no rows at all.
    .reindex(columns=[0, 1], fill_value=0)
)
# A year with zero counted deals still yields NaN here (0 / 0).
df2['SB_ratio'] = df2[1] / (df2[0] + df2[1]) * 100
ax = df2['SB_ratio'].plot.bar(title='SoftBank share of deal count by year')
ax.set_ylabel('Share of deals (%)')

# We can see a large investment in 2017 and aggressive investment in 2021.