In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#loading the dataset
df = pd.read_csv('startup_funding.csv')

In [3]:
df.head(2)

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [5]:
#changing the datetime
df.drop(columns=['Remarks'], inplace = True)

In [6]:
df.set_index('Sr No', inplace = True)

In [7]:
#Renaming columns for further easy use
df.rename(columns={
    'Date dd/mm/yyyy': 'date',
    'Startup Name': 'startup',
    'InvestmentnType' : 'round',
    'Industry Vertical' : 'vertical',
    'SubVertical' : 'subVertical',
    'City  Location': 'city',
    'Investors Name' : 'investors',
    'Amount in USD' : 'amount'
    
}, inplace = True)

In [8]:
df.head()

Unnamed: 0_level_0,date,startup,vertical,subVertical,city,investors,round,amount
Sr No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000
2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394
3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860
4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000
5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000


In [9]:
#Converting amount to float firstly
df['amount'] = df['amount'].fillna('0')

In [10]:
df.info

<bound method DataFrame.info of              date                       startup             vertical  \
Sr No                                                                  
1      09/01/2020                        BYJU’S               E-Tech   
2      13/01/2020                        Shuttl       Transportation   
3      09/01/2020                     Mamaearth           E-commerce   
4      02/01/2020  https://www.wealthbucket.in/              FinTech   
5      02/01/2020                        Fashor  Fashion and Apparel   
...           ...                           ...                  ...   
3040   29/01/2015                    Printvenue                  NaN   
3041   29/01/2015                      Graphene                  NaN   
3042   30/01/2015                Mad Street Den                  NaN   
3043   30/01/2015                     Simplotel                  NaN   
3044   31/01/2015              couponmachine.in                  NaN   

                               

In [11]:
#Converting amount of float to int
# df['amount'] = df['amount'].astype(int)
# can't convert directly beacuse we have ',' in between
df['amount'] = df['amount'].str.replace(',','')
df['amount'] = df['amount'].str.replace('undisclosed','')
df['amount'] = df['amount'].str.replace('unknown','')
df['amount'] = df['amount'].str.replace('Undisclosed','')
# df['amount'] = df['amount'].astype(int)

In [12]:
df = df[df['amount'].str.isdigit()]

In [13]:
df['amount'] = df['amount'].astype(float)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3022 entries, 1 to 3044
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         3022 non-null   object 
 1   startup      3022 non-null   object 
 2   vertical     2851 non-null   object 
 3   subVertical  2097 non-null   object 
 4   city         2842 non-null   object 
 5   investors    2998 non-null   object 
 6   round        3020 non-null   object 
 7   amount       3022 non-null   float64
dtypes: float64(1), object(7)
memory usage: 212.5+ KB


In [15]:
df.sample(3)

Unnamed: 0_level_0,date,startup,vertical,subVertical,city,investors,round,amount
Sr No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1476,24/08/2016,UrbanLadder,eCommerce,Online Furniture Store,Mumbai,Trifecta Capital,Private Equity,3000000.0
2845,20/04/2015,1mg (Healthkartplus),Online Pharmacy & Drug DB,,Gurgaon,"Deep Kalra, Sequoia Capital, Omidyar Network, ...",Private Equity,6000000.0
1709,17/05/2016,Crownit,Consumer Internet,Mobile customer rewards management platform,Gurgaon,"Undisclosed investors, Accel Partners, Helion ...",Private Equity,0.0


In [16]:
def to_inr(dollar):
    inr = dollar * 82.5
    return inr/10000000

In [17]:
df['amount'] = df['amount'].apply(to_inr)

In [18]:
df['date'] = pd.to_datetime(df['date'], errors = 'coerce')

In [19]:
df.head()

Unnamed: 0_level_0,date,startup,vertical,subVertical,city,investors,round,amount
Sr No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2020-09-01,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,1650.0
2,2020-01-13,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,66.39925
3,2020-09-01,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,151.460595
4,2020-02-01,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,24.75
5,2020-02-01,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,14.85


In [20]:
df['date'].dt.day

Sr No
1        1.0
2       13.0
3        1.0
4        1.0
5        1.0
        ... 
3040    29.0
3041    29.0
3042    30.0
3043    30.0
3044    31.0
Name: date, Length: 3022, dtype: float64

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3022 entries, 1 to 3044
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         3019 non-null   datetime64[ns]
 1   startup      3022 non-null   object        
 2   vertical     2851 non-null   object        
 3   subVertical  2097 non-null   object        
 4   city         2842 non-null   object        
 5   investors    2998 non-null   object        
 6   round        3020 non-null   object        
 7   amount       3022 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 212.5+ KB


In [23]:
df = df.dropna(subset = ['date','startup','vertical','city','investors','round','amount'])

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2814 entries, 1 to 2873
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         2814 non-null   datetime64[ns]
 1   startup      2814 non-null   object        
 2   vertical     2814 non-null   object        
 3   subVertical  2070 non-null   object        
 4   city         2814 non-null   object        
 5   investors    2814 non-null   object        
 6   round        2814 non-null   object        
 7   amount       2814 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 197.9+ KB
