#  Indian Startup Funding

This dataset has funding information of the Indian startups from January 2015 to August 2017.
It includes columns with the date funded, the city the startup is based out of, the names of the funders, and the amount invested (in USD).

Perform EDA and apply Linear Regression

# Import Libraries & load dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
import statsmodels.api as sms
import scipy.stats as stats
import pylab
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.api import het_goldfeldquandt
from statsmodels.stats.diagnostic import linear_rainbow
from statsmodels.stats.stattools import durbin_watson

In [None]:
# Read the raw funding records from the Kaggle input folder and preview the last rows.
startup = pd.read_csv(filepath_or_buffer='../input/indian-startup-funding/startup_funding.csv')
startup.tail(n=5)

# Check Description & Null values

In [None]:
startup.info()

In [None]:
startup.describe(include=np.object)

In [None]:
startup.duplicated().sum()

# Data Cleaning

In [None]:
startup.drop(['Remarks','SNo'],axis=1,inplace=True)

In [None]:
startup.head(2)

In [None]:
startup['AmountInUSD'] = startup['AmountInUSD'].str.replace(',','')

In [None]:
avgfund = startup[pd.notnull(startup['AmountInUSD'])]['AmountInUSD'].astype(int).mean()
round(avgfund)

In [None]:
startup['AmountInUSD'].fillna(round(avgfund),inplace=True)

In [None]:
startup['AmountInUSD'] = startup['AmountInUSD'].astype(int)

In [None]:
startup.info()

In [None]:
startup['InvestmentType'].fillna(startup['InvestmentType'].mode()[0],inplace=True)

In [None]:
startup.drop('SubVertical',axis=1,inplace=True)

In [None]:
startup['IndustryVertical'] = startup['IndustryVertical'].astype(str)

In [None]:
#startup['IndustryVertical'].value_counts()

In [None]:
def industryMap(category,key):
    """Relabel every industry vertical whose text contains ``key``.

    Operates on the module-level ``startup`` frame: each ``IndustryVertical``
    value whose lower-cased text contains ``key`` is replaced by ``category``;
    all other values are kept as they are. The column is reassigned, nothing
    is returned.

    Parameters
    ----------
    category : str
        Label written to the matching rows.
    key : str
        Substring looked up in the lower-cased value. It is compared as given,
        so it has to be passed in lower case to match anything.
    """
    # Plain str methods replace `np.str.lower`: the `np.str` alias was removed
    # in NumPy 1.24. str(x) keeps the lookup safe for non-string cells.
    startup['IndustryVertical'] = startup['IndustryVertical'].apply(
        lambda x: category if key in str(x).lower() else x
    )

In [None]:
# (category, keyword) pairs for industryMap. They are applied strictly in this
# order because every call rewrites the column that the next call reads.
industry_keywords = [
    ('Ecommerce', 'commerce'),
    ('Logistics', 'logistic'),
    ('Health', 'health'),
    ('Education', 'education'),
    ('Food', 'food'),
    ('Grocery', 'grocer'),
    ('Technology', 'analytics'),
    ('Education', 'ed-tech'),
    ('Technology', 'data'),
    ('HR', 'hiring'),
    ('HR', 'job'),
    ('Food', 'tea'),
    ('Fashion', 'fashion'),
    ('Fashion', 'apparel'),
    ('Entertainment', 'games'),
    ('Media', 'news'),
    ('Finance', 'payment'),
    ('Ecommerce', 'delivery'),
    ('Wheels', 'auto'),
    ('Wheels', 'car'),
    ('Wheels', 'vehicle'),
    ('Wheels', 'taxi'),
    ('Wheels', 'cab'),
    ('Food', 'tiffin'),
    ('Hospitality', 'hotel'),
    ('Finance', 'finance'),
    ('Finance', 'loan'),
    ('Ecommerce', 'hyperlocal'),
    ('Health', 'homeopathy'),
    ('Wheels', 'commute'),
    ('Hospitality', 'accomodation'),
    ('Wheels', 'bike'),
    ('Wheels', 'wheeler'),
    ('Finance', 'financ'),
    ('Finance', 'wallet'),
    ('Health', 'fitness'),
    ('Hospitality', 'room'),
    ('Education', 'learning'),
    ('Health', 'medical'),
    ('Reality', 'real estate'),
    ('Reality', 'residential'),
    ('HR', 'recruitment'),
    ('Wheels', 'scooter'),
    ('Travel', 'travel'),
    ('Internet', 'internet'),
    ('Internet', 'web'),
    ('Food', 'beverage'),
    ('Reality', 'office'),
    ('Finance', 'fund'),
    ('Finance', 'bill'),
    ('Ecommerce', 'shopping'),
    ('Entertainment', 'stream'),
    ('Health', 'pharmacy'),
    ('Online', 'online'),
    ('Mobile', 'mobile'),
    ('Mobile', 'app'),
    ('Technology', 'platform'),
    ('Marketplace', 'marketplace'),
    ('Service', 'service'),
]
for category, keyword in industry_keywords:
    industryMap(category, keyword)

In [None]:
a = (startup['IndustryVertical'].value_counts() == 1)

In [None]:
startup['IndustryVertical'] = startup['IndustryVertical'].\
apply(lambda x: 'Others' if x in a[a == True].index else x)

In [None]:
startup['IndustryVertical'].replace({'nan':np.nan},inplace=True)

In [None]:
startup['IndustryVertical'].fillna(method='ffill',inplace=True)

In [None]:
startup['IndustryVertical'].value_counts()

In [None]:
startup.info()

In [None]:
startup['CityLocation'].fillna(startup['CityLocation'].mode()[0],inplace=True)

In [None]:
startup['InvestorsName'] = startup['InvestorsName'].str.replace(' ','')

In [None]:
startup['InvestorsName'] = startup['InvestorsName'].apply(lambda x: \
        x.replace('TigerGlobalManagement','TigerGlobal') if (np.str.lower(str(x)).find('tigerglobalmanagement') != -1) else x)  

In [None]:
startup['InvestorsName'] = startup['InvestorsName'].apply(lambda x: \
        x.replace('SequoiaIndia','SequoiaCapital') if (np.str.lower(str(x)).find('sequoiaindia') != -1) else x)  

In [None]:
startup['InvestorsName'] = startup['InvestorsName'].apply(lambda x: \
        x.replace('Undisclosedinvestors','UndisclosedInvestors') if (np.str.lower(str(x)).find('undisclosedinvestors') != -1) else x)  

### One hot encoding

In [None]:
dd = startup['InvestorsName'].str.get_dummies(sep=',')

In [None]:
startup = pd.concat([startup,dd],axis=1)
startup.drop('InvestorsName',axis=1,inplace=True)

In [None]:
startup.head()

In [None]:
startup['Date'] = startup['Date'].str.replace('.','/')

In [None]:
startup['Date'] = startup['Date'].str.replace('//','/')

In [None]:
startup['Date'] = pd.to_datetime(startup['Date'])

In [None]:
startup['Year'] = startup['Date'].dt.year

In [None]:
startup['Month'] = startup['Date'].dt.month

In [None]:
startup.drop('Date',axis=1,inplace=True)

In [None]:
startup.head()

# Visualization

## How does the funding ecosystem change with time?

In [None]:
# Number of funding rounds per (Year, Month), drawn as one line in time order.
plt.figure(figsize=(15,10))
sns.set_style('darkgrid')
monthly_rounds = startup.groupby(['Year','Month'])['Month'].count()
monthly_rounds.plot(color='grey')
plt.show()

## Do cities play a major role in funding?

In [None]:
# Share of funding rounds among the ten most frequent cities.
plt.figure(figsize=(15,10))
top_cities = startup['CityLocation'].value_counts().head(10)
top_cities.plot.pie(autopct='%1.1f%%')
plt.show()

## Which industries are favored by investors for funding?

In [None]:
# Funding rounds per industry label, most frequent first.
plt.figure(figsize=(15,10))
industry_counts = startup['IndustryVertical'].value_counts()
industry_counts.plot.bar(color='purple')
plt.show()

## Who are the important investors in the Indian Ecosystem?

In [None]:
# Rounds per investor: column totals of the investor indicator matrix `dd`.
# DataFrame.sum() computes them directly instead of summing each column in
# Python through apply().
plt.figure(figsize=(12,10))
dd.sum().sort_values(ascending=False).head(15).plot.barh(color='r')
plt.show()

## How much funding do startups generally get in India?

In [None]:
# Fold spelling variants of four companies into one label each.
# The keyword has to begin a word (\b): a bare substring test for 'ola' or
# 'oyo' also relabels unrelated startups whose names merely contain those
# letters. Matching is case-insensitive, and astype(str) keeps missing names
# from breaking the lookup. This also removes the dependency on `np.str`,
# which no longer exists in NumPy >= 1.24.
for canonical_name in ('Flipkart','Ola','Oyo','Paytm'):
    is_variant = startup['StartupName'].astype(str).str.contains(
        r'\b' + canonical_name,case=False,regex=True)
    startup.loc[is_variant,'StartupName'] = canonical_name

In [None]:
# Bar chart built from the 20 rows with the largest funding amounts.
plt.figure(figsize=(12,10))
largest_rounds = startup.sort_values('AmountInUSD',ascending=False).head(20)
sns.barplot(data=largest_rounds,x='StartupName',y='AmountInUSD')
plt.xticks(rotation=90)
plt.show()

## Nature of Investment?

In [None]:
# Unify the spaced spellings with the unspaced ones. replace() is used instead
# of map(): map() with a dict turns every value that is not a key into NaN,
# which would discard all labels other than the three listed here.
startup['InvestmentType'] = startup['InvestmentType'].replace({'Private Equity':'PrivateEquity','Seed Funding':'SeedFunding','Crowd Funding':'CrowdFunding'})

In [None]:
# Number of rounds per investment type. The column is passed as `x=`:
# seaborn >= 0.12 treats the first positional argument as `data`.
plt.figure(figsize=(12,10))
sns.countplot(x=startup['InvestmentType'])
plt.xticks(rotation=90)
plt.show()

### Label encoding

In [None]:
startup['InvestmentType'] = startup['InvestmentType'].astype(str)

In [None]:
startup['InvestmentType'] = LabelEncoder().fit_transform(startup['InvestmentType'])

In [None]:
startup['CityLocation'] = LabelEncoder().fit_transform(startup['CityLocation'])

In [None]:
startup['IndustryVertical'] = LabelEncoder().fit_transform(startup['IndustryVertical'])

In [None]:
startup.head()

# Applying Linear Regression model

In [None]:
#x = startup.drop(['StartupName','AmountInUSD',''],axis=1)
x = startup[['CityLocation','InvestmentType']]
y = startup['AmountInUSD']

In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.30,random_state=123)

In [None]:
lr = LinearRegression()

In [None]:
lr.fit(xtrain,ytrain)
ypred = lr.predict(xtest)

In [None]:
r2_score(ytest,ypred)

In [None]:
mean_squared_error(ytest,ypred)

In [None]:
# statsmodels' OLS does not add an intercept on its own; without add_constant
# the fit is forced through the origin and is not comparable to the
# scikit-learn model fitted on the same features.
model = sms.OLS(y,sms.add_constant(x)).fit()
model.summary()

# Linear Regression Assumptions

In [None]:
# Hold-out residuals: observed amount minus predicted amount.
residual = ytest.sub(ypred)

### 1. No pattern in residual

In [None]:
# Residuals against predictions. x/y are passed by keyword: seaborn >= 0.12
# no longer accepts them positionally.
sns.residplot(x=ypred,y=residual)

### 2. Normal Distribution

In [None]:
stats.probplot(residual,plot=pylab)
plt.show()

In [None]:
test,pvalue = stats.shapiro(residual)
pvalue

### 3. Multicollinearity

In [None]:
vif = [variance_inflation_factor(startup[['CityLocation','InvestmentType','AmountInUSD']].values,i) for i in range(startup[['CityLocation','InvestmentType','AmountInUSD']].shape[1])]

In [None]:
pd.DataFrame({'vif':vif},index=['CityLocation','InvestmentType','AmountInUSD']).T

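### 4. Heteroscedasticity

If the residuals are heteroscedastic (their variance is not constant), the standard errors and significance tests of the linear regression are unreliable.

h0: residuals are not heteroscedastic (constant variance)

h1: residuals are heteroscedastic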

In [None]:
# Goldfeld-Quandt test on the hold-out residuals and features; the p-value is displayed.
gq_result = het_goldfeldquandt(residual,xtest)
test,pvalue,result = gq_result
pvalue

### 5. Auto-correlation

The errors should not be auto correlated in nature as it will violate the assumptions of the linear regression model.

- Durbin Watson Test

The statistic ranges from 0 to 4:

[0-2) - (+)ve correlation

=2 - no correlation

(2-4] - (-)ve correlation

In [None]:
# Durbin-Watson statistic of the hold-out residuals.
durbin_watson(resids=residual)

### 6. Linearity

- Rainbow Test

h0: linear in nature

h1: not linear in nature

In [None]:
# Rainbow test on the fitted statsmodels OLS result; the p-value is displayed.
rainbow_result = linear_rainbow(model)
test,pvalue = rainbow_result
pvalue

# Conclusion

- Funding activity was high during 2015-16, but the trend has been decreasing since 2016.
- Tier-1 cities receive a higher share of investment than Tier-2 cities.
- Internet, Technology and Ecommerce are the industries most favoured by investors.
- Paytm, Flipkart and Ola are the highest-funded startups.
- The linear regression model achieved a very low R² score.
- The model failed the linear regression assumption checks.

So we conclude that linear regression is not a good fit for this data.