In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [None]:
master_df = pd.read_csv("../input/startup-echosystem/startup_ecosystem_funds.csv")

In [None]:
master_df.head()

In [None]:
master_df.isnull().sum()

In [None]:
master_df.info()

# Feature Exploration

In [None]:
# Discard the SNo and Remarks columns for now, as they are unlikely to add value to the model.
# drop() returns a new frame, so master_df itself is left untouched.
fund_df = master_df.drop(columns=['SNo', 'Remarks'])

In [None]:
fund_df.head()

#### Feature : StartupName

In [None]:
fund_df.StartupName.isnull().sum()

In [None]:
fund_count  = fund_df.groupby('StartupName').size()

In [None]:
fund_count.head()

In [None]:
sort_count = fund_count.sort_values(ascending=False)

In [None]:
sort_count.head()

In [None]:
sort_df = sort_count.to_frame(name='count')

In [None]:
sort_df.head()

In [None]:
sort_df['StartUpName'] = sort_df.index

In [None]:
sort_df.head()

In [None]:
top_20 = sort_df.head(20)

In [None]:
top_20

In [None]:
plt.figure(figsize=(15,4))
sns.barplot(data=top_20,x='StartUpName', y='count',color='green')
plt.title('No of time startup get funds')
plt.xticks(rotation=70)

In [None]:
#creating sub df 
df_conti = pd.DataFrame()

### Feature : Date

In [None]:
fund_df.Date.isnull().sum()

In [None]:
# Normalise the date separators to "/". Both patterns are literal text, not
# regular expressions: regex=False keeps "." from ever being read as the
# "match any character" wildcard, whatever the installed pandas default is.
fund_df['Date'] = fund_df.Date.str.replace('.', "/", regex=False)
fund_df['Date'] = fund_df.Date.str.replace('//', "/", regex=False)

In [None]:
# Parse the Date strings into datetimes so the year can be extracted and we can
# see which year had the most funding.
# NOTE(review): no format/dayfirst is passed, so ambiguous values such as
# 01/02/2017 are parsed month-first by default — confirm whether the CSV is
# day-first and pass dayfirst=True if so.
fund_df['Date'] = pd.to_datetime(fund_df.Date)

In [None]:
fund_df['Year'] = fund_df['Date'].dt.year

In [None]:
fund_df['Year'].head()

In [None]:
plt.figure(figsize=(20,2))
sns.countplot(data=fund_df, y=fund_df.Year)
plt.title('Most number of funds allocated by year')

In [None]:
df_conti['Year'] = fund_df['Year']

### Feature : IndustryVertical

In [None]:
fund_df.IndustryVertical.isnull().sum()

In [None]:
#we need to take care about missing values
fund_df.IndustryVertical.value_counts().head()

In [None]:
fund_df.IndustryVertical = fund_df.IndustryVertical.str.replace('eCommerce','ECommerce')

In [None]:
# Fill missing verticals with the previous row's value.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
fund_df['IndustryVertical'] = fund_df['IndustryVertical'].ffill()

In [None]:
fund_df.IndustryVertical.value_counts().head()

###### To find the top 20 verticals, we first need to deal with the missing values in the AmountInUSD column

In [None]:
fund_df.AmountInUSD.isnull().sum()

Missing values are quite high

In [None]:
fund_df.AmountInUSD = fund_df.AmountInUSD.str.replace(',','')

In [None]:
fund_df.AmountInUSD = pd.to_numeric(fund_df.AmountInUSD)

In [None]:
fund_df.AmountInUSD = fund_df.AmountInUSD.fillna(fund_df.AmountInUSD.mean())

In [None]:
fund_df.AmountInUSD = pd.to_numeric(fund_df.AmountInUSD)

###### We have used the mean to fill the null values. There may be better ways to fill these values, but for now we will work with the mean. Now let's find the top 20 verticals that received the most funds

In [None]:
# Total funding per industry vertical, largest first.
# Only AmountInUSD is aggregated: summing every column would also try to "sum"
# the string and datetime columns, which is meaningless and fails on recent pandas.
vertical_sort = fund_df.sort_values(['AmountInUSD'])
vertical_group = (
    vertical_sort.groupby('IndustryVertical')[['AmountInUSD']]
    .sum()
    .sort_values(by='AmountInUSD', ascending=False)
)

In [None]:
top_20 = vertical_group.head(20)
top_20

In [None]:
plt.figure(figsize=(15,4))
sns.barplot(data=top_20,x=top_20.index, y='AmountInUSD')
plt.title('Total funding to Domains')
plt.xticks(rotation=90)

In [None]:
df_conti['IndustryVertical'] = fund_df['IndustryVertical']

In [None]:
df_conti.head()  #Sub dataframe for modeling 

### Feature : SubVertical

In [None]:
fund_df.SubVertical.isnull().sum()

In [None]:
fund_df.SubVertical.value_counts().head(10)

In [None]:
# Fill missing sub-verticals with the previous row's value.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
fund_df['SubVertical'] = fund_df['SubVertical'].ffill()
fund_df.SubVertical.value_counts().head(10)

In [None]:
# Total funding per sub-vertical, largest first.
# Group the current fund_df rather than the earlier vertical_sort snapshot: that
# snapshot was taken before SubVertical was forward-filled, so its rows with a
# missing SubVertical would be dropped by groupby. Only AmountInUSD is summed.
subvertical_group = (
    fund_df.groupby('SubVertical')[['AmountInUSD']]
    .sum()
    .sort_values(by='AmountInUSD', ascending=False)
)

In [None]:
top_20 = subvertical_group.head(20)
top_20.head()

In [None]:
# Bar chart of the 20 best-funded sub-verticals.
plt.figure(figsize=(15,4))
sns.barplot(data=top_20,x=top_20.index, y='AmountInUSD')
# The title names sub-verticals so this figure is not confused with the
# IndustryVertical chart, which previously carried the same title.
plt.title('Total funding to SubVerticals')
plt.xticks(rotation=90)

In [None]:
df_conti['SubVertical'] = fund_df['SubVertical']

### Feature : CityLocation

In [None]:
fund_df.CityLocation.isnull().sum()

In [None]:
fund_df.CityLocation.value_counts().head()

Since it is well known that most startups are based in Bangalore, we fill the null values with Bangalore for now.

In [None]:
fund_df['CityLocation'] = fund_df['CityLocation'].fillna('Bangalore')

In [None]:
fund_df.CityLocation.value_counts().head()

In [None]:
startup_count_by_cities  =  fund_df.CityLocation.value_counts()

In [None]:
cities_df = startup_count_by_cities.to_frame(name='count')
cities_df['cities'] = startup_count_by_cities.index

In [None]:
top_20 = cities_df.head(20)

In [None]:
#import squarify
#plt.figure(figsize=(15,8))
#count = fund_df['CityLocation'].value_counts()
#squarify.plot(sizes=count.values,label=count.index, value=count.values)
#plt.title('Distribution of Startups across Top cities')

In [None]:
plt.figure(figsize=(15,4))
sns.barplot(data=top_20,x='count', y='cities')
plt.title('Number of startups by cities')
plt.xlabel('Number of start ups')
plt.xticks(rotation=90)

In [None]:
df_conti['CityLocation'] = fund_df['CityLocation']

### Feature : InvestorsName

In [None]:
fund_df.InvestorsName.isnull().sum()

In [None]:
fund_df.InvestorsName = fund_df.InvestorsName.str.replace('Undisclosed investors','Undisclosed Investors')

In [None]:
fund_df.InvestorsName.value_counts().head()

In [None]:
# Total funding per (IndustryVertical, InvestorsName) pair, largest first.
# Group the current fund_df rather than the earlier vertical_sort snapshot, which
# predates the 'Undisclosed Investors' spelling normalisation done on fund_df.
# Only AmountInUSD is summed; the other columns are not meaningful to add up.
investor_group = (
    fund_df.groupby(['IndustryVertical', 'InvestorsName'])[['AmountInUSD']]
    .sum()
    .sort_values(by='AmountInUSD', ascending=False)
)
# Turn the (vertical, investor) MultiIndex into a two-column frame for display.
investor_relation = investor_group.index.to_frame()

In [None]:
investor_relation.head()

### Feature : InvestmentType

In [None]:
fund_df.InvestmentType.isnull().sum()

In [None]:
# Fill missing investment types with the previous row's value.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
fund_df['InvestmentType'] = fund_df['InvestmentType'].ffill()

In [None]:
fund_df['InvestmentType'].head()

In [None]:
plt.figure(figsize=(20,2))
sns.countplot(data=fund_df,y=fund_df.InvestmentType)
plt.title('Most number to funding type')

In [None]:
df_conti['InvestmentType'] = fund_df['InvestmentType']

In [None]:
df_conti.head()

### Feature: AmountInUSD

We already handled the missing data in the AmountInUSD column while finding the top 20 IndustryVerticals, so let's explore other aspects of this feature.

In [None]:
fund_df.AmountInUSD.isnull().sum()

In [None]:
sns.distplot(fund_df.AmountInUSD)

In [None]:
# Pass the series as x explicitly: the meaning of the first positional argument
# differs between seaborn versions, while x= always gives a horizontal box plot.
sns.boxplot(x=fund_df.AmountInUSD)

In [None]:
fund_df.AmountInUSD.shape

In [None]:
fund_df[fund_df.AmountInUSD>10000000].shape

In [None]:
print(fund_df.AmountInUSD.mean())
print(fund_df.AmountInUSD.min())
print(fund_df.AmountInUSD.max())

In [None]:
fund_df.AmountInUSD.sort_values(ascending=False).head(5)

We can see that the gap between the min and max is very large; that is one reason our plots look concentrated in one area.

In [None]:
df_conti['AmountInUSD'] = fund_df['AmountInUSD']

In [None]:
#top 20 startups by word count

In [None]:
df_conti.head()

In [None]:
# Work on an independent copy so that nothing done during encoding can alter
# df_conti (plain assignment would only create a second name for the same frame).
df_encod = df_conti.copy()

# Feature Encoding 

We will use the `df_encod` sub-dataframe for encoding and model building.

Most of our independent features are categorical, and most algorithms work best with numeric data, so we have to deal with these categorical features. There are many ways to convert categorical data into numeric form. Categorical features are usually divided into two categories:

Nominal (which cannot be ranked, e.g. cities)

Ordinal (which can be ranked, e.g. a student's degree level)

Our features mostly fall under the nominal category, for which the usual categorical-to-numeric technique is ONE HOT ENCODING. Its drawback is the curse of dimensionality: it creates one new column per category, and that many columns is a poor fit for building the model.

As alternatives, we can use count/frequency encoding, or

just take the top 10 categories, convert them into numeric columns, and leave the rest.

Here we one-hot encode only the top 10 categories of each feature.

In [None]:
# First four columns of df_encod, taken as an explicit copy: one_hot_encode adds
# new columns to this frame, and writing to a bare slice of another DataFrame
# raises pandas' SettingWithCopyWarning.
cat = df_encod.iloc[:, 0:4].copy()

In [None]:
cat.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def one_hot_encode(df, columnName, top_n=10):
    """Add 0/1 indicator columns for the most frequent values of a column.

    For each of the ``top_n`` most frequent values in ``df[columnName]`` a new
    column named ``"<columnName>_<value>"`` is added to ``df`` in place. It holds
    1 where the row has that value and 0 otherwise. Rows whose value is not among
    the most frequent ones (or is missing) get 0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    columnName : str
        Name of the categorical column to encode.
    top_n : int, default 10
        How many of the most frequent values receive an indicator column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # value_counts() already orders values from most to least frequent.
    top_labels = df[columnName].value_counts().head(top_n).index
    for label in top_labels:
        # str() lets non-string categories (e.g. integer years) be used in the
        # new column name instead of raising TypeError on concatenation.
        df[columnName + "_" + str(label)] = np.where(df[columnName] == label, 1, 0)
    return df

In [None]:
one_hot_encode(cat,'IndustryVertical')
one_hot_encode(cat,'SubVertical')
one_hot_encode(cat,'CityLocation')

In [None]:
cat.head()

# Building Models

In [None]:
X = cat.iloc[:,4:]

In [None]:
X.head()

In [None]:
y = df_encod.iloc[:,-1]

In [None]:
y.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every RMSE reported below, identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [None]:
lr = LinearRegression()

In [None]:
lr.fit(X_train,y_train)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
print('RMSE :  ',np.sqrt(mean_squared_error(y_test,y_pred)))

### K Nearest Neighbours

In [None]:
kn = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Fit the KNN regressor once; the duplicate cell that refitted it on the same
# data did redundant work.
kn.fit(X_train, y_train)

In [None]:
ky_pred = kn.predict(X_test)

In [None]:
print('RMSE :  ',np.sqrt(mean_squared_error(y_test,ky_pred)))

### Decision Tree Regressor

In [None]:
# A fixed random_state makes tie-breaking between equally good splits
# deterministic, so the tree and its RMSE are reproducible.
dt = DecisionTreeRegressor(random_state=42)

In [None]:
dt.fit(X_train, y_train)

In [None]:
dt_pred = dt.predict(X_test)

In [None]:
print('RMSE :  ',np.sqrt(mean_squared_error(y_test,dt_pred)))

### Random Forest Regressor

In [None]:
# 200 trees; a fixed random_state makes the bootstrap samples, and therefore the
# reported RMSE, reproducible.
rf = RandomForestRegressor(n_estimators=200, random_state=42)

In [None]:
rf.fit(X_train,y_train)

In [None]:
rf_pred = rf.predict(X_test)

In [None]:
print('RMSE :  ',np.sqrt(mean_squared_error(y_test,rf_pred)))

### XG Boost Regressor

In [None]:
xgb = xgb.XGBRFRegressor(objective ='reg:linear')

In [None]:
xgb.fit(X_train,y_train)

In [None]:
xgb_pred = xgb.predict(X_test)

In [None]:
print('RMSE :  ',np.sqrt(mean_squared_error(y_test,xgb_pred)))

### Cat Boost