<h1>Indian Startup Data Analysis</h1>


In [None]:
import seaborn as sns
import pandas as pd 
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import cross_val_score, RepeatedKFold
from numpy import mean,std, absolute
import numpy as np

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

**<h3> Dataset**

In [None]:
# Load the raw startup-funding records from the UTF-8 encoded CSV.
data = pd.read_csv(
    "../input/indian-startup-funding/startup_funding.csv",
    encoding='utf-8',
)

**<h3> sample data**

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Report the frame's (rows, columns) tuple.
dataset_shape = data.shape
print("dataset has row and  columns :", dataset_shape)

<h2> Dataset Information

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

<h3> Missing value in the dataset

In [None]:
# Number of missing values in each column.
data.isnull().sum()

<h4> We don't need Date column , so delete this column

In [None]:
# The date column is not used in this analysis, so remove it.
data = data.drop(columns=['Date dd/mm/yyyy'])

In [None]:
# Per-column count of missing values, reshaped into a two-column frame
# ('count' plus the column name as 'Name') so it can be passed to seaborn.
missing_values = data.isnull().sum()
missing_value_frame = missing_values.to_frame(name='count')
missing_value_frame.index.name = 'Name'
missing_value_frame['Name'] = missing_value_frame.index

In [None]:
# Preview the missing-value summary built in the previous cell.
missing_value_frame.head()

<h3> Graphical view of missing value

In [None]:
# Bar chart of the number of missing values per column.
plt.figure(figsize=(10, 5))
sns.set(style="whitegrid", color_codes=True)
ax = sns.barplot(data=missing_value_frame, x='Name', y='count')
ax.set_title("Missing Values in DataFrame")
ax.set_xlabel("Features")
# Tilt the column names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45);

<h4> Filling missing values in the "InvestmentnType" column using forward fill (ffill)

In [None]:
# Forward-fill missing investment types from the previous row.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place call operates on a temporary object under pandas
# copy-on-write and then leaves `data` unchanged. `.ffill()` also replaces the
# deprecated `fillna(method='ffill')` spelling.
data['InvestmentnType'] = data['InvestmentnType'].ffill()

<h4> Investors Name

In [None]:
# Inspect the distinct raw values of the investor-name column.
data['Investors Name'].unique()

<h5>Filling 'Investors Name'

In [None]:
# Replace missing investor names with the placeholder "Unknown".
investor_names = data['Investors Name']
data['Investors Name'] = investor_names.where(investor_names.notna(), "Unknown")

<h5> Filling " Industry Vertical " Value 

In [None]:
# Forward-fill missing industry verticals from the previous row.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
data['Industry Vertical'] = data['Industry Vertical'].ffill()

<h4> Filling " SubVertical " Value with mode of Subvertical column

In [None]:
# Fill missing sub-verticals with the most frequent value of the column.
most_common_subvertical = data['SubVertical'].mode().iloc[0]
data['SubVertical'] = data['SubVertical'].fillna(most_common_subvertical)

<h4> Filling "City Location " Value

In [None]:
# Fill missing city locations with the most frequent value of the column.
most_common_city = data['City  Location'].mode().iloc[0]
data['City  Location'] = data['City  Location'].fillna(most_common_city)

<h3> Removing rows whose "Amount in USD" is missing or is one of "undisclosed", "Undisclosed", "unknown"

In [None]:
# Drop rows whose funding amount is missing or is an explicit placeholder string.
amount = data['Amount in USD']
is_placeholder = amount.isin(['undisclosed', 'Undisclosed', 'unknown'])
rows_to_drop = data.index[is_placeholder | amount.isnull()]
data.drop(index=rows_to_drop, inplace=True)

Remaining rows are:

In [None]:
def _clean_amount(raw):
    """Return `raw` as text with separators and placeholder markers normalised.

    Replacements are applied in order: placeholder words, thousands commas and
    '+' signs are removed; 'N/A' / 'n/a' become the marker string 'others'.
    """
    text = str(raw)
    substitutions = (
        ('undisclosed', ''),
        (",", ""),
        ('Undisclosed', ''),
        ("+", ""),
        ('N/A', "others"),
        ('n/a', "others"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text


data["Amount in USD"] = data["Amount in USD"].apply(_clean_amount)


In [None]:
# Remove runs of literal backslashes from the amount strings (presumably left over
# from escaped byte sequences; the next cell removes the remaining "xc2xa0" text).
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, which turns this call into a silent no-op.
data['Amount in USD'] = data['Amount in USD'].str.replace(r"\\+", '', regex=True)


In [None]:
# Remove the literal text "xc2xa0" from the amount strings (plain substring match).
amount_text = data['Amount in USD'].str
data['Amount in USD'] = amount_text.replace("xc2xa0", '', regex=False)


In [None]:
#data['Amount in USD'].unique()

<h2> Filling the "Remarks" column

In [None]:
# Replace missing remarks with the string 'None'.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place call operates on a temporary object under pandas
# copy-on-write and then leaves `data` unchanged.
data['Remarks'] = data['Remarks'].fillna('None')

In [None]:
# Preview the frame after the fill and string clean-up steps above.
data.head()

**Keeping only the rows whose "Amount in USD" value is not "others"**

In [None]:
# Keep only rows whose amount is not the "others" marker.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells neither raise SettingWithCopyWarning nor act on a temporary view.
data = data[data['Amount in USD'] != 'others'].copy()

In [None]:
# Convert the cleaned amount strings to a numeric dtype (raises on unparsable text).
amount_numeric = pd.to_numeric(data['Amount in USD'])
data['Amount in USD'] = amount_numeric

In [None]:
# Remove runs of literal backslashes, then the leftover "xc2xa0" text, from city names.
# The first call needs regex=True: since pandas 2.0 `Series.str.replace` matches the
# pattern literally by default, so the backslashes would otherwise be left in place.
# The second call is a plain substring replacement, made explicit with regex=False.
data['City  Location'] = data['City  Location'].str.replace(r'\\+', '', regex=True)
data['City  Location'] = data['City  Location'].str.replace("xc2xa0", "", regex=False)

In [None]:
# Inspect the distinct city values after the string clean-up.
data['City  Location'].unique()

In [None]:
# For entries such as "A / B", keep only the text before the first "/" (trimmed).
data['City  Location'] = data['City  Location'].map(
    lambda location: location.partition("/")[0].strip()
)

In [None]:
# For entries such as "A/B", keep only the text after the last "/".
data['InvestmentnType'] = data['InvestmentnType'].map(
    lambda kind: kind.rpartition("/")[-1]
)

In [None]:
## total funding per startup, largest first
# groupby replaces the per-startup filtering loop (one full scan of the frame per
# startup) and no longer rebinds the built-in `sum`. sort=False keeps groups in
# first-appearance order, the same order the loop produced before sorting.
# NOTE(review): groupby skips rows with a missing 'Startup Name'; assumed none exist.
startup = (
    data.groupby('Startup Name', sort=False)['Amount in USD']
    .sum()
    .rename('Revenue" in million"')
    .rename_axis('startup')
    .reset_index()
)
# sort the startups by their total funding amount, descending
startup = startup.sort_values(by='Revenue" in million"', ascending=False)

In [None]:
# Display the per-startup totals, sorted from largest to smallest. The values are
# still raw USD here: the division by 1,000,000 happens in the next cell.
startup

In [None]:
## dividing the amount by 1M to express the totals in millions
startup['Revenue" in million"'] = startup['Revenue" in million"'] / 1000000
# the frame is already sorted descending, so the first ten rows are the top ten
startups = startup.iloc[:10]

<h2> Top ten startups and their revenue in millions

In [None]:
# Display the first ten rows of the sorted per-startup totals (in millions).
startups

In [None]:
# Totals (in millions) of the top-ten startups, as a Series for plotting.
revenue = startups.loc[:, 'Revenue" in million"']
revenue

<h4> Pie chart representation of revenue

In [None]:
# Pie chart of each top startup's share of the displayed total.
fig = plt.figure(figsize=(8, 8))
# One zero offset per wedge, sized from the data: the previous hard-coded 10-tuple
# made plt.pie raise whenever fewer than ten startups were available.
explode = (0.0,) * len(revenue)
plt.pie(revenue, labels=startups['startup'], explode=explode, autopct='%1.1f%%');

In [None]:
## investor func to name undisclosed investors to others
def investor(x):
    """Normalise one investor-name string.

    Leading/trailing whitespace is stripped. If the remaining text is one of the
    "undisclosed" markers (compared case-insensitively, so "Undisclosed Investors"
    is recognised as well as "undisclosed investors"), the string 'others' is
    returned; otherwise the stripped name is returned with its original casing.
    """
    x = x.strip()
    undisclosed_markers = ('undisclosed', 'undisclosed investors', 'undisclosed investor')
    if x.lower() in undisclosed_markers:
        return 'others'
    return x
# Run every investor-name string through investor().
data["Investors Name"] = data["Investors Name"].map(investor)

In [None]:
# investor names, excluding the rows marked as 'others'
names = data.loc[data['Investors Name'] != 'others', "Investors Name"]

<h2> WordCloud representation of investors

In [None]:
# Word cloud over all investor-name strings, to show the most frequent words.
investor_text = ' '.join(names)
wordcloud = WordCloud(
    background_color='cyan',
    width=600,
    height=300,
    max_font_size=50,
).generate(investor_text)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud)
plt.title("Frequent investors", fontsize=35)
plt.axis("off")
plt.show()

<h2> Famous industries/ sectors to invest

In [None]:
# Which sectors have the most funding rows? Take the six most frequent verticals.
known_verticals = data.loc[data['Industry Vertical'] != 'others', 'Industry Vertical']
d = known_verticals.value_counts().head(6)
explode = (0.1, 0, 0, 0, 0, 0)  # pull out only the first (largest) wedge

fig1, ax1 = plt.subplots(figsize=(20, 10))
ax1.pie(
    d.values,
    labels=d.index,
    explode=explode,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
ax1.set_title("Famous industries of startup", fontsize=30)
plt.show()

In [None]:
### aggregate the revenue as per the startup name
## get the list of all locations
# Grouped replacement for the previous row-by-row loop, which re-filtered the whole
# frame twice per row and wrote Python lists through `.at` into a column that did
# not exist yet. Every row of a startup receives the same two values:
#   Location - list of the distinct cities seen for that startup (sorted, so the
#              result no longer depends on set iteration order)
#   Revenue  - total 'Amount in USD' over all of the startup's rows, as float
# NOTE(review): groupby skips rows with a missing 'Startup Name'; assumed none exist.
locations_by_startup = {
    name: sorted(set(cities))
    for name, cities in data.groupby('Startup Name')['City  Location']
}
data['Location'] = data['Startup Name'].map(locations_by_startup)
data['Revenue'] = (
    data.groupby('Startup Name')['Amount in USD'].transform('sum').astype(float)
)

In [None]:
## keep exactly one row per startup
# keep='first' retains a single row for every startup. The previous keep=False
# discarded *all* rows of any startup that appeared more than once, i.e. exactly
# the multi-round startups whose Location/Revenue were aggregated in the cell above.
data = data.drop_duplicates(subset='Startup Name', keep='first', ignore_index=True)

In [None]:
## one-hot encode the per-startup location lists (one indicator column per city)
mlb = MultiLabelBinarizer()
location_matrix = mlb.fit_transform(data['Location'])
res = pd.DataFrame(location_matrix, index=data.index, columns=mlb.classes_)

In [None]:
## append the city indicator columns to the main frame (aligned on the index)
data = pd.concat((data, res), axis='columns')

In [None]:
## TF-IDF over the sub-vertical text: word tokens, English stop words removed,
## vocabulary capped at 500 terms
v = TfidfVectorizer(analyzer='word', stop_words='english', max_features=500)
x = v.fit_transform(data['SubVertical'])

In [None]:
# Renumber the rows 0..n-1 and discard the old index.
data = data.reset_index(drop=True)

In [None]:
t = x.toarray()   # dense array of the TF-IDF weights
# Prefix the positional column labels so every label is a string ('tfidf_0', ...).
# Left as integers they get mixed with the string-named columns of `data` when the
# frames are concatenated, and recent scikit-learn versions refuse to fit on a
# DataFrame whose column names are of mixed types.
dt = pd.DataFrame(t).add_prefix('tfidf_')

In [None]:
## append the TF-IDF columns to the previous frame (aligned on the index)
df = pd.concat((data, dt), axis='columns')

In [None]:
from sklearn import preprocessing

# Replace each categorical column by integer codes. The single encoder is re-fitted
# for every column, so after the loop it holds the classes of the last one.
le = preprocessing.LabelEncoder()
for column in ('Startup Name', 'Industry Vertical', 'InvestmentnType'):
    df[column] = le.fit_transform(df[column])


In [None]:
## drop the columns that are not used as model features
non_feature_columns = [
    'Industry Vertical', 'SubVertical', 'Investors Name', 'Remarks',
    'Amount in USD', 'Location', 'City  Location',
]
df = df.drop(columns=non_feature_columns)

In [None]:
# Express the regression target in millions.
df['Revenue'] = df['Revenue'] / 1000000

<h2> Ridge Linear Regression </h2>
It performs L2 regularization: a penalty proportional to the sum of the squared coefficients is added to the optimization objective.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Everything except the target column is used as a feature.
features = df.loc[:, df.columns != 'Revenue']
target = df['Revenue']

n_splits = 20
# Approximate number of training rows in each fold.
n_train = len(features) - len(features) // n_splits

# `Ridge(normalize=True)` was removed in scikit-learn 1.2. Scaling in a pipeline is
# the documented replacement; because StandardScaler divides by the standard
# deviation rather than the L2 norm, alpha is multiplied by the training-set size
# to keep the same regularisation strength as the former alpha=0.1.
model = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.1 * n_train))
cv = RepeatedKFold(n_splits=n_splits, n_repeats=3, random_state=32)   # 20-fold CV, repeated 3 times
scores = cross_val_score(model, features, target,
                         scoring='neg_median_absolute_error', cv=cv, n_jobs=-1)
# scikit-learn reports errors as negative scores; force them to be positive
scores = absolute(scores)
# The metric is the *median* absolute error of each fold, so label it MedAE, not MAE.
print('Mean MedAE: %.3f (%.3f)' % (mean(scores), std(scores)))

<h2> Lasso Linear Regression </h2>
Lasso stands for "Least Absolute Shrinkage and Selection Operator".

It performs L1 regularization: a penalty proportional to the sum of the absolute values of the coefficients is added to the optimization objective.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Everything except the target column is used as a feature.
features = df.loc[:, df.columns != 'Revenue']
target = df['Revenue']

n_splits = 20
# Approximate number of training rows in each fold.
n_train = len(features) - len(features) // n_splits

# `Lasso(normalize=True)` was removed in scikit-learn 1.2. Scaling in a pipeline is
# the documented replacement; because StandardScaler divides by the standard
# deviation rather than the L2 norm, alpha is multiplied by sqrt(training-set size)
# to keep the same regularisation strength as the former alpha=0.1.
clf = make_pipeline(StandardScaler(with_mean=False),
                    linear_model.Lasso(alpha=0.1 * np.sqrt(n_train)))
cv = RepeatedKFold(n_splits=n_splits, n_repeats=3, random_state=32)   # 20-fold CV, repeated 3 times
scores = cross_val_score(clf, features, target,
                         scoring='neg_median_absolute_error', cv=cv, n_jobs=-1)
# scikit-learn reports errors as negative scores; force them to be positive
scores = absolute(scores)
# The metric is the *median* absolute error of each fold, so label it MedAE, not MAE.
print('Mean MedAE: %.3f (%.3f)' % (mean(scores), std(scores)))