In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from imblearn import under_sampling, over_sampling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


In [None]:
# Read the startup CSV into `data` and preview its first rows.
data = pd.read_csv('../input/startup-success-prediction/startup data.csv')
data.head()

# **Check Dataset**

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

# **Data Preprocessing**

**Check for missing values and duplicated data**

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Per-column missing-value count and percentage, highest percentage first,
# restricted to the columns that actually have missing values.
data_missing_value = (
    data.isnull().sum()
    .rename_axis('feature')
    .reset_index(name='missing_value')
    .assign(percentage=lambda tbl: (tbl['missing_value'] / len(data) * 100).round(2))
    .sort_values('percentage', ascending=False)
    .reset_index(drop=True)
    .loc[lambda tbl: tbl['percentage'] > 0]
)
data_missing_value

Five columns in this dataset contain missing values: 'closed_at', 'Unnamed: 6', 'age_last_milestone_year', 'age_first_milestone_year', and 'state_code.1'.

First, drop the columns 'Unnamed: 6' and 'state_code.1' because they are not useful features.

In [None]:
# Remove the two columns that carry no usable information.
data = data.drop(columns=['Unnamed: 6', 'state_code.1'])

In [None]:
# Count rows that duplicate an earlier row across all columns.
data.duplicated().sum()

There are no fully duplicated rows. However, checking for duplicates on the 'name' column alone reveals 1 duplicated entry, so we drop it.

In [None]:
# Count rows whose 'name' repeats an earlier row's name.
data.duplicated(subset=['name']).sum()

In [None]:
# Drop rows whose 'name' was already seen (drop_duplicates keeps the first occurrence by default).
data=data.drop_duplicates(subset=['name'])

Now we fill the columns 'age_first_milestone_year' and 'age_last_milestone_year' with 0. Zero is the smallest value, used on the assumption that the company has not yet passed its first milestone.

In [None]:
# Missing milestone ages are replaced by 0 in both milestone columns.
milestone_age_cols = ['age_first_milestone_year', 'age_last_milestone_year']
data[milestone_age_cols] = data[milestone_age_cols].fillna(0)

**Add new feature 'Age'.**

In [None]:
# Parse both date columns as datetimes.
for date_col in ('closed_at', 'founded_at'):
    data[date_col] = pd.to_datetime(data[date_col])

# 'last_date' is the closing date where one exists; rows without a closing
# date fall back to the fixed cut-off 2013-12-31.
data['last_date'] = pd.to_datetime(data['closed_at'].fillna('2013-12-31'))

We fill the missing values in the 'last_date' column with 2013-12-31, on the assumption that this is the last date covered by the dataset.

In [None]:
# Make sure 'founded_at' is a datetime column before subtracting (idempotent).
data["founded_at"] = pd.to_datetime(data["founded_at"])

# Company age in whole years, from founding date to 'last_date'.
# Recent pandas versions reject np.timedelta64(1, 'Y') because a "year" is not
# a fixed duration, so the day count is divided by the mean Gregorian year
# length instead (365.2425 days, the value numpy's 'Y' unit stands for).
DAYS_PER_YEAR = 365.2425
lifetime = data["last_date"] - data["founded_at"]
data["age"] = (lifetime.dt.days / DAYS_PER_YEAR).round()

If we check the data, there are negative values here. Drop the rows that contain negative values.

In [None]:
# Show the five rows with the smallest 'age' across all age-related columns.
age_columns = [
    'age',
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
]
data[age_columns].sort_values('age').head()

In [None]:
# Drop every row that has a negative value in any age-related column.
# Missing values compare as False, so rows with NaN in these columns are kept.
non_negative_cols = [
    'age',
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
]
has_negative = (data[non_negative_cols] < 0).any(axis=1)
data = data.drop(index=data.index[has_negative])

Now we check the distributions: some features have a skewness greater than 2. We handle them with normalization.

In [None]:
# Numeric features whose skewness is inspected.
features = [
    'age_first_funding_year',
    'relationships',
    'funding_total_usd',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'funding_rounds',
    'milestones',
    'avg_participants',
    'age',
]

# True for each feature whose skewness exceeds 2.
skewness = data[features].skew(axis=0, skipna=True)
skewness > 2

In [None]:
# Min-max scale the selected features into new 'norm_<feature>' columns.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
norm = ['age_first_funding_year', 'relationships', 'funding_total_usd']
for var in norm:
    column_2d = data[[var]].to_numpy()  # shape (n_rows, 1), as the scaler expects
    data['norm_' + var] = MinMaxScaler().fit_transform(column_2d)

# **Data Visualization**

In [None]:
# Rows with labels == 1, counted per age.
succes_per_age = data.loc[data['labels'] == 1].groupby('age')['labels'].count()
data_grp_3 = succes_per_age.reset_index(name='total_succes')

# All rows, counted per age.
rows_per_age = data.groupby('age')['labels'].count()
data_grp_4 = rows_per_age.reset_index(name='total')

# Inner join: ages without any labels == 1 row do not appear in the result.
data_grp_3 = data_grp_3.merge(data_grp_4, on='age')
data_grp_3['succes_rate'] = (data_grp_3['total_succes'] / data_grp_3['total'] * 100).round(2)

data_grp_3


In [None]:
# Bar chart of the success rate (%) per company age, one bar per row of data_grp_3.
fig, ax = plt.subplots(figsize=(15, 7))

sns.barplot(x='age', y='succes_rate', data=data_grp_3, ax=ax,
            palette=sns.color_palette("Blues_d", n_colors=13, desat=1))

# Write each bar's value just above its top.
for bar_pos, rate in enumerate(data_grp_3['succes_rate']):
    ax.text(bar_pos - 0.1, rate + 3, str(rate) + '%', fontsize=12, color='gray', fontweight='bold')

# The figure should stand on its own when the notebook is skimmed.
ax.set_title('Success rate by startup age')
ax.set_xlabel('Age (years)')
ax.set_ylabel('Success rate (%)')
ax.set_ylim(0, 100)

ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
# The original referenced plt.tight_layout without calling it, so the layout
# was never adjusted; it is now actually invoked.
plt.tight_layout()

1. Business insight from age: startups with a lifespan of more than 4 years tend to be successful (success rate above 52%).

In [None]:
# Rows with labels == 1, counted per number of milestones.
succes_per_milestone = data.loc[data['labels'] == 1].groupby('milestones')['labels'].count()
data_grp_5 = succes_per_milestone.reset_index(name='total_succes')

# All rows, counted per number of milestones.
rows_per_milestone = data.groupby('milestones')['labels'].count()
data_grp_6 = rows_per_milestone.reset_index(name='total')

# Inner join: milestone counts without any labels == 1 row do not appear in the result.
data_grp_5 = data_grp_5.merge(data_grp_6, on='milestones')
data_grp_5['succes_rate'] = (data_grp_5['total_succes'] / data_grp_5['total'] * 100).round(2)

data_grp_5

In [None]:
# Bar chart of the success rate (%) per milestone count, one bar per row of data_grp_5.
fig, ax = plt.subplots(figsize=(15, 7))

sns.barplot(x='milestones', y='succes_rate', data=data_grp_5, ax=ax,
            palette=sns.color_palette("Blues_d", n_colors=13, desat=1))

# Write each bar's value just above its top.
for bar_pos, rate in enumerate(data_grp_5['succes_rate']):
    ax.text(bar_pos - 0.1, rate + 3, str(rate) + '%', fontsize=12, color='gray', fontweight='bold')

# The figure should stand on its own when the notebook is skimmed.
ax.set_title('Success rate by number of milestones')
ax.set_xlabel('Milestones')
ax.set_ylabel('Success rate (%)')
ax.set_ylim(0, 100)

ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
# The original referenced plt.tight_layout without calling it, so the layout
# was never adjusted; it is now actually invoked.
plt.tight_layout()

2. Business insight from milestones: startups that have at least 1 milestone have the potential to be successful (success rate above 60%).

In [None]:
# Bucket the relationship count: exactly 0, more than 10, everything else.
relationship_count = data['relationships']
data['Relationships Range'] = np.select(
    [relationship_count == 0, relationship_count > 10],
    ['relationship 0', 'relationships >10'],
    default='relationships 1-10',
)

# Number of companies per (bucket, label) pair.
data20 = (
    data.groupby(['Relationships Range', 'labels'])
    .agg({'id': 'count'})
    .reset_index()
)

# One row per bucket, one column per label value.
data20_pv = pd.pivot_table(
    data20,
    index=['Relationships Range'],
    columns=['labels'],
    values=['id'],
).reset_index()
# Label 0 is named 'Closed' and label 1 'Acquired'.
data20_pv.columns = ['Relationships Range', 'Closed', 'Acquired']

data20_pv['Total Company'] = data20_pv['Closed'] + data20_pv['Acquired']
data20_pv['Success Rate'] = (data20_pv['Acquired'] / data20_pv['Total Company'] * 100).round(2)
data20_pv


In [None]:
# Bar chart of the success rate (%) for each relationships bucket in data20_pv.
fig, ax = plt.subplots(figsize=(8, 6))

blues = sns.color_palette("Blues_d", n_colors=13, desat=1)
sns.barplot(data=data20_pv, x='Relationships Range', y='Success Rate', palette=blues, ax=ax)

# Write each bar's value just above its top.
success_rates = data20_pv['Success Rate']
for bar_pos, rate in zip(range(len(success_rates)), success_rates):
    ax.text(bar_pos - 0.1, rate + 3, str(rate) + '%', fontsize=20, color='gray', fontweight='bold')

ax.set_xticklabels(ax.get_xticklabels(), rotation=0);

3. Business insight from relationships: startups with more than 1 relationship have the potential to be successful (success rate above 61%).

# **Training and Test Data**

In [None]:
# Split Feature Vector and Label
# Model inputs: normalised features, raw age/funding features and the
# indicator columns for state, category and funding type.
feature_columns = [
    'norm_relationships', 'norm_age_first_funding_year', 'norm_funding_total_usd',
    'age_last_funding_year',
    'age_first_milestone_year', 'age_last_milestone_year',
    'funding_rounds', 'milestones', 'age',
    'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate',
    'is_software', 'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 'is_gamesvideo',
    'is_ecommerce', 'is_biotech', 'is_consulting', 'is_othercategory',
    'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
    'avg_participants', 'is_top500',
]
X = data[feature_columns]
y = data['labels']  # target / label

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random oversampling is applied to the training split only.
oversampler = over_sampling.RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)


### Modeling with AdaBoost

In [None]:
# Fit AdaBoost on the (oversampled) training split and evaluate on the test split.
ab = AdaBoostClassifier(random_state=42)
ab.fit(X_train, y_train)

y_predicted = ab.predict(X_test)          # hard class predictions, reused by later cells
y_predicted_train = ab.predict(X_train)
# Score of the second entry in ab.classes_ for each test row. ROC AUC needs a
# continuous score; feeding it hard 0/1 predictions (as the original did)
# collapses the curve to a single operating point.
# NOTE(review): assumes the labels are 0/1 so that column 1 is the positive class — confirm.
y_score = ab.predict_proba(X_test)[:, 1]

print('\nconfusion matrix')
print(confusion_matrix(y_test, y_predicted))
print('\naccuracy')
print(accuracy_score(y_test, y_predicted))
print('\nclassification report')
print(classification_report(y_test, y_predicted))  # precision, recall, f1-score, support

# The original trained a second, identically configured classifier only to
# read these two scores; the fitted model is reused instead. The old name is
# kept as an alias in case other cells still refer to it.
regression = ab
print("Train Accuracy:", ab.score(X_train, y_train))
print("Test Accuracy:", ab.score(X_test, y_test))

print('AUC Score:', roc_auc_score(y_test, y_score))


# **Features Importances**

In [None]:
# Horizontal bar chart of the ten largest AdaBoost feature importances,
# with the most important feature at the top.
feat_importances = pd.Series(ab.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(10)

ax = top_features.plot(kind='barh')
ax.invert_yaxis()
ax.set_xlabel('score')
ax.set_ylabel('feature')
ax.set_title('feature importance score')

# **Business Simulation**

In [None]:
# Assemble one frame with the test features, the raw funding amount, the
# prediction and the true label.
# NOTE(review): this cell overwrites X_test, y_test and y_predicted, so it does
# not look safe to run twice or before re-running the modelling cell — confirm.
y_predicted = pd.DataFrame(y_predicted)
y_test = pd.DataFrame(y_test).reset_index(drop=True)

# Funding is aligned on the original row index, so it must be attached
# before the index is reset.
X_test['funding_total_usd'] = data['funding_total_usd']
X_test = X_test.reset_index(drop=True)

X_test['y_predicted'] = y_predicted
X_test['y_test'] = y_test
X_test.head()

Total failed startups:

In [None]:
# Number of test rows whose true label is 0.
is_fail = y_test['labels'] == 0
y_test.loc[is_fail].count()

Correctly predicted failed startups (True Negatives):

In [None]:
# Test rows that are labelled 0 and predicted 0.
true_negative = (X_test['y_test'] == 0) & (X_test['y_predicted'] == 0)
X_test.loc[true_negative, 'y_predicted'].count()

Total successful startups:

In [None]:
# Number of test rows whose true label is 1.
is_success = y_test['labels'] == 1
y_test.loc[is_success].count()

Correctly predicted successful startups (True Positives):

In [None]:
# Test rows that are labelled 1 and predicted 1.
true_positive = (X_test['y_test'] == 1) & (X_test['y_predicted'] == 1)
X_test.loc[true_positive, 'y_predicted'].count()

**Total Invest without ML:**

In [None]:
# Sum of funding_total_usd over every test row (no model-based selection).
X_test['funding_total_usd'].sum()

If we invest without the AdaBoost model, it will cost 5 billion USD.

**Total Invest with ML:**

In [None]:
# Sum of funding_total_usd over the test rows predicted as 1.
predicted_success = X_test['y_predicted'] == 1
X_test.loc[predicted_success, 'funding_total_usd'].sum()

If we invest using AdaBoost, it will cost 3.3 billion USD. That is a 34% improvement in investment efficiency.

**Potential Loss without ML:**

In [None]:
# Sum of funding_total_usd over the test rows whose true label is 0.
actual_fail = X_test['y_test'] == 0
X_test.loc[actual_fail, 'funding_total_usd'].sum()

Without AdaBoost, the potential investment loss is 1.8 billion USD.

**Potential Loss With ML:**

In [None]:
# Sum of funding_total_usd over the test rows labelled 0 but predicted 1.
false_positive = (X_test['y_test'] == 0) & (X_test['y_predicted'] == 1)
X_test.loc[false_positive, 'funding_total_usd'].sum()

With AdaBoost, the potential loss is reduced by 82%: it costs only 300 million USD.

**Saving Fund Investment with predict ML:**

In [None]:
# Sum of funding_total_usd over the test rows labelled 0 and predicted 0.
avoided_fail = (X_test['y_test'] == 0) & (X_test['y_predicted'] == 0)
X_test.loc[avoided_fail, 'funding_total_usd'].sum()

With the AdaBoost model, we save 1.5 billion USD.