# Predicting successful startups


###**Import of data and relevant libraries**

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import graphviz
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold,RFECV
from sklearn.inspection import permutation_importance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict,RepeatedStratifiedKFold,StratifiedKFold
from imblearn.over_sampling import SMOTE

# Load the Crunchbase VC investments export (Latin-1 encoded, comma separated,
# header on the first row); whitespace right after a delimiter is skipped.
startup_df = pd.read_csv(
    "../input/startup-investments-crunchbase/investments_VC.csv",
    delimiter=',',
    header=0,
    encoding="ISO-8859-1",
    skipinitialspace=True,
)
# Strip the trailing space from the two header names that are used later on.
trailing_space_fixes = {'market ': 'market', 'funding_total_usd ': 'funding_total_usd'}
startup_df = startup_df.rename(columns=trailing_space_fixes)


###**1.General overview**

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
startup_df.info()

In [None]:
# The .info() output shows the date columns are not stored as datetimes yet.
# Parse them as YYYY-MM-DD; values that do not match become NaT (errors='coerce').
format_date = ('founded_at','first_funding_at','last_funding_at')
date_columns = list(format_date)
startup_df[date_columns] = startup_df[date_columns].apply(
    pd.to_datetime, format='%Y-%m-%d', errors='coerce'
)

In [None]:
# Re-inspect the dtypes after the date columns were parsed.
startup_df.info()

In [None]:
# Summary statistics of the frame's columns.
startup_df.describe()

###**2.Initial Data Preparation**
Preparing for exploration

In [None]:
# Show the first row as an example record.
startup_df.head(1)

In [None]:
# Replace absolute dates by relative ones: the age of the company (in 365-day
# years) at its first and last funding, plus the founding month and quarter.
one_year = pd.Timedelta(days=365)
founded = startup_df['founded_at']

startup_df['age_first_funding'] = (startup_df['first_funding_at'] - founded) / one_year
startup_df['age_last_funding'] = (startup_df['last_funding_at'] - founded) / one_year
startup_df['founded_month'] = founded.dt.month
startup_df['founded_quarter'] = founded.dt.quarter


In [None]:
# Number of missing values per column.
startup_df.isnull().sum()

In [None]:
# The dataset contains many nulls. Rows are discarded when any of the central
# variables listed below is missing; other columns may still contain nulls.
required_columns = [
    'status',
    'founded_month',
    'founded_year',
    'market',
    'country_code',
    'funding_total_usd',
    'age_first_funding',
]
startup_df = startup_df.dropna(subset=required_columns)
# According to the original run, 28290 observations remain after this step.
startup_df.shape

In [None]:
#After the cleaning above, count how many null values
#remain in each column.

startup_df.isnull().sum()

In [None]:
# Remove variables assumed to be irrelevant for predicting startup success:
# identifiers, URLs, location details, the raw date columns and category_list.
# funding_total_usd is excluded as it is an aggregation of the funding columns.
# 'name' is deliberately kept so startups can still be told apart in the
# further analysis.
columns_to_drop = [
    'permalink',
    'homepage_url',
    'state_code',
    'region',
    'city',
    'founded_at',
    'first_funding_at',
    'funding_total_usd',
    'last_funding_at',
    'category_list',
]
startup_df = startup_df.drop(columns=columns_to_drop)

In [None]:
#Remove exact duplicate rows (if any) and show the resulting shape
startup_df = startup_df.drop_duplicates()
startup_df.shape

In [None]:
# Row counts per status before filtering.
print('Before')
display(startup_df.groupby('status').agg({'country_code':'count'}).sort_values(by=['status'], ascending = False))

# Binary target encoding: closed -> 0, acquired -> 1.
StatusDict = {"closed":0,"acquired":1}

# Keep only startups whose status is one of the two encoded outcomes. The
# explicit .copy() makes the filtered frame independent of the original, so
# adding a column below does not raise pandas' SettingWithCopyWarning.
startup_df = startup_df[startup_df['status'].isin(list(StatusDict))].copy()
startup_df["status_binary"] = startup_df["status"].map(StatusDict)
startup_df = startup_df.drop(['status'], axis = 1)

# Row counts per target class after filtering.
print('After')
display(startup_df.groupby('status_binary').agg({'country_code':'count'}).sort_values(by=['status_binary'], ascending = True))


In [None]:
# Because the format of funding_total_usd is distorted, rebuild the total as
# the sum of the individual funding-type columns (a NaN in any of them
# propagates to the total).
funding_columns = [
    'seed',
    'venture',
    'equity_crowdfunding',
    'undisclosed',
    'convertible_note',
    'debt_financing',
    'angel',
    'grant',
    'private_equity',
    'post_ipo_equity',
    'post_ipo_debt',
    'secondary_market',
    'product_crowdfunding',
]
startup_df['total_investment'] = sum(startup_df[column] for column in funding_columns)

startup_df['total_investment'].describe()



In [None]:
# Largest total investment. Series.max() is vectorised and ignores NaN, whereas
# the builtin max() iterates in Python and is order-dependent when NaN is present.
startup_df['total_investment'].max()

###**3. Data Preparation**
Preparing for modelling


In [None]:
#Exporting data to .CSV in order to explore data in Tableau
#startup_df.to_csv(r'startup_df_v1.csv', index = False, sep=';',mode='w')
#files.download('startup_df_v1.csv')

In [None]:
# (rows, columns) of the frame at the start of the modelling preparation.
startup_df.shape

#### 3.1 Funding sum and rounds

In [None]:
# Outlier removal on total_investment using the interquartile range (IQR, also
# called midspread): keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
investment = startup_df['total_investment']
Q1, Q3 = investment.quantile(0.25), investment.quantile(0.75)
IQR = Q3 - Q1

fund_lower = Q1 - 1.5 * IQR
fund_upper = Q3 + 1.5 * IQR

within_fences = investment.between(fund_lower, fund_upper)
startup_df = startup_df[within_fences]

print(f'The following code remove all datapoints below: {fund_lower} and above {fund_upper}')
print(f'This results in the following data model: {startup_df.shape}')

#### 3.2 Markets


In [None]:
# Keep the 20 most frequent markets as their own categories and collapse every
# other market into a single 'Other' bucket.
market_counts = startup_df['market'].value_counts()
top20_markets = market_counts.head(20).index.tolist()
is_top_market = startup_df['market'].isin(top20_markets)
startup_df['market'] = startup_df['market'].where(is_top_market, 'Other')

#### 3.3 Age when funded

In [None]:
# Some startups report funding dated before their founding date, which makes
# the relative ages negative. Floor each age feature at zero.
# The previous version zeroed age_first_funding in both statements, so
# age_last_funding was never corrected and age_first_funding could be
# overwritten based on the wrong column.
# clip() leaves missing ages (NaN) untouched.
startup_df['age_first_funding'] = startup_df['age_first_funding'].clip(lower=0)
startup_df['age_last_funding'] = startup_df['age_last_funding'].clip(lower=0)

#### 3.4 Age when founded


In [None]:
#The majority of the startups were founded after 1994.
#To further reduce the number of outliers, keep only startups founded in 1995 or later.

startup_df = startup_df[(startup_df['founded_year'] >= 1995.0 )]

#### 3.5 Geographical data

In [None]:
# The country code and the startup name are not used as model features.
startup_df = startup_df.drop(columns=['country_code', 'name'])

#### 3.6 Correlation matrix


In [None]:
# Correlation heatmap of the numeric attributes.
# Non-numeric columns (e.g. 'market') are excluded explicitly: DataFrame.corr()
# raises on text columns in pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(30,30))
corr_matrix = startup_df.select_dtypes(include=[np.number]).corr()

startup_heat = sns.heatmap(corr_matrix, annot=True, linewidth=0.5, cmap='coolwarm', vmin=-1, vmax=1)

# Re-apply the current y-limits to the axes (kept from the original cell;
# presumably a workaround for clipped heatmap rows in some matplotlib versions).
bottom, top = startup_heat.get_ylim()
startup_heat.set_ylim(bottom, top)

plt.show()

In [None]:
# The correlation of a few attributes is blank in the heatmap; the attributes
# listed below are excluded from the model before the heatmap is redrawn.
startup_df = startup_df.drop(
    columns=[
        'post_ipo_equity',
        'post_ipo_debt',
        'round_G',
        'round_H',
        'founded_year',
        'founded_quarter',
        'total_investment',
        'age_last_funding',
    ]
)

# Non-numeric columns (e.g. 'market') are excluded explicitly: DataFrame.corr()
# raises on text columns in pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(30,30))
corr_matrix = startup_df.select_dtypes(include=[np.number]).corr()

startup_heat = sns.heatmap(corr_matrix, annot=True, linewidth=0.5, cmap='coolwarm', vmin=-1, vmax=1)

# Re-apply the current y-limits to the axes (kept from the original cell;
# presumably a workaround for clipped heatmap rows in some matplotlib versions).
bottom, top = startup_heat.get_ylim()
startup_heat.set_ylim(bottom, top)

plt.show()

#### 3.7 Exporting data to Tableau



In [None]:
#Exporting data to .CSV in order to explore data in Tableau
#startup_df.to_csv(r'startup_df_v2.csv', index = False, sep=';',mode='w')
#files.download('startup_df_v2.csv')

###**4. Data Modelling and Tuning**
Predicting successful start-ups


In [None]:
# First modelling step: one-hot encode the categorical attributes and separate
# the features (X) from the target (y).
# Note: modelling_df is another name for startup_df, not a copy.
modelling_df = startup_df
feature_frame = modelling_df.drop(columns=['status_binary'])
X = pd.get_dummies(feature_frame)
y = modelling_df['status_binary']

In [None]:
#Before modelling, hold out 33% of the rows as a test set, so that potential
#overfitting can be identified by comparing train and test scores (random_state fixes the split).

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 10)

#### 4.1 Random Forest Classification




In [None]:
# Test the impact of the number of decision trees (10, 20, ..., 200) on the
# train and test accuracy of the random forest.
for n in range(10, 201, 10):
    model = RandomForestClassifier(n_estimators=n)
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print('{} - train score: {:.3f} | test score: {:.3f}'.format(n, train_score, test_score))

In [None]:
# Fit a 110-tree random forest on the training split.
model = RandomForestClassifier(n_estimators=110)
model.fit(X_train, y_train)

# ROC curve: true positive rate against false positive rate on the test split,
# with the AUC shown in the legend. Column 1 of predict_proba is class 1.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

roc_label = "data 1, auc=" + str(auc)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=roc_label)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()

##### 4.1.1 Model tuning

In [None]:
# Overview of the feature importance via permutation importance on the training
# data. feature_importances_ is not used because it is biased towards
# high-cardinality features (according to the scikit-learn documentation).
result = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=0)
mean_importance = pd.Series(result.importances_mean, index=X_train.columns.values)
fimp = mean_importance.sort_values(ascending=False)
fimp

In [None]:
# Tune the model with recursive feature elimination: one attribute is removed
# per step and every attribute subset is scored by 10-fold stratified
# cross-validated accuracy.
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(10), scoring='accuracy')
rfecv.fit(X_train, y_train)

# RFECV.grid_scores_ was removed in scikit-learn 1.2; the mean CV score per
# subset size is available as cv_results_['mean_test_score']. Fall back to the
# old attribute on releases that do not have cv_results_ yet.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure(figsize=(9,5))
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores, linewidth=3)
plt.title('The correlation between the number of attributes and the accuracy of the model')
plt.show()



In [None]:
# Columns that the recursive feature elimination did not select.
X_train.columns[~rfecv.support_]

In [None]:
# Drop every column that RFECV did not mark as supporting the model.
X.drop(X_train.columns[~rfecv.support_], axis=1, inplace=True)

# Re-split with the same seed as the initial split (random_state=10), so the
# tuned model is trained and tested on the same rows as the initial model and
# the comparison between the two is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)
model.fit(X_train, y_train)

In [None]:
# Permutation importance of the remaining attributes for the refitted model,
# ordered from most to least important.
result = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=0)
mean_importance = pd.Series(result.importances_mean, index=X_train.columns.values)
fimp = mean_importance.sort_values(ascending=False)
fimp

In [None]:
# Plot the attribute importance as a horizontal bar chart.
# The figure size is set on a new figure before drawing. Changing
# rcParams['figure.figsize'] after the plot was created had no effect on this
# chart and silently resized every later figure instead.
plt.figure(figsize=(6, 15))
sns.barplot(x=fimp, y=fimp.index, color='b')
plt.xlabel('Importance score')
plt.ylabel('Attribute')
plt.title('Attribute importance')

plt.show()

In [None]:
# Second ROC curve, used to benchmark the initial model against the tuned version.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

roc_label = "data 1, auc=" + str(auc)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=roc_label)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()

In [None]:
# Evaluate the model with a stratified 10-fold cross-validation repeated 5 times.
cross_val = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cross_val, n_jobs=-1, error_score='raise')

# Report mean and standard deviation in the same unit (percent); previously
# only the mean was scaled by 100.
mean_pct = np.mean(scores) * 100
std_pct = np.std(scores) * 100
print('Average score of 5 run with a stratified 10 Kfold is {:.3f}% with a standard deviation of: {:.3f}%'.format(mean_pct, std_pct))

In [None]:
# Use the model to predict the future state of a single startup: draw one
# random row from the test split and classify it.
pred = X_test.sample(n=1)
outcome = model.predict(pred)
predicted_class = outcome[0]
print('Predicted Class: {}'.format(predicted_class))

# The percentage printed is the mean cross-validated accuracy of the model,
# not a probability specific to this startup.
mean_accuracy_pct = np.mean(scores) * 100
if predicted_class == 1:
    print('This startup is predicted to be successful by {:.1f}%'.format(mean_accuracy_pct))
    display(pred)
elif predicted_class == 0:
    print('This startup is predicted NOT to be successful by {:.1f}%'.format(mean_accuracy_pct))
    display(pred)
# In a real-life setting, 'pred' would be replaced by another dataframe that
# describes the startup under investigation.

#### 4.2 Logistic regression




In [None]:
#To evaluate the true performance of a logistic regression, all attributes are included once again
X = pd.get_dummies(modelling_df.drop(['status_binary'],axis = 1))
y = modelling_df['status_binary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

logreg = LogisticRegressionCV(max_iter=10000, scoring='roc_auc')
logreg.fit(X_train,y_train)

# With scoring='roc_auc', LogisticRegressionCV.score() returns the ROC AUC,
# not the accuracy, so the output is labelled accordingly.
print('The training model ROC AUC: {:.4}'.format(logreg.score(X_train,y_train)))
print('The test model ROC AUC: {:.4}'.format(logreg.score(X_test,y_test)))

In [None]:
# ROC curve of the logistic regression on the test split
# (true positive rate against false positive rate).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

roc_label = "data 1, auc=" + str(auc)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=roc_label)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()

##### 4.2.1 Model tuning

In [None]:
#Trying to balance the classes (class_weight='balanced') in order to achieve a better result
logreg = LogisticRegressionCV(max_iter=10000,scoring='roc_auc',class_weight='balanced')
logreg.fit(X_train,y_train)

# With scoring='roc_auc', LogisticRegressionCV.score() returns the ROC AUC,
# not the accuracy, so the output is labelled accordingly.
print('The training model ROC AUC: {:.4}'.format(logreg.score(X_train,y_train)))
print('The test model ROC AUC: {:.4}'.format(logreg.score(X_test,y_test)))

In [None]:
#Fitting the model with balanced class weights, an L1 penalty and liblinear as the solver
#**Liblinear is described in the documentation as a solid choice for small datasets**
logreg = LogisticRegressionCV(penalty='l1', solver='liblinear',scoring='roc_auc',class_weight='balanced', max_iter=100000)
logreg.fit(X_train,y_train)

# With scoring='roc_auc', LogisticRegressionCV.score() returns the ROC AUC,
# not the accuracy, so the output is labelled accordingly.
print('The training model ROC AUC: {:.4}'.format(logreg.score(X_train,y_train)))
print('The test model ROC AUC: {:.4}'.format(logreg.score(X_test,y_test)))

In [None]:
#Fitting the model with an L1 penalty and the liblinear solver, without class weighting
#from sklearn.metrics import mean_squared_error
logreg = LogisticRegressionCV(penalty='l1', solver='liblinear', max_iter=10000,scoring='roc_auc')
logreg.fit(X_train,y_train)

# With scoring='roc_auc', LogisticRegressionCV.score() returns the ROC AUC,
# not the accuracy, so the output is labelled accordingly.
print('The training model ROC AUC: {:.4}'.format(logreg.score(X_train,y_train)))
print('The test model ROC AUC: {:.4}'.format(logreg.score(X_test,y_test)))

#print('Mean square error: {:.4}'.format(mean_squared_error(y_test, y_train)))
#print('Root mean square error: {:.4}'.format(mean_squared_error(y_test, y_train)))

In [None]:
# ROC curve of the L1-penalised logistic regression on the test split.
positive_class_proba = logreg.predict_proba(X_test)[:, 1]
y_pred_proba = positive_class_proba
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()

###**5. Data Evaluation**
Predicting successful start-ups


In [None]:
# Class balance: number of rows per target value (0 = closed, 1 = acquired).
modelling_df.groupby('status_binary').agg({'status_binary':'count'})

*Since our dataset is imbalanced, we can apply SMOTE to balance the difference between successful and non-successful startups. SMOTE uses k-nearest neighbours to create new rows, not by copying existing rows but by synthesising new ones from them.*


*We can then apply SMOTE to the training data and carry out the two ML models from the previous chapter once again.*

*However, it is important that we only apply SMOTE to the training data: information from the target variable is then restricted to the training set, and the real performance on the test set stays isolated.*

In [None]:
# Rebuild the full feature matrix and target, then hold out 33% of the rows
# as a test set with a fixed seed.
modelling_df = startup_df
target_column = 'status_binary'
X = pd.get_dummies(modelling_df.drop(columns=[target_column]))
y = modelling_df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)



In [None]:
# Oversample the minority class of the TRAINING data only.
# random_state makes the synthetic rows reproducible.
oversample = SMOTE(random_state=0)

os_X, os_y = oversample.fit_resample(X_train, y_train)
# The previous version referenced an undefined name `col` here (NameError);
# the resampled features keep the column names of the training features.
os_X = pd.DataFrame(data=os_X, columns=X_train.columns)
# Name the target column explicitly so that os_y['status_binary'] works whether
# fit_resample returned a Series or a plain array.
os_y = pd.DataFrame(data=os_y, columns=['status_binary'])

In [None]:
# Display the resampled target.
os_y

In [None]:
# Evaluate the result of SMOTE to understand how the data was extended.
# SMOTE oversamples, so the messages say "oversampled" (they previously said
# "undersampled").
n_total = len(os_X)
n_closed = len(os_y[os_y['status_binary']==0])
n_success = len(os_y[os_y['status_binary']==1])

print("length of oversampled data is ", n_total)
print("Number of closed startups in oversampled data", n_closed)
print("Number of succesfull startups", n_success)
print("Proportion of closed startups in oversampled data is {}%".format(n_closed/n_total*100))
print("Proportion of succesfull startups in oversampled data is {}%".format(n_success/n_total*100))

##### 5.1 Random forest w. balanced data

In [None]:
# X / y keep pointing at the SMOTE-balanced data for the cells that follow.
X = os_X
y = os_y

# Train on the balanced data, but keep X_test / y_test from the split made
# BEFORE oversampling. Re-splitting os_X / os_y would put synthetic rows (and
# rows that models are later fitted on) into the test set and inflate the
# reported scores.
X_train, y_train = os_X, os_y['status_binary']

In [None]:
# Fit a 110-tree random forest on the current training data.
model = RandomForestClassifier(n_estimators=110)
model.fit(X_train, y_train)

# ROC curve (true positive rate against false positive rate) with the AUC score
# in the legend, evaluated on the current test data.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

roc_label = "data 1, auc=" + str(auc)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=roc_label)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()


> Since we did not achieve great success while tuning the random forest model in chapter 4, we leave the model in its original state and do not exclude attributes by performing an RFECV (Recursive Feature Elimination with Cross-Validation).




##### 5.2 Logistic regression w. balanced data

In [None]:
# L1-penalised logistic regression fitted on the SMOTE-balanced data.
logreg = LogisticRegressionCV(penalty='l1', solver='liblinear', max_iter=10000, scoring='roc_auc')
# os_y is a single-column DataFrame; pass a 1-D target to avoid scikit-learn's
# DataConversionWarning.
logreg.fit(os_X, np.ravel(os_y))

# With scoring='roc_auc', score() returns the ROC AUC rather than the accuracy;
# both lines now use the same label and unit.
print('The training model ROC AUC (balanced): {:.4} %'.format(logreg.score(os_X,os_y)*100))
print('The test model ROC AUC: {:.4} %'.format(logreg.score(X_test,y_test)*100))

In [None]:
# ROC curve of the logistic regression trained on the balanced data.
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc=4)
plt.show()

###**6. Deployment**
The deployment of a random forest model to classify successful startups.


In [None]:
# Use the model to predict the future state of a single startup.

# Train the model once again on a split of the current X / y
# (the SMOTE-balanced data).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
model.fit(X_train, y_train)

# Classify one randomly drawn row of the test split.
pred = X_test.sample(n=1)
outcome = model.predict(pred)

# ROC AUC of the model on the whole test split.
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_pred_proba)

predicted_class = outcome[0]
print('Predicted Class: {}'.format(predicted_class))

# The percentage printed is the model's test ROC AUC, not a probability
# specific to this startup.
auc_pct = auc * 100
if predicted_class == 1:
    print('This startup is predicted to be successful by {:.1f}%'.format(auc_pct))
    display(pred)
elif predicted_class == 0:
    print('This startup is predicted NOT to be successful by {:.1f}%'.format(auc_pct))
    display(pred)

# In a real-life setting, 'pred' would be replaced by a user-supplied dataframe
# describing a startup that needs to be investigated; the following cells show this.

Tableau Software was recently acquired by Salesforce. However, it is categorized as an operating company in our dataset. To test the performance of our model, we can see how the model performs on Tableau:

In [None]:
# Hand-built single-row feature frame for Tableau Software.
# The keys have to match the columns the model was trained on. Two keys were
# misspelled ('undiscolsed', 'private equity') and are corrected to the
# dataset's column names 'undisclosed' and 'private_equity'.
# NOTE(review): the market dummy names (some with a trailing space) are kept as
# written - confirm them against X_train.columns.
prod_dict = {
    'funding_rounds':              [2.0],
    'founded_month':               [1.0],
    'seed':                        [0.0],
    'venture':                     [15000000.0],
    'equity_crowdfunding':         [0.0],
    'undisclosed':                 [0.0],
    'convertible_note':            [0.0],
    'debt_financing':              [0.0],
    'angel':                       [0.0],
    'grant':                       [0.0],
    'private_equity':              [0.0],
    'secondary_market':            [0.0],
    'product_crowdfunding':        [0.0],
    'round_A':                     [5000000.0],
    'round_B':                     [10000000.0],
    'round_C':                     [0.0],
    'round_D':                     [0.0],
    'round_E':                     [0.0],
    'round_F':                     [0.0],
    'age_first_funding':           [1.0],
    'market_Advertising':          [0.0],
    'market_Analytics ':           [0.0],
    'market_Biotechnology ':       [0.0],
    'market_Clean Technology ':    [0.0],
    'market_Curated Web ':         [0.0],
    'market_E-Commerce ':          [0.0],
    'market_Education':            [0.0],
    'market_Enterprise Software ': [0.0],
    'market_Finance ':             [0.0],
    'market_Games ':               [0.0],
    'market_Hardware + Software ': [0.0],
    'market_Health care ':         [0.0],
    'market_Messaging ':           [0.0],
    'market_Mobile ':              [0.0],
    'market_Other':                [0.0],
    'market_Search ':              [0.0],
    'market_Security':             [0.0],
    'market_Semiconductors ':      [0.0],
    'market_Social Media ':        [0.0],
    'market_Software ':            [1.0],
    'market_Web Hosting ':         [0.0],
}
prod_pred = pd.DataFrame(data=prod_dict)
outcome = model.predict(prod_pred)

print('Predicted Class: {}'.format(outcome[0]))

# The percentage printed is the model's test ROC AUC (auc), not a probability
# specific to this startup.
if outcome == 1:
    print('This startup is predicted to be successful by {:.1f}%'.format(auc*100))
elif outcome == 0:
    print('This startup is predicted NOT to be successful by {:.1f}%'.format(auc*100))
display(prod_pred)

Slack software

In [None]:
# Hand-built single-row feature frame for Slack.
# The keys have to match the columns the model was trained on. Two keys were
# misspelled ('undiscolsed', 'private equity') and are corrected to the
# dataset's column names 'undisclosed' and 'private_equity'.
# NOTE(review): the market dummy names (some with a trailing space) are kept as
# written - confirm them against X_train.columns.
prod_dict = {
    'funding_rounds':              [6.0],
    'founded_month':               [1.0],
    'seed':                        [1500000.0],
    'venture':                     [178450000.0],
    'equity_crowdfunding':         [0.0],
    'undisclosed':                 [0.0],
    'convertible_note':            [0.0],
    'debt_financing':              [0.0],
    'angel':                       [0.0],
    'grant':                       [0.0],
    'private_equity':              [0.0],
    'secondary_market':            [0.0],
    'product_crowdfunding':        [0.0],
    'round_A':                     [5000000.0],
    'round_B':                     [10700000.0],
    'round_C':                     [42750000.0],
    'round_D':                     [120000000.0],
    'round_E':                     [0.0],
    'round_F':                     [0.0],
    'age_first_funding':           [0.0],
    'market_Advertising':          [0.0],
    'market_Analytics ':           [0.0],
    'market_Biotechnology ':       [0.0],
    'market_Clean Technology ':    [0.0],
    'market_Curated Web ':         [0.0],
    'market_E-Commerce ':          [0.0],
    'market_Education':            [0.0],
    'market_Enterprise Software ': [1.0],
    'market_Finance ':             [0.0],
    'market_Games ':               [0.0],
    'market_Hardware + Software ': [0.0],
    'market_Health care ':         [0.0],
    'market_Messaging ':           [0.0],
    'market_Mobile ':              [0.0],
    'market_Other':                [0.0],
    'market_Search ':              [0.0],
    'market_Security':             [0.0],
    'market_Semiconductors ':      [0.0],
    'market_Social Media ':        [0.0],
    'market_Software ':            [0.0],
    'market_Web Hosting ':         [0.0],
}
prod_pred = pd.DataFrame(data=prod_dict)
outcome = model.predict(prod_pred)

print('Predicted Class: {}'.format(outcome[0]))

# The percentage printed is the model's test ROC AUC (auc), not a probability
# specific to this startup.
if outcome == 1:
    print('This startup is predicted to be successful by {:.1f}%'.format(auc*100))
elif outcome == 0:
    print('This startup is predicted NOT to be successful by {:.1f}%'.format(auc*100))
display(prod_pred)

Jawbone - a consumer hardware producer that shut down its operations and is currently undergoing liquidation because of several lawsuits from creditors

In [None]:
# Hand-built single-row feature frame for Jawbone.
# The keys have to match the columns the model was trained on. Two keys were
# misspelled ('undiscolsed', 'private equity') and are corrected to the
# dataset's column names 'undisclosed' and 'private_equity'.
# NOTE(review): the market dummy names (some with a trailing space) are kept as
# written - confirm them against X_train.columns.
prod_dict = {
    'funding_rounds':              [11.0],
    'founded_month':               [1.0],
    'seed':                        [0.0],
    'venture':                     [188799995.0],
    'equity_crowdfunding':         [0.0],
    'undisclosed':                 [0.0],
    'convertible_note':            [0.0],
    'debt_financing':              [93000000.0],
    'angel':                       [0.0],
    'grant':                       [0.0],
    'private_equity':              [237000000.0],
    'secondary_market':            [0.0],
    'product_crowdfunding':        [0.0],
    'round_A':                     [5000000.0],
    'round_B':                     [70000000.0],
    'round_C':                     [0.0],
    'round_D':                     [0.0],
    'round_E':                     [63999995.0],
    'round_F':                     [0.0],
    'age_first_funding':           [7.95],
    'market_Advertising':          [0.0],
    'market_Analytics ':           [0.0],
    'market_Biotechnology ':       [0.0],
    'market_Clean Technology ':    [0.0],
    'market_Curated Web ':         [0.0],
    'market_E-Commerce ':          [0.0],
    'market_Education':            [0.0],
    'market_Enterprise Software ': [0.0],
    'market_Finance ':             [0.0],
    'market_Games ':               [0.0],
    'market_Hardware + Software ': [0.0],
    'market_Health care ':         [0.0],
    'market_Messaging ':           [0.0],
    'market_Mobile ':              [0.0],
    'market_Other':                [1.0],
    'market_Search ':              [0.0],
    'market_Security':             [0.0],
    'market_Semiconductors ':      [0.0],
    'market_Social Media ':        [0.0],
    'market_Software ':            [0.0],
    'market_Web Hosting ':         [0.0],
}
prod_pred = pd.DataFrame(data=prod_dict)
outcome = model.predict(prod_pred)

print('Predicted Class: {}'.format(outcome[0]))

# The percentage printed is the model's test ROC AUC (auc), not a probability
# specific to this startup.
if outcome == 1:
    print('This startup is predicted to be successful by {:.1f}%'.format(auc*100))
elif outcome == 0:
    print('This startup is predicted NOT to be successful by {:.1f}%'.format(auc*100))
display(prod_pred)