In [1]:
#!pip install xgboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [3]:
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it was produced by a trusted step of this project.
df = pd.read_pickle('crunchbase_data/cleaned_startup_data.pkl')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() appended a stray "None" line to the output.
df.info()
print(df.columns)
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27220 entries, 0 to 27219
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   permalink               27220 non-null  object             
 1   name                    27219 non-null  object             
 2   homepage_url            25447 non-null  object             
 3   category_list           26824 non-null  object             
 4   funding_total_usd       22952 non-null  float64            
 5   status                  27220 non-null  object             
 6   country_code            27220 non-null  object             
 7   state_code              27192 non-null  object             
 8   region                  27152 non-null  object             
 9   city                    27152 non-null  object             
 10  funding_rounds          27220 non-null  int64              
 11  founded_at              27220 non-null  d

## Filter Columns for X 
### Drop any columns that will not be part of the X values. This includes any columns used to define is_success, such as the status columns and anything from the acquired df.

#### Some higher-cardinality columns and datetime columns are dropped temporarily for simplicity

In [4]:
# Remove the columns that should not be part of the X values.
# This includes the columns used to define is_success.
df = df.drop(
    columns=[
        'permalink', 'name', 'homepage_url', 'country_code',
        'status', '2014_status',
        'acquirer_name', 'acquirer_category_list', 'acquirer_country_code',
        'acquirer_state_code', 'acquirer_region', 'acquirer_city',
        'acquired_at', 'acquired_month', 'price_amount', 'price_currency_code',
    ]
)
df.columns

Index(['category_list', 'funding_total_usd', 'state_code', 'region', 'city',
       'funding_rounds', 'founded_at', 'first_funding_at', 'last_funding_at',
       '2014_funding_total_usd', '2014_funding_rounds', '2014_last_funding_at',
       'is_success'],
      dtype='object')

In [5]:

# Dropped for this test run. Datetime columns to add back later:
# 'founded_at', 'first_funding_at', 'last_funding_at', '2014_last_funding_at'.
df = df.drop(
    columns=[
        'category_list', 'city',
        'founded_at', 'first_funding_at', 'last_funding_at',
        '2014_last_funding_at',
    ]
)
# Optional ID column (currently disabled):
# df.reset_index(inplace=True)
# df.rename(columns={'index': 'id'}, inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 27220 entries, 0 to 27219
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   funding_total_usd       22952 non-null  float64
 1   state_code              27192 non-null  object 
 2   region                  27152 non-null  object 
 3   funding_rounds          27220 non-null  int64  
 4   2014_funding_total_usd  18797 non-null  float64
 5   2014_funding_rounds     27220 non-null  int64  
 6   is_success              27220 non-null  int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 1.7+ MB


In [6]:
# Show the columns that remain after both drops (features plus is_success).
df.columns


Index(['funding_total_usd', 'state_code', 'region', 'funding_rounds',
       '2014_funding_total_usd', '2014_funding_rounds', 'is_success'],
      dtype='object')

## One hot encode the low cardinality categorical columns 

In [7]:
# Earlier candidate list: status, 2014_status, acquirer_country_code,
# acquirer_state_code, acquirer_region, acquired_month, price_currency_code.
# Those columns are dropped earlier in the notebook, so only the two
# remaining categorical columns are one-hot encoded here.
columns_to_encode = ['state_code', 'region']
df_encoded = pd.get_dummies(data=df, columns=columns_to_encode)
df_encoded

Unnamed: 0,funding_total_usd,funding_rounds,2014_funding_total_usd,2014_funding_rounds,is_success,state_code_AK,state_code_AL,state_code_AR,state_code_AZ,state_code_CA,...,region_WI - Other,region_WV - Other,region_WY - Other,"region_Washington, D.C.",region_Wichita,"region_Wilmington - Cape Fear, North Carolina","region_Wilmington, Delaware",region_Winston-Salem,region_Worcester,region_Youngstown
0,,1,,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33600000.0,4,10600000.0,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,,1,,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6000000.0,1,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5000000.0,1,,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27215,25000000.0,1,25000000.0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
27216,34275015.0,4,34275015.0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27217,3300000.0,2,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27218,2150000.0,2,2150000.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Number of rows and columns after one-hot encoding.
df_encoded.shape

(27220, 297)

In [9]:
# Check how many successful startups there are (class balance of the target)
df_encoded['is_success'].value_counts()

0    23581
1     3639
Name: is_success, dtype: int64

### Set the target 'is_success' column to y and all other columns to X

In [10]:
# Split the encoded frame into the feature matrix X and the target y.
# Both are taken from df_encoded (previously y came from the pre-encoding df)
# so features and labels always share one source frame and cannot silently
# misalign if either frame is filtered or re-indexed later.
X = df_encoded.drop(columns='is_success')
y = df_encoded.loc[:, ['is_success']]  # kept as a single-column frame, as before
print(f'Shape of X : {X.shape}')
print(f'Shape of y : {y.shape}')


Shape of X : (27220, 296)
Shape of y : (27220, 1)


#### Define a helper function to run cross-validation

In [11]:
def cross_val(classifier, num_splits=10, scoring=None):
    """Cross-validate ``classifier`` on the notebook-level ``X`` and ``y``.

    Prints the per-fold scores rounded to 9 decimals, a blank line, and then
    the mean score. Nothing is returned.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator handed to ``cross_val_score``.
    num_splits : int, default 10
        Forwarded to ``cross_val_score`` as ``cv``.
    scoring : str or callable, optional
        Forwarded to ``cross_val_score``. ``None`` keeps the estimator's
        default scorer, which is the previous behaviour. The target in this
        notebook is imbalanced, so a metric such as ``'roc_auc'`` or ``'f1'``
        is usually more informative than the default.
    """
    # y is built as a single-column DataFrame; scikit-learn expects a 1-D
    # target, so flatten it here. np.ravel is a no-op for a 1-D target.
    target = np.ravel(y)
    scores = cross_val_score(classifier, X, target, cv=num_splits, scoring=scoring)

    print(np.round(scores, 9))
    print()
    print(scores.mean())


### Run and evaluate the performance of the XGBClassifier

In [13]:
# Baseline run: XGBClassifier with 5 estimators, using cross_val's default number of splits.
# NOTE(review): the recorded mean score (~0.8663) equals the majority-class share
# from the value counts (23581 / 27220), so confirm the model actually beats an
# "always predict 0" baseline before drawing conclusions.
cross_val(XGBClassifier(n_estimators=5))


[0.86664217 0.8662748  0.8662748  0.86590742 0.8662748  0.8662748
 0.8662748  0.8662748  0.8662748  0.8662748 ]

0.8662747979426892


## Next Steps:
Two Options

### 1. Fine-tune the model by adding more categorical info
* There is not much more data I can use (only the category/industry of the startup, its location, and when it was founded)
* I could add investment data (this would add one more numeric value, amount_raised from investment, and 3-4 categorical values)




### 2. Use PrivCo Data

Pros:
* A lot more columns of numerical data (especially financial data)   
* More recent data   

Cons:
* Fewer rows of data (~6000)   
* Difficult to create an is_success-style column to use as the target for a classification model   
* Time: I would need to redo the same cleaning and EDA process for the new data


If I switch to PrivCo, it may be better to use a linear regression to predict the latest valuation of the startup, or another metric.