Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

Reading Dataset

In [3]:
df = pd.read_csv('startup data.csv')

In [4]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1,...,c:6669,0,1,0,0,0,0,1.0,0,acquired


In [5]:
clmn=list(df.columns)

In [6]:
unique_values = df['status'].unique()
print(unique_values)

['acquired' 'closed']


**Data** **Preprocessing**

Handling Missing Values

In [7]:
# Drop rows with missing target or critical data
#df = df.dropna(subset=['status', 'founded_at'])
df = df.fillna(value=0)

# Fill other missing values
df['funding_total_usd'] = df['funding_total_usd'].fillna(0)


Drop unwanted Columns

In [8]:
df = df.drop(columns=['zip_code','labels','Unnamed: 0','state_code', 'id','city','Unnamed: 6', 'category_code','name','founded_at' ,'closed_at','object_id',	'first_funding_at',	'last_funding_at'	,'state_code.1'])

In [9]:
df.head(2)

Unnamed: 0,latitude,longitude,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,...,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,42.35888,-71.05682,2.2493,3.0027,4.6685,6.7041,3,3,375000,3,...,1,0,1,0,0,0,0,1.0,0,acquired
1,37.238916,-121.973718,5.126,9.9973,7.0055,7.0055,9,4,40100000,1,...,0,1,0,0,1,1,1,4.75,1,acquired


In [10]:
df.columns

Index(['latitude', 'longitude', 'age_first_funding_year',
       'age_last_funding_year', 'age_first_milestone_year',
       'age_last_milestone_year', 'relationships', 'funding_rounds',
       'funding_total_usd', 'milestones', 'is_CA', 'is_NY', 'is_MA', 'is_TX',
       'is_otherstate', 'is_software', 'is_web', 'is_mobile', 'is_enterprise',
       'is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech',
       'is_consulting', 'is_othercategory', 'has_VC', 'has_angel',
       'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
       'avg_participants', 'is_top500', 'status'],
      dtype='object')

Creating new feature

In [11]:
df['funding_velocity'] = df['funding_total_usd'] / (df['age_last_funding_year'] + 1)

Encoding target feature

In [12]:

# Label encode target
df['status'] = df['status'].map({'closed': 0, 'operating': 1, 'acquired': 1})


In [13]:
df.head(2)

Unnamed: 0,latitude,longitude,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,...,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status,funding_velocity
0,42.35888,-71.05682,2.2493,3.0027,4.6685,6.7041,3,3,375000,3,...,0,1,0,0,0,0,1.0,0,1,93686.76
1,37.238916,-121.973718,5.126,9.9973,7.0055,7.0055,9,4,40100000,1,...,1,0,0,1,1,1,4.75,1,1,3646350.0


In [14]:
df.columns

Index(['latitude', 'longitude', 'age_first_funding_year',
       'age_last_funding_year', 'age_first_milestone_year',
       'age_last_milestone_year', 'relationships', 'funding_rounds',
       'funding_total_usd', 'milestones', 'is_CA', 'is_NY', 'is_MA', 'is_TX',
       'is_otherstate', 'is_software', 'is_web', 'is_mobile', 'is_enterprise',
       'is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech',
       'is_consulting', 'is_othercategory', 'has_VC', 'has_angel',
       'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
       'avg_participants', 'is_top500', 'status', 'funding_velocity'],
      dtype='object')

In [15]:
col=['latitude'	,'longitude','age_first_funding_year',	'age_last_funding_year',	'age_first_milestone_year'	,'age_last_milestone_year'	,'relationships'	,'funding_rounds'	,'funding_total_usd',	'milestones','avg_participants','funding_velocity']

Normalizing features

In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(df[col]), columns=col)


In [17]:
normalized_df.head(1)

Unnamed: 0,latitude,longitude,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,avg_participants,funding_velocity
0,0.494494,0.367152,0.365061,0.389409,0.484841,0.432611,0.047619,0.222222,6.4e-05,0.375,0.0,0.019686


In [18]:
df[col]=normalized_df

In [19]:
df.head(1)

Unnamed: 0,latitude,longitude,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,...,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status,funding_velocity
0,0.494494,0.367152,0.365061,0.389409,0.484841,0.432611,0.047619,0.222222,6.4e-05,0.375,...,0,1,0,0,0,0,0.0,0,1,0.019686


Selecting feautres based on correlation

In [20]:
correlation_with_target = df.corr(numeric_only=True)['status'].sort_values(ascending=False)
print(correlation_with_target)

status                      1.000000
relationships               0.360434
age_last_milestone_year     0.358600
milestones                  0.328260
is_top500                   0.310652
age_first_milestone_year    0.250163
has_roundB                  0.208257
funding_rounds              0.206049
avg_participants            0.185992
has_roundA                  0.184307
has_roundC                  0.165902
has_roundD                  0.139940
is_MA                       0.081735
is_CA                       0.077217
is_enterprise               0.073772
age_last_funding_year       0.073731
is_NY                       0.059996
latitude                    0.046560
funding_velocity            0.044541
is_advertising              0.044355
funding_total_usd           0.040176
is_software                 0.012429
is_mobile                   0.007312
is_consulting               0.002373
is_biotech                  0.000104
is_web                     -0.000873
is_gamesvideo              -0.025893
l

In [21]:
selected_features = correlation_with_target[correlation_with_target.abs() > 0.1].index.tolist()
selected_clmn=set(selected_features)
print("Selected features:", selected_features)

Selected features: ['status', 'relationships', 'age_last_milestone_year', 'milestones', 'is_top500', 'age_first_milestone_year', 'has_roundB', 'funding_rounds', 'avg_participants', 'has_roundA', 'has_roundC', 'has_roundD', 'is_otherstate']


In [22]:
selected_clmn

{'age_first_milestone_year',
 'age_last_milestone_year',
 'avg_participants',
 'funding_rounds',
 'has_roundA',
 'has_roundB',
 'has_roundC',
 'has_roundD',
 'is_otherstate',
 'is_top500',
 'milestones',
 'relationships',
 'status'}

In [23]:
spearman_corr = df.corr(method='spearman', numeric_only=True)['status'].sort_values(ascending=False)
print(spearman_corr)

status                      1.000000
relationships               0.466948
age_last_milestone_year     0.410041
age_first_milestone_year    0.348102
milestones                  0.341207
is_top500                   0.310652
funding_velocity            0.260899
funding_total_usd           0.258853
funding_rounds              0.250965
avg_participants            0.226498
has_roundB                  0.208257
has_roundA                  0.184307
has_roundC                  0.165902
age_last_funding_year       0.143506
has_roundD                  0.139940
is_MA                       0.081735
is_CA                       0.077217
is_enterprise               0.073772
is_NY                       0.059996
is_advertising              0.044355
latitude                    0.041574
is_software                 0.012429
is_mobile                   0.007312
is_consulting               0.002373
is_biotech                  0.000104
is_web                     -0.000873
age_first_funding_year     -0.003527
i

In [24]:
selected_features = spearman_corr[spearman_corr.abs() > 0.1].index.tolist()
selected_clmn=selected_clmn.union(set(selected_features))
print("Selected features:", selected_features)

Selected features: ['status', 'relationships', 'age_last_milestone_year', 'age_first_milestone_year', 'milestones', 'is_top500', 'funding_velocity', 'funding_total_usd', 'funding_rounds', 'avg_participants', 'has_roundB', 'has_roundA', 'has_roundC', 'age_last_funding_year', 'has_roundD', 'is_otherstate']


In [25]:
selected_clmn

{'age_first_milestone_year',
 'age_last_funding_year',
 'age_last_milestone_year',
 'avg_participants',
 'funding_rounds',
 'funding_total_usd',
 'funding_velocity',
 'has_roundA',
 'has_roundB',
 'has_roundC',
 'has_roundD',
 'is_otherstate',
 'is_top500',
 'milestones',
 'relationships',
 'status'}

In [26]:
kendall_corr = df.corr(method='kendall', numeric_only=True)['status'].sort_values(ascending=False)
print(kendall_corr)

status                      1.000000
relationships               0.393116
age_last_milestone_year     0.340031
is_top500                   0.310652
milestones                  0.306398
age_first_milestone_year    0.290078
funding_rounds              0.228208
funding_velocity            0.213140
funding_total_usd           0.211924
has_roundB                  0.208257
avg_participants            0.194195
has_roundA                  0.184307
has_roundC                  0.165902
has_roundD                  0.139940
age_last_funding_year       0.117279
is_MA                       0.081735
is_CA                       0.077217
is_enterprise               0.073772
is_NY                       0.059996
is_advertising              0.044355
latitude                    0.034041
is_software                 0.012429
is_mobile                   0.007312
is_consulting               0.002373
is_biotech                  0.000104
is_web                     -0.000873
age_first_funding_year     -0.002888
i

In [27]:
selected_features = kendall_corr[kendall_corr.abs() > 0.1].index.tolist()
selected_clmn=selected_clmn.union(set(selected_features))
print("Selected features:", selected_features)

Selected features: ['status', 'relationships', 'age_last_milestone_year', 'is_top500', 'milestones', 'age_first_milestone_year', 'funding_rounds', 'funding_velocity', 'funding_total_usd', 'has_roundB', 'avg_participants', 'has_roundA', 'has_roundC', 'has_roundD', 'age_last_funding_year', 'is_otherstate']


In [28]:
selected_clmn

{'age_first_milestone_year',
 'age_last_funding_year',
 'age_last_milestone_year',
 'avg_participants',
 'funding_rounds',
 'funding_total_usd',
 'funding_velocity',
 'has_roundA',
 'has_roundB',
 'has_roundC',
 'has_roundD',
 'is_otherstate',
 'is_top500',
 'milestones',
 'relationships',
 'status'}

In [29]:
from sklearn.feature_selection import mutual_info_classif

X = df.drop('status', axis=1)
y = df['status']

# Encode categorical variables if needed
mi = mutual_info_classif(X, y, discrete_features='auto')
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(mi_series)


relationships               0.122586
age_last_milestone_year     0.097682
age_first_milestone_year    0.096690
avg_participants            0.068183
funding_velocity            0.066284
funding_total_usd           0.062706
funding_rounds              0.050417
age_first_funding_year      0.047984
longitude                   0.047228
is_top500                   0.046214
milestones                  0.035927
age_last_funding_year       0.034704
has_angel                   0.028357
has_roundA                  0.017862
is_mobile                   0.017456
is_MA                       0.014539
is_advertising              0.014510
is_software                 0.012229
is_othercategory            0.010258
latitude                    0.010168
is_consulting               0.009442
is_otherstate               0.005648
is_CA                       0.005468
is_enterprise               0.004573
is_TX                       0.004304
is_biotech                  0.004195
is_web                      0.003855
h

In [30]:
selected_features = mi_series[mi_series.abs() > 0.1].index.tolist()
selected_clmn=selected_clmn.union(set(selected_features))
print("Selected features:", selected_features)

Selected features: ['relationships']


In [31]:
selected_clmn

{'age_first_milestone_year',
 'age_last_funding_year',
 'age_last_milestone_year',
 'avg_participants',
 'funding_rounds',
 'funding_total_usd',
 'funding_velocity',
 'has_roundA',
 'has_roundB',
 'has_roundC',
 'has_roundD',
 'is_otherstate',
 'is_top500',
 'milestones',
 'relationships',
 'status'}

In [32]:
df.head(1)

Unnamed: 0,latitude,longitude,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,...,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status,funding_velocity
0,0.494494,0.367152,0.365061,0.389409,0.484841,0.432611,0.047619,0.222222,6.4e-05,0.375,...,0,1,0,0,0,0,0.0,0,1,0.019686


In [33]:
ndf=df[list(selected_clmn)]

In [34]:
ndf.head(1)

Unnamed: 0,funding_total_usd,is_top500,avg_participants,has_roundD,age_last_milestone_year,has_roundB,relationships,age_first_milestone_year,is_otherstate,milestones,funding_rounds,has_roundA,has_roundC,funding_velocity,age_last_funding_year,status
0,6.4e-05,0,0.0,0,0.432611,0,0.047619,0.484841,0,0.375,0.222222,0,0,0.019686,0.389409,1


Spliting data into train and test dataset

In [35]:
from sklearn.model_selection import train_test_split

X = ndf.drop('status', axis = 1)

y = ndf['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
print("Shape of the X Train :", X_train.shape)
print("Shape of the y Train :", y_train.shape)
print("Shape of the X test :", X_test.shape)
print("Shape of the y test :", y_test.shape)

Shape of the X Train : (738, 15)
Shape of the y Train : (738,)
Shape of the X test : (185, 15)
Shape of the y test : (185,)


Applying various machine learning and deep learning model

In [37]:
# Model Build
from sklearn.metrics import  classification_report
import warnings
warnings.filterwarnings('ignore')

In [38]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()


gnb.fit(X_train,y_train)

y_predicted_gnb = gnb.predict(X_test)

print("Training Accuracy :", gnb.score(X_train, y_train))
print("Testing Accuracy :", gnb.score(X_test, y_test))

cr = classification_report(y_test, y_predicted_gnb)
print(cr)


Training Accuracy : 0.5745257452574526
Testing Accuracy : 0.5351351351351351
              precision    recall  f1-score   support

           0       0.45      0.85      0.59        73
           1       0.77      0.33      0.46       112

    accuracy                           0.54       185
   macro avg       0.61      0.59      0.53       185
weighted avg       0.65      0.54      0.51       185



In [39]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()
knc.fit(X_train,y_train)

y_predicted_knc = knc.predict(X_test)

print("Training Accuracy :", knc.score(X_train, y_train))
print("Testing Accuracy :", knc.score(X_test, y_test))



cr = classification_report(y_test, y_predicted_knc)
print(cr)

Training Accuracy : 0.8035230352303523
Testing Accuracy : 0.7297297297297297
              precision    recall  f1-score   support

           0       0.71      0.53      0.61        73
           1       0.74      0.86      0.79       112

    accuracy                           0.73       185
   macro avg       0.72      0.70      0.70       185
weighted avg       0.73      0.73      0.72       185



In [40]:
from lightgbm import LGBMClassifier
lgmc = LGBMClassifier()

lgmc.fit(X_train,y_train)

y_predicted_lgmc = lgmc.predict(X_test)

print("Training Accuracy :", lgmc.score(X_train, y_train))
print("Testing Accuracy :", lgmc.score(X_test, y_test))


cr = classification_report(y_test, y_predicted_lgmc)
print(cr)

[LightGBM] [Info] Number of positive: 485, number of negative: 253
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1319
[LightGBM] [Info] Number of data points in the train set: 738, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.657182 -> initscore=0.650759
[LightGBM] [Info] Start training from score 0.650759
Training Accuracy : 1.0
Testing Accuracy : 0.745945945945946
              precision    recall  f1-score   support

           0       0.74      0.55      0.63        73
           1       0.75      0.88      0.81       112

    accuracy                           0.75       185
   macro avg       0.74      0.71      0.72       185
weighted avg       0.75      0.75      0.74       185



In [41]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)

y_predicted_svc = svc.predict(X_test)

print("Training Accuracy :", svc.score(X_train, y_train))
print("Testing Accuracy :", svc.score(X_test, y_test))

cr = classification_report(y_test, y_predicted_svc)
print(cr)

Training Accuracy : 0.7791327913279132
Testing Accuracy : 0.745945945945946
              precision    recall  f1-score   support

           0       0.80      0.48      0.60        73
           1       0.73      0.92      0.81       112

    accuracy                           0.75       185
   macro avg       0.76      0.70      0.71       185
weighted avg       0.76      0.75      0.73       185



In [42]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)

y_predicted_lr = lr.predict(X_test)

print("Training Accuracy :", lr.score(X_train, y_train))
print("Testing Accuracy :", lr.score(X_test, y_test))


cr = classification_report(y_test, y_predicted_lr)
print(cr)

Training Accuracy : 0.7682926829268293
Testing Accuracy : 0.7567567567567568
              precision    recall  f1-score   support

           0       0.79      0.52      0.63        73
           1       0.74      0.91      0.82       112

    accuracy                           0.76       185
   macro avg       0.77      0.72      0.72       185
weighted avg       0.76      0.76      0.74       185



In [49]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler





model = Sequential([
    Dense(15, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(30, activation='relu'),
    Dense(60, activation='relu'),
    Dense(30, activation='relu'),
    Dense(15, activation='relu'),
    Dense(7, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.6589 - loss: 0.6582 - val_accuracy: 0.6757 - val_loss: 0.6169
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6473 - loss: 0.6182 - val_accuracy: 0.6757 - val_loss: 0.6018
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6902 - loss: 0.5667 - val_accuracy: 0.7230 - val_loss: 0.5927
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7334 - loss: 0.5759 - val_accuracy: 0.7162 - val_loss: 0.5862
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7346 - loss: 0.5754 - val_accuracy: 0.7230 - val_loss: 0.5797
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7390 - loss: 0.5640 - val_accuracy: 0.7027 - val_loss: 0.5722
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x78e608b6d190>

In [50]:
y_predicted_ann = model.predict(X_test)
y_predicted_ann = (y_predicted_ann > 0.5).astype(int)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_predicted_ann)
cr = classification_report(y_test, y_predicted_ann)
print('accuracy:'+str(accuracy))
print(cr)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
accuracy:0.7675675675675676
              precision    recall  f1-score   support

           0       0.83      0.52      0.64        73
           1       0.75      0.93      0.83       112

    accuracy                           0.77       185
   macro avg       0.79      0.72      0.73       185
weighted avg       0.78      0.77      0.75       185



In [45]:
from google.colab import files
X_test.to_csv('X_test.csv', index=False)
files.download('X_test.csv')
from google.colab import files
y_test.to_csv('y_test.csv', index=False)
files.download('y_test.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [46]:
X_test

Unnamed: 0,funding_total_usd,is_top500,avg_participants,has_roundD,age_last_milestone_year,has_roundB,relationships,age_first_milestone_year,is_otherstate,milestones,funding_rounds,has_roundA,has_roundC,funding_velocity,age_last_funding_year
323,3.489480e-04,1,0.177780,0,0.363189,0,0.253968,0.465520,0,0.250,0.222222,0,0,0.020398,0.391623
861,1.366494e-03,1,0.066667,0,0.348838,0,0.047619,0.332463,0,0.250,0.000000,0,0,0.023473,0.374268
30,1.840179e-03,1,0.066667,0,0.363621,1,0.047619,0.480962,0,0.125,0.111111,1,0,0.025464,0.362317
837,1.401582e-03,1,0.133333,0,0.410478,0,0.031746,0.519179,0,0.125,0.000000,0,0,0.020854,0.606871
294,2.301583e-03,1,0.066667,0,0.252616,0,0.015873,0.390425,0,0.125,0.444444,1,0,0.023457,0.453073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,6.085801e-03,1,0.266667,1,0.460796,1,0.587302,0.468905,0,0.500,0.555556,1,1,0.028114,0.493803
827,7.017557e-07,0,0.000000,0,0.221061,0,0.015873,0.364689,1,0.125,0.000000,0,0,0.019530,0.333894
490,2.015618e-03,1,0.000000,0,0.499959,1,1.000000,0.426457,0,0.750,0.111111,1,0,0.029051,0.329909
753,5.243870e-04,1,0.066667,0,0.301375,0,0.095238,0.395220,0,0.250,0.000000,1,0,0.021594,0.343721


In [47]:
import pickle
with open("fundraiseprediction_model.pkl", "wb") as f:
    pickle.dump(model, f)