Importing modules

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow as tf

from sklearn.metrics import r2_score

Retrieving Data

In [3]:
data = pd.read_csv('startup_funding.csv')
#Viewing data
# data

Unnamed: 0,SNo,Date,StartupName,IndustryVertical,SubVertical,CityLocation,InvestorsName,InvestmentType,AmountInUSD,Remarks
0,0,01-08-2017,TouchKin,Technology,Predictive Care Platform,Bangalore,Kae Capital,Private Equity,1300000,
1,1,02-08-2017,Ethinos,Technology,Digital Marketing Agency,Mumbai,Triton Investment Advisors,Private Equity,,
2,2,02-08-2017,Leverage Edu,Consumer Internet,Online platform for Higher Education Services,New Delhi,"Kashyap Deorah, Anand Sankeshwar, Deepak Jain,...",Seed Funding,,
3,3,02-08-2017,Zepo,Consumer Internet,DIY Ecommerce platform,Mumbai,"Kunal Shah, LetsVenture, Anupam Mittal, Hetal ...",Seed Funding,500000,
4,4,02-08-2017,Click2Clinic,Consumer Internet,healthcare service aggregator,Hyderabad,"Narottam Thudi, Shireesh Palle",Seed Funding,850000,
...,...,...,...,...,...,...,...,...,...,...
2367,2367,29-01-2015,Printvenue,,,,Asia Pacific Internet Group,Private Equity,4500000,
2368,2368,29-01-2015,Graphene,,,,KARSEMVEN Fund,Private Equity,825000,Govt backed VC Fund
2369,2369,30-01-2015,Mad Street Den,,,,"Exfinity Fund, GrowX Ventures.",Private Equity,1500000,
2370,2370,30-01-2015,Simplotel,,,,MakeMyTrip,Private Equity,,"Strategic Funding, Minority stake"


Pre-processing

In [17]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop ID and high-cardinality columns
    df = df.drop(['SNo', 'StartupName', 'SubVertical', 'InvestorsName'], axis=1)
    
    # Clean \\xc2\\xa0 examples
    df = df.applymap(lambda x: x.replace(r'\\xc2\\xa0', '') if type(x) == str else x)
    
    # Clean target column
    df['AmountInUSD'] = df['AmountInUSD'].apply(lambda x: x.replace(',', '') if str(x) != 'nan' else x)
    df['AmountInUSD'] = df['AmountInUSD'].replace({
        'undisclosed': np.NaN,
        'unknown': np.NaN,
        'Undisclosed': np.NaN,
        'N/A': np.NaN,
        '14342000+': '14342000'
    })
    
    # Drop missing target rows
    missing_target_rows = df[df['AmountInUSD'].isna()].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    # Drop columns with more than 25% missing values
    df = df.drop('Remarks', axis=1)
    
    # Fill categorical missing values with most frequent occurence
    for column in ['IndustryVertical', 'CityLocation', 'InvestmentType']:
        df[column] = df[column].fillna(df[column].mode()[0])
    
    # Clean date column
    df['Date'] = df['Date'].replace({
        '05/072018': '05/07/2018',
        '01/07/015': '01/07/2015',
        '22/01//2015': '22/01/2015'
    })
    
    # Extract date features
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].apply(lambda x: x.year)
    df['Month'] = df['Date'].apply(lambda x: x.month)
    df['Day'] = df['Date'].apply(lambda x: x.day)
    df = df.drop('Date', axis=1)
    
    # Convert target column to float
    df['AmountInUSD'] = df['AmountInUSD'].astype(float)
    
    # Split df into X and y
    y = df['AmountInUSD']
    X = df.drop('AmountInUSD', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    return X_train, X_test, y_train, y_test

In [18]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [19]:
# Viewing train set X
# X_train

Unnamed: 0,IndustryVertical,CityLocation,InvestmentType,Year,Month,Day
1453,Consumer Internet,Bangalore,Private Equity,2015,5,2
275,Technology,Pune,Private Equity,2017,1,25
1130,mobile payment software platform,Bangalore,Private Equity,2015,10,8
1153,"Hyperlocal Maintenance, Repair & Cleaning serv...",Mumbai,Private Equity,2015,8,19
1172,CRM / Analytics platform,Chennai,Seed Funding,2015,1,7
...,...,...,...,...,...,...
715,Consumer Internet,Bangalore,Seed Funding,2016,7,3
905,Virtual Health consultation app,Mumbai,Private Equity,2015,12,16
1096,Employee OnBoarding & Orientation platform,Noida,Seed Funding,2015,9,24
235,Consumer Internet,Gurgaon,Seed Funding,2017,2,18


In [20]:
# Viewing train set Y
# y_train

1453    18000000.0
275      4000000.0
1130     1000000.0
1153     2400000.0
1172     1000000.0
           ...    
715       500000.0
905      1200000.0
1096      400000.0
235        40000.0
1061      770000.0
Name: AmountInUSD, Length: 1067, dtype: float64

Building Pipeline

In [34]:
def build_model():
    inputs = tf.keras.Input(shape=(422,))
    x = tf.keras.layers.Dense(128, activation='relu')(inputs)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    outputs = tf.keras.layers.Dense(1, activation='linear')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer='adam',
        loss='mse'
    )
    
    return model

In [None]:
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, ['IndustryVertical', 'CityLocation', 'InvestmentType'])
], remainder='passthrough')

regressor = tf.keras.wrappers.scikit_learn.KerasRegressor(build_model)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', regressor)
])

Training

In [36]:
model.fit(
    X_train,
    y_train,
    regressor__validation_split=0.2,
    regressor__batch_size=32,
    regressor__epochs=100,
    regressor__callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=1,
            restore_best_weights=True
        )
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Results

In [37]:
y_pred = model.predict(X_test)

rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("     Test RMSE: {:.2f}".format(rmse))

r2 = r2_score(y_test, y_pred)
print("Test R^2 Score: {:.5f}".format(r2))

     Test RMSE: 72405476.35
Test R^2 Score: -0.01587
