In [104]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Dropout
import csv
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [105]:
model_data = pd.read_csv('model_data.csv')

In [106]:
model_data.columns

Index(['id', 'name', 'close_date', 'stage', 'address', 'lat', 'lng',
       'ts_response', 'mobility_score', 'carshare', 'bikeshare', 'ridehailing',
       'masstransit', 'lat_lng', 'closest_ts', 'within_one_tenth',
       'within_one_half', 'within_one', 'within_five'],
      dtype='object')

In [107]:
#feature engineering
#engineer a number out of stage
model_data.stage.value_counts()

Lost                 926
Closed Won           295
Deal Signed           97
Contract Sent         75
Presentation          65
Invoice Sent          64
Proposal Sent         62
Qualified Lead        60
Research/Donation     27
Contract Expired       8
Contract Review        3
Pilot                  1
Name: stage, dtype: int64

In [None]:
#Closed, Deal Signed, Invoice Sent = 1; otherwise 0

In [108]:
def translate_stage(stage):
    if stage in ['Closed Won', 'Deal Signed', 'Invoice Sent']:
        return(1)
    else:
        return (0)

In [109]:
model_data['y'] = model_data['stage'].apply(translate_stage)

In [195]:
#feature engineering
y = model_data['y']
X = model_data[['lat',
                'lng',
                'mobility_score',
                'carshare',
                'bikeshare',
                'ridehailing',
                'masstransit',
                'closest_ts',
                'within_one_tenth',
                'within_one_half',
                'within_one',
                #'within_five'
                ]]

In [None]:
#may need to delete outliers here, or put a max on closest_ts

In [196]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=14)

In [197]:
#calculate baseline
print (y.value_counts())
print (1 - (456 / (1227+456)))

0    1227
1     456
Name: y, dtype: int64
0.7290552584670231


In [209]:
#random forest
model = RandomForestClassifier(max_features = 6, max_depth = 20) 
scores = cross_val_score(model, X_train, y_train, cv=3)
print(scores)
print(np.mean(scores))

[0.72264631 0.73536896 0.71173469]
0.7232499870177079


In [212]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [213]:
model.score(X_train, y_train)

0.9422750424448217

In [214]:
model.score(X_test, y_test)

0.7465346534653465

In [218]:
#######GRADIENT BOOSTING model
model = GradientBoostingClassifier(max_features = 6, max_depth = 50) 
scores = cross_val_score(model, X_train, y_train, cv=3)
print(scores)
print(np.mean(scores))

[0.70483461 0.72264631 0.71683673]
0.7147725502414707


In [200]:
####### ADABoost model
model = AdaBoostClassifier(n_estimators=100) 
scores = cross_val_score(model, X_train, y_train, cv=3)
print(scores)
print(np.mean(scores))

[0.72773537 0.74300254 0.75510204]
0.7419466514341105


In [201]:
#Create keras Model
#X_train, X_test, y_train, y_test = train_test_split(Xtr, ytr, test_size=0.30, random_state=11)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  #the scaler is fit only to the training data
X_test = ss.transform(X_test)

model = Sequential()

input_units = X_train.shape[1] #number of features in training set
hidden_units = input_units   #hidden layer has the same number of nodes as input

#first input layer
model.add(Dense(hidden_units            
                ,input_dim=input_units  
                ,activation='relu'
                #uncomment this to add L2 regularization
                #,kernel_regularizer=regularizers.l2(0.0001) 
               ))


#hidden layer (try with and without)
node_reduction = 0
model.add(Dense(hidden_units - node_reduction          
                ,input_dim=input_units  
                ,activation='tanh'
                #,kernel_regularizer=regularizers.l2(0.0001) 
               ))
#model.add(Dropout(0.8))

#final layer
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy'
              ,optimizer='adam'
               #added later 
              ,metrics=['binary_accuracy']
             )

(1178, 11) (1178,)
(505, 11) (505,)


In [202]:
#Run Keras model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
               epochs=60, batch_size=None, verbose=1)

Train on 1178 samples, validate on 505 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60


Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


### Model Score Summary:
Random Forest:   crossval score 72.3%
Gradient Boost:  crossval score 71.3%
ADABoost:        crossval score 74.2%
Keras:           validation score 77.2%


