## Model Build (part 1)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Dropout
import csv
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load the modelling table (path is relative to the working directory) into a DataFrame.
model_data = pd.read_csv('model_data.csv')

In [None]:
# List the available column names (bare expression, rendered as the cell output).
model_data.columns

## Feature Engineering

In [None]:
# feature engineering
# engineer a number out of stage
# First inspect how many rows fall under each distinct `stage` label.
model_data.stage.value_counts()

In [None]:
# Closed Won, Deal Signed, Invoice Sent = 1; otherwise 0

In [None]:
def translate_stage(stage):
    """Map a deal stage label to a binary outcome.

    Returns 1 when `stage` is 'Closed Won', 'Deal Signed' or 'Invoice Sent',
    and 0 for any other value.
    """
    won_stages = ('Closed Won', 'Deal Signed', 'Invoice Sent')
    return 1 if stage in won_stages else 0

In [None]:
# Build the binary target column `y` by running translate_stage over every stage label.
model_data['y'] = model_data['stage'].apply(translate_stage)

In [None]:
# feature engineering
# Predictor columns fed to the models below.
feature_columns = [
    'lat',
    'lng',
    'mobility_score',
    'carshare',
    'bikeshare',
    'ridehailing',
    'masstransit',
    'closest_ts',
    'within_one_tenth',
    'within_one_half',
    'within_one',
    # 'within_five'  # this was taken out because it caused scores to decrease
]

# Target vector and feature matrix.
y = model_data['y']
X = model_data[feature_columns]

## Split Data

In [None]:
# may need to delete outliers here, or put a max on closest_ts

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.30, random_state=14)
X_train, X_test, y_train, y_test = split

In [None]:
# calculate baseline
# Baseline accuracy = share of the most common class, i.e. the score of a model
# that always predicts the majority label. It is derived from `y` itself so it
# cannot go stale when the data changes (the class counts used to be typed in
# by hand).
class_counts = y.value_counts()
print(class_counts)
print(class_counts.max() / class_counts.sum())

## Run Models

In [None]:
# random forest  (for grid search code, see notebook 09)
# random_state pins the forest's internal randomness so the cross-validation
# scores (and the feature importances read after fitting) are repeatable
# across runs.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=3,
    random_state=14,
)
# 3-fold cross-validation on the training split only; the test split is untouched.
scores = cross_val_score(model, X_train, y_train, cv=3)
print(scores)
print(np.mean(scores))

In [None]:
# Fit on the full training split; the next cell reads model.feature_importances_.
model.fit(X_train, y_train)

In [None]:
# Pair each feature name with its importance from the fitted model.
features = pd.DataFrame(list(zip(X.columns, model.feature_importances_)),
                        columns=['feature', 'importance'])
# `sort_columns` is no longer passed: DataFrame.plot stopped accepting it
# (deprecated in pandas 1.5, removed in 2.0), and it only influenced the
# ordering of multiple plotted columns, whereas a single y column is drawn here.
features.plot(kind='bar', title='Random Forest Feature Importance',
              x='feature', y='importance', fontsize='large', legend=False)
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
# GRADIENT BOOSTING model
# max_features=6 is fewer than the number of feature columns, so each split
# considers a random subset of them; random_state is fixed to keep the
# cross-validation scores repeatable across runs.
model = GradientBoostingClassifier(
    max_features=6, max_depth=50, random_state=14)
# 3-fold cross-validation on the training split only.
scores = cross_val_score(model, X_train, y_train, cv=3)
print(scores)
print(np.mean(scores))

In [None]:
# Fit on the full training split; the next cell reads model.feature_importances_.
model.fit(X_train, y_train)

In [None]:
# Pair each feature name with its importance from the fitted model.
features = pd.DataFrame(list(zip(X.columns, model.feature_importances_)),
                        columns=['feature', 'importance'])
# `sort_columns` is no longer passed: DataFrame.plot stopped accepting it
# (deprecated in pandas 1.5, removed in 2.0), and it only influenced the
# ordering of multiple plotted columns, whereas a single y column is drawn here.
features.plot(kind='bar', title='Gradient Boost Feature Importance',
              x='feature', y='importance', fontsize='large', legend=False)
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
# ADABoost model
# random_state is fixed so that any random tie-breaking inside the base
# estimators is repeatable, matching the seeded train/test split.
model = AdaBoostClassifier(n_estimators=100, random_state=14)
# 3-fold cross-validation on the training split only.
scores = cross_val_score(model, X_train, y_train, cv=3)
print(scores)
print(np.mean(scores))

In [None]:
# Fit on the full training split; the next cell reads model.feature_importances_.
model.fit(X_train, y_train)

In [None]:
# Pair each feature name with its importance from the fitted model.
features = pd.DataFrame(list(zip(X.columns, model.feature_importances_)),
                        columns=['feature', 'importance'])
# `sort_columns` is no longer passed: DataFrame.plot stopped accepting it
# (deprecated in pandas 1.5, removed in 2.0), and it only influenced the
# ordering of multiple plotted columns, whereas a single y column is drawn here.
features.plot(kind='bar', title='AdaBoost Feature Importance',
              x='feature', y='importance', fontsize='large', legend=False)
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
# Create keras Model
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Standardise the features. The scaler is fit only to the training data and
# then applied unchanged to the test data.
# NOTE: this rebinds X_train / X_test to the scaled arrays, which is what the
# Keras fit cell that follows consumes.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

model = Sequential()

input_units = X_train.shape[1]  # number of features in training set
hidden_units = input_units  # hidden layer has the same number of nodes as input

# first layer: declares the input width and applies a ReLU-activated dense layer
model.add(Dense(hidden_units, input_dim=input_units, activation='relu'
                # uncomment this to add L2 regularization
                # (also needs `from keras import regularizers`, which is not imported above)
                #,kernel_regularizer=regularizers.l2(0.0001)
                ))


# hidden layer (try with and without)
# No input_dim here: in a Sequential model only the first layer declares the
# input size; later layers take their input from the previous layer's output.
node_reduction = 0  # set above 0 to make this layer narrower than the first
model.add(Dense(hidden_units - node_reduction, activation='tanh'
                #,kernel_regularizer=regularizers.l2(0.0001)
                ))
# model.add(Dropout(0.8))

# final layer: a single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])

In [None]:
# Run Keras model
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# validation metrics are measured on X_test / y_test rather than on a separate
# validation set -- confirm this is intended.
# batch_size=None leaves the batch size at the Keras default.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=60, batch_size=None, verbose=1)

### Model Score Summary:
1. Random Forest:   crossval score 76.5%
2. Gradient Boost:  crossval score 71.6%
3. ADABoost:        crossval score 74.2%
4. Keras:           validation score 77.2%

More model building and grid searching in notebook 09


