# Lead Conversion Predictive Model

In [8]:
import os

In [9]:
# Project root and the folder holding the local source modules.
# Raw strings keep every backslash literal, so the paths no longer depend on
# `\g`, `\l` and `\s` happening not to be recognised escape sequences.
# NOTE(review): these are machine-specific absolute paths; adjust per checkout.
basedir = r'C:\github\lead_convertion_predictive_model'
libdir = r'C:\github\lead_convertion_predictive_model\src'

In [10]:
# Make `libdir` the working directory before the import cell runs.
# Presumably this is what lets the local modules (data_cleaning,
# features_engineering, model_selection) be imported — confirm they live there.
os.chdir(libdir)

In [11]:
import numpy as np
from data_cleaning import DataCleaner
from features_engineering import FeatureExtractor
from model_selection import ModelSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
# Switch the working directory back to the project root after the imports.
# NOTE(review): presumably the pipeline reads its data relative to this
# directory — confirm against DataCleaner.
os.chdir(basedir)

## Data science pipeline in action

In [13]:
def run_benchmark(random_state=None):
    """Run the pipeline end to end: clean, split, featurize, fit and report.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour (a different random split on every call); pass an
        integer to get the same train/test split on every run. Only the split
        is seeded here — any randomness inside the classifiers is not
        controlled by this argument.

    Returns
    -------
    tuple
        ``(clf, X_train, X_test, y_train, y_test)`` where ``clf`` is the
        value returned by ``ModelSelector.get_all_classifiers()``, ``X_train``
        and ``X_test`` are the outputs of ``FeatureExtractor.featurize`` and
        ``y_train`` / ``y_test`` are numpy arrays built from the target column.
    """
    # read and clean the data
    data = DataCleaner().clean()

    # separate target variable (pop also removes the column from `data`)
    target = data.pop('Target')

    # train test split
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, random_state=random_state)

    # featurize data; the same extractor instance handles both splits
    featurizer = FeatureExtractor()
    X_train = featurizer.featurize(data_train)
    X_test = featurizer.featurize(data_test)

    # Convert targets to numpy arrays
    y_train = np.array(target_train)
    y_test = np.array(target_test)

    # Select model. The returned best model is not needed here because all
    # classifiers are fetched from the selector below; the call is kept for
    # its effect on the selector's state.
    # NOTE(review): the test split is handed to the selector — if it chooses
    # the best model on it, the printed scores are optimistic. Confirm.
    ms = ModelSelector()
    ms.get_best_model(X_train, X_test, y_train, y_test)

    # Print model scores
    ms.print_model_scores()

    # Print used features. Single-argument print calls produce the same text
    # under Python 2 and Python 3.
    print('')
    print('Features used: {}'.format(featurizer.get_features_names()))

    clf = ms.get_all_classifiers()
    return clf, X_train, X_test, y_train, y_test

In [16]:
# Run the pipeline once and keep the classifiers and the splits so the next
# cell can inspect individual predictions.
clf, X_train, X_test, y_train, y_test = run_benchmark()
# Mean of the test labels (the fraction of 1s when labels are 0/1), printed as
# a reference point for the scores above. The parenthesised single-argument
# form prints the same text under Python 2 and Python 3.
print(' # Baseline won ratio: {:.5f}'.format(np.mean(y_test)))

<class 'sklearn.linear_model.logistic.LogisticRegression'>
 #  Accuracy: 0.62201
 # Precision: 0.66314
 #    Recall: 0.80924
 #  F1 Score: 0.72894
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
 #  Accuracy: 0.61582
 # Precision: 0.70422
 #    Recall: 0.66951
 #  F1 Score: 0.68643
<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
 #  Accuracy: 0.66795
 # Precision: 0.67199
 #    Recall: 0.92072
 #  F1 Score: 0.77694

Features used: [u'Order.Entry.CHF', u'Customer.Industry', 'Year']
 # Baseline won ratio: 0.62806


In [17]:
# Unpack the three classifiers returned by run_benchmark().
# NOTE(review): the names assume the order logistic regression, random forest,
# gradient boosting (the order of the score printout) — confirm against
# ModelSelector.get_all_classifiers().
lr, rf, gb = clf
lr_predictions = lr.predict(X_test)
rf_predictions = rf.predict(X_test)
gb_predictions = gb.predict(X_test)

# First ten true labels, then each model's first ten predictions.
# Single-argument print calls behave the same under Python 2 and Python 3.
print(y_test[:10])
print(lr_predictions[:10])
print(rf_predictions[:10])
print(gb_predictions[:10])

# Mean predicted label per model, comparable with the baseline ratio above.
print(np.mean(lr_predictions))
print(np.mean(rf_predictions))
print(np.mean(gb_predictions))

[0 1 1 1 1 1 0 1 1 1]
[1 1 1 1 0 1 0 1 0 1]
[1 1 0 1 1 1 0 1 0 1]
[1 1 1 1 0 1 1 1 0 1]
0.76643741403
0.597111416781
0.860522696011
