## 0. Load the data set into Pandas for a first look 

In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/leads.csv',sep = ',',index_col=0)

In [5]:
df.head(10)

Unnamed: 0,conversionIndex,hits,sessions,timeSpan,freq,pageView,timeSite,recency
1,converted,115,15,3487.430833,0.004301,86,9248.227,510587.574
2,converted,747,65,2420.979916,0.026849,631,41107.365,254901.9
3,converted,57,2,33.217291,0.06021,39,1744.337,117837.909
4,converted,772,56,3278.984562,0.017078,527,35860.027,302265.781
5,converted,217,8,1232.168893,0.006493,114,6476.086,11397.767
6,converted,260,10,1526.218268,0.006552,177,13556.461,650750.146
7,converted,831,77,1913.524176,0.04024,652,35857.966999,101816.0
8,converted,1089,43,1639.570187,0.026226,477,46121.375,6816.118
9,converted,51,2,2.057013,0.972284,39,5459.559,1945.688
10,converted,467,45,1777.158351,0.025321,334,20272.104,9.0


In [12]:
df['conversionIndex'].value_counts()

active       10399
converted     3456
Name: conversionIndex, dtype: int64

In [13]:
df.describe()

Unnamed: 0,hits,sessions,timeSpan,freq,pageView,timeSite,recency
count,13855.0,13855.0,13855.0,13855.0,13855.0,13855.0,13855.0
mean,85.155612,6.8441,380.126689,1505.234302,58.261494,4960.29,998109.4
std,248.171981,15.92488,733.481523,18510.832201,156.650294,26204.51,665906.9
min,2.0,1.0,1e-06,0.000876,1.0,0.0,0.0
25%,6.0,1.0,0.067416,0.017889,4.0,166.789,582921.3
50%,16.0,2.0,1.729795,1.146751,10.0,617.831,1294587.0
75%,67.0,5.0,428.799943,16.101691,42.0,3558.329,1295830.0
max,13046.0,440.0,3845.714025,719983.520885,6371.0,1979672.0,12050260.0


In [17]:
Y = df.conversionIndex.apply(lambda x:1 if x == 'converted' else 0)
X = df.ix[:,1:7]

## 1.Build a logistic regression model to predict lead's converstion
###  a. Randomly split the 70% data into training and 30% to testing

In [20]:
rng = np.random.RandomState(42)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=rng)

### b. Train a logistic regression model on the training set 

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

clf = LogisticRegression(C=1e10)   #de-regularize since sklearn uses L2 regularization by default
pipe = Pipeline(steps=[('logistic',clf)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('logistic', LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

### Test on the testing data set and generate F-1 score

In [106]:
y_preds = pipe.predict(X_test)

In [107]:
print 'F-1 Score: %.4f' % f1_score(y_test,y_preds)

F-1 Score: 0.8711


### c. Repeat 5 times

In [108]:
for i in range(0,5):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    pipe.fit(X_train,y_train)
    y_preds = pipe.predict(X_test)
    print 'F-1 Score: %.4f' % f1_score(y_test,y_preds)

F-1 Score: 0.8733
F-1 Score: 0.8737
F-1 Score: 0.8610
F-1 Score: 0.8620
F-1 Score: 0.8677


### d. Use 5-fold cross-validation 

In [109]:
from sklearn.model_selection import KFold
from sklearn.cross_validation import cross_val_score,cross_val_predict

# scores = cross_val_score(pipe,X_train, y_train,cv=5)
# kf = KFold(n_splits = 5)

for k, (train_index,test_index) in enumerate(kf.split(X_train,y_train)):
    clf.fit(X_train.iloc[train_index,:],y_train.iloc[train_index])
    y_preds_kfold = pipe.predict(X_train.iloc[test_index,:])
    print("fold %s F-1 score:%.4f" % (k,f1_score(y_train.iloc[test_index],y_preds_kfold)))
    if k == 4:
        y_preds_test =  clf.predict(X_test)
        print("fold %s F-1 score for Test:%.4f" % (k,f1_score(y_test,y_preds_test)))
   

fold 0 F-1 score:0.8678
fold 1 F-1 score:0.8691
fold 2 F-1 score:0.8707
fold 3 F-1 score:0.8624
fold 4 F-1 score:0.8733
fold 4 F-1 score for Test:0.8751


Use 5-fold cross-validation on the training data set to select the best model to be models using Fold 4. Tested on the testing data set to compare the F-1 scores to be 0.8698, higher than the score in (c).

## 2. Feature selection 

### a. Select features using forward step wise approach.
https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2


In [102]:
from sklearn import feature_selection

model = feature_selection.SelectKBest(score_func=feature_selection.f_regression,k=6)
results = model.fit(X_train, y_train)

print results.scores_
print results.pvalues_

[ 1703.62900196  3300.03217412  6261.21155415    21.64616029  2696.32504387
   585.22865062]
[  0.00000000e+000   0.00000000e+000   0.00000000e+000   3.32208732e-006
   0.00000000e+000   1.37077146e-125]


In [103]:
columns = X_train.columns
for col in columns:
    lm = LogisticRegression()
    lm.fit(X_train[[col]], y_train)
    print lm.score(X_train[[col]], y_train)

0.870798102702
0.882965559909
0.857496391008
0.753351206434
0.896370385647
0.922767580945


## 3. Regularization 

In [117]:
# Set regularization parameter
for i, C in enumerate((100, 1, 0.01,1e10)):
    clf_l2 = LogisticRegression(penalty="l2", C=C,tol=0.01)
    clf_l2.fit(X_train,y_train)
    coef_l2 = clf_l2.coef_.ravel()
    print("c = %.2f" % C)
    print("F-1 score with L2 penalty: %.4f" % f1_score(y_test,clf_l2.predict(X_test)))
    print("coefficients",coef_l2)

c = 100.00
score with L2 penalty: 0.7177
('coefficients', array([ -1.11801388e-02,  -3.65663152e-03,   1.17044171e-06,
        -6.99579881e-02,   1.23118727e-02,   1.82854422e-04]))
c = 1.00
score with L2 penalty: 0.7177
('coefficients', array([ -1.11802008e-02,  -3.65663647e-03,   1.17142828e-06,
        -6.99579322e-02,   1.23118364e-02,   1.82856137e-04]))
c = 0.01
score with L2 penalty: 0.7177
('coefficients', array([ -1.11863951e-02,  -3.65713116e-03,   1.27006529e-06,
        -6.99523426e-02,   1.23082110e-02,   1.83027508e-04]))
c = 10000000000.00
score with L2 penalty: 0.7177
('coefficients', array([ -1.11801382e-02,  -3.65663147e-03,   1.17043173e-06,
        -6.99579886e-02,   1.23118731e-02,   1.82854405e-04]))


### b) Lasso 

In [119]:
for i, C in enumerate((100, 1, 0.01,1e10)):
    clf_l1 = LogisticRegression(penalty="l1", C=C,tol=0.01)
    clf_l1.fit(X_train,y_train)
    coef_l1 = clf_l1.coef_.ravel()
    print("c = %.2f" % C)
    print("F-1 score with L2 penalty: %.4f" % f1_score(y_test,clf_l1.predict(X_test)))
    print "Coefficients",coef_l1

c = 100.00
score with L2 penalty: 0.8267
Coefficients [-0.00571802  0.01640109  0.00146459 -0.00012128  0.00681048  0.00035698]
c = 1.00
score with L2 penalty: 0.8369
Coefficients [ -1.63138568e-03   1.73022561e-02   1.36197449e-03  -4.89618540e-05
   2.44172994e-03   3.58914489e-04]
c = 0.01
score with L2 penalty: 0.8348
Coefficients [ -1.08808859e-02   3.74491072e-03   1.38785157e-03  -4.94104950e-05
   1.31005941e-02   3.59526209e-04]
c = 10000000000.00
score with L2 penalty: 0.8388
Coefficients [ -8.79189650e-03  -2.68879979e-02   1.58246952e-03  -8.75322976e-05
   1.63897909e-02   3.53924908e-04]


### c) Elastic Net 

In [122]:
from sklearn.linear_model import SGDClassifier

alpha = 0.1

for i,l1_ratio in enumerate((0.1,0.4,0.5,0.7,0.9)):
    enet = SGDClassifier(loss="log", penalty="elasticnet",alpha = alpha,l1_ratio = l1_ratio)
    y_pred_enet = enet.fit(X_train,y_train).predict(X_test)
    print "L1 ratio: %.2f, F-1 score with Elastic Net: %.4f" % (l1_ratio,f1_score(y_test,y_pred_enet))


L1 ratio: 0.10, F-1 score with Elastic Net: 0.5384
L1 ratio: 0.40, F-1 score with Elastic Net: 0.6685
L1 ratio: 0.50, F-1 score with Elastic Net: 0.5643
L1 ratio: 0.70, F-1 score with Elastic Net: 0.7006
L1 ratio: 0.90, F-1 score with Elastic Net: 0.5589
