In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import *
from sklearn.metrics import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.learning_curve import learning_curve
import random

In [2]:
df = pd.read_csv("train.csv", header=0, delimiter=",",quoting=10)
dftest = pd.read_csv("test.csv", header=0, delimiter=",",quoting=10)

In [3]:
X = df[["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", 
                  "ROLE_DEPTNAME","ROLE_TITLE", "ROLE_FAMILY_DESC", "ROLE_FAMILY"]]
X_test = dftest[["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", 
                  "ROLE_DEPTNAME","ROLE_TITLE", "ROLE_FAMILY_DESC", "ROLE_FAMILY"]]
y= df[['ACTION']]

In [4]:
# From categorical to numerical data
enc = OneHotEncoder()
enc.fit(np.vstack((X, X_test)))
X=enc.transform(X)
X_test=enc.transform(X_test)
y = y.as_matrix()
y = np.reshape(y,[32769L,])

In [9]:
#X_train, X_cv, y_train, y_cv = train_test_split(
#            X, y, test_size=.20, random_state=42)

In [10]:
# Create classifier. C=1 seems the best option after testing
clf=LogisticRegression(C=1)

In [11]:
#Parameters for classifier
max_iter = range(50,300)
penalty = ['l2']
solver = ['liblinear', 'newton-cg', 'lbfgs', 'sag']
verbose = range(0,10)

In [12]:
param_dist = dict(max_iter=max_iter,
                 penalty=penalty, 
                 solver=solver,
                 verbose=verbose)

In [24]:
# Find optimal parameters
rand=RandomizedSearchCV(clf, param_dist, cv=10, scoring='roc_auc', n_iter=20, 
                        random_state=5, n_jobs=-1)
rand.fit(X, y)
rand.grid_scores_

[LibLinear]

[mean: 0.86579, std: 0.01150, params: {'penalty': 'l2', 'max_iter': 122, 'verbose': 5, 'solver': 'sag'},
 mean: 0.86579, std: 0.01150, params: {'penalty': 'l2', 'max_iter': 106, 'verbose': 4, 'solver': 'newton-cg'},
 mean: 0.86579, std: 0.01150, params: {'penalty': 'l2', 'max_iter': 151, 'verbose': 9, 'solver': 'sag'},
 mean: 0.86579, std: 0.01150, params: {'penalty': 'l2', 'max_iter': 297, 'verbose': 7, 'solver': 'sag'},
 mean: 0.86587, std: 0.01149, params: {'penalty': 'l2', 'max_iter': 126, 'verbose': 6, 'solver': 'liblinear'},
 mean: 0.86587, std: 0.01149, params: {'penalty': 'l2', 'max_iter': 232, 'verbose': 6, 'solver': 'liblinear'},
 mean: 0.86587, std: 0.01149, params: {'penalty': 'l2', 'max_iter': 188, 'verbose': 0, 'solver': 'liblinear'},
 mean: 0.86582, std: 0.01150, params: {'penalty': 'l2', 'max_iter': 75, 'verbose': 2, 'solver': 'sag'},
 mean: 0.86579, std: 0.01149, params: {'penalty': 'l2', 'max_iter': 68, 'verbose': 0, 'solver': 'lbfgs'},
 mean: 0.86579, std: 0.01151, p

In [25]:
print rand.best_score_
print rand.best_params_

0.865865079047
{'penalty': 'l2', 'max_iter': 126, 'verbose': 6, 'solver': 'liblinear'}


In [26]:
# Train and test classifier using 10- Fold cross validation on the classifier 
best_roc_auc = None
best_fpr = None
best_tpr = None
skf = StratifiedKFold(y, n_folds=10)
for train_index, cv_index in skf:
    X_train, X_cv = X[train_index], X[cv_index]
    y_train, y_cv = y[train_index], y[cv_index]
    clf=LogisticRegression(C=1, 
                           penalty = 'l2', 
                           max_iter = 126, 
                           verbose = 6, 
                           solver = 'liblinear')
    clf.fit(X_train, y_train)
    predictions=clf.predict_proba(X_cv)[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_cv, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    if roc_auc > best_roc_auc:
        best_roc_auc = roc_auc
        best_fpr = fpr
        best_tpr = tpr
    print 'roc_auc', roc_auc
print 'best_roc_auc', best_roc_auc

[LibLinear]roc_auc 0.879356422143
[LibLinear]roc_auc 0.86851479411
[LibLinear]roc_auc 0.883425400235
[LibLinear]roc_auc 0.862830545752
[LibLinear]roc_auc 0.848025676436
[LibLinear]roc_auc 0.855657852113
[LibLinear]roc_auc 0.855032138169
[LibLinear]roc_auc 0.858136098985
[LibLinear]roc_auc 0.880471614194
[LibLinear]roc_auc 0.867198680934
best_roc_auc 0.883425400235


In [27]:
#Predict on test set
predictions=clf.predict_proba(X_test)[:,1]

In [28]:
#Create submissionfile
def create_submissionfile(predictions, filename):
    """Save predictions in a csv file"""
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))

In [29]:
filename = raw_input("Name for submission file: ")
create_submissionfile(predictions, filename + ".csv")

Name for submission file: LogReg6


In [21]:
#Create Learning curve
train_sizes, train_scores, valid_scores = learning_curve(
LogisticRegression(C=1, penalty='l2', max_iter = 126, 
                   verbose = 6, solver = 'liblinear'), X, y, 
                   train_sizes=[200, 400, 600, 800, 1000], cv=10)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [22]:
#ROC_AUC plot, code from sklearn
plt.title('Receiver Operating Characteristic')
plt.plot(best_fpr, best_tpr, 'b',
label='AUC = %0.2f'% best_roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [23]:
#Learning curve code from sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and traning learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


title = "Learning Curves (Logistic Regression)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(X.shape[0], n_iter=100,
                                   test_size=0.2, random_state=0)

estimator = LogisticRegression(C=1, penalty='l2', max_iter = 183, 
                               verbose = 7, solver = 'liblinear')
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

plt.show()