In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pylab as plt
import xgboost as xgb


%matplotlib inline
plt.style.use('ggplot')

In [3]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv").drop("Unnamed: 0", axis=1)
train_target = pd.read_csv("train_target.csv")
print("Shapes of data: train_data - {}, test_data - {}, train_target - {}".format(train_data.shape, test_data.shape, train_target.shape))
print("Proportion train/test: ", int(train_data.shape[0]/test_data.shape[0]))

Shapes of data: train_data - (27595, 20), test_data - (13593, 20), train_target - (27595, 1)
Proportion train/test:  2


In [None]:
yes_indexes = train_target[train_target["1"]==1].index
no_indexes = train_target[train_target["1"]==0].index

In [None]:
new_train_yes = train_data.iloc[yes_indexes]
new_train_no = train_data.iloc[no_indexes][:int(1.5*len(new_train_yes))]
us_train = pd.concat([new_train_yes, new_train_no], axis=0).reset_index(drop=True)

new_train_target_yes = train_target.iloc[new_train_yes.index]
new_train_target_no = train_target.iloc[new_train_no.index]
us_target = pd.concat([new_train_target_yes, new_train_target_no], axis=0).reset_index(drop=True)
us_target.columns=["target"]

In [4]:
seed = 4767

### Preprocessing

In [5]:
from pandas import get_dummies
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [6]:
def preprocess_target(target):
    return np.array(train_target).reshape(train_target.shape[0],)

def preprocess(data):
    
    for col_to in ['housing', 'default', 'loan'] :
        data[col_to] = data[col_to].map(lambda x: 1 if x == 'yes' else 0)

    categorical_cols = ["job", "marital", "education", "contact", "month", "day_of_week", "poutcome"]
    cat_data = pd.get_dummies(data[categorical_cols])
    
    num_data = data.drop(categorical_cols, axis=1)
    num_data = pd.DataFrame(MinMaxScaler().fit_transform(num_data), columns = num_data.columns)
    
    return pd.concat([cat_data, num_data], axis=1).drop(['previous'], axis = 1)

### Undersampling

In [7]:
from imblearn.under_sampling import RandomUnderSampler

In [8]:
def undersample(data, target):
    random_us = RandomUnderSampler(random_state=seed)
    data_, target_ = random_us.fit_sample(data, target['target'].values.ravel())
    return pd.DataFrame(data_, columns=data.columns), pd.DataFrame(target_, columns=target.columns)

### kNN

In [9]:
from sklearn.cluster import KMeans

In [10]:
def get_kMeans_features(data, k_range):
    kMeans_meta_features = pd.DataFrame()

    for i in k_range:
        clr = KMeans(n_clusters=i, verbose=100, n_jobs=2)
        clr.fit(data)
        kMeans_meta_features[str(i)+"Means"] = [str(i) for i in clr.labels_]
    return pd.get_dummies(kMeans_meta_features)

### t-SNE

In [82]:
from MulticoreTSNE import MulticoreTSNE as TSNE

In [83]:
def get_tsne(data, labels):
    tsne = TSNE(n_jobs=15)
    train_tsne = tsne.fit_transform(np.array(data))
    plt.figure(figsize=(16, 9))
    plt.scatter(train_tsne[:,0], train_tsne[:,1], c = np.array(labels))
    ax = plt.axis('off')

In [87]:
fh_data = get_feature_hash(scaled_train)

In [88]:
get_tsne(fh_data, train_target.values)

AssertionError: X should be 2D array.

### FS with RF

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [11]:
def get_features_rf(data, target, select_model):
    for_seleсt = select_model.fit(data, target)
    model = SelectFromModel(for_seleсt, prefit=True)
    selected_train = model.transform(data)
    print("Old shape: {}, new shape: {}".format(data.shape, selected_train.shape))
    return selected_train

### train data

In [35]:
scaled_train = preprocess(train_data)
test_data = preprocess(test_data)

### SVC

In [12]:
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC, OneClassSVM, LinearSVC
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [13]:
from matplotlib.colors import Normalize

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

In [14]:
def train_cv_lsvm(data, target, folds=5):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state =442, stratify=target)
    
    C_range = np.logspace(-2, 5, 3)
    gamma_range = np.logspace(-9, 3, 5)
    svm_grid = {'C': C_range, 
                'gamma': gamma_range}
    
    svm_grid= GridSearchCV(SVC(kernel='rbf', probability=True), param_grid=svm_grid, cv=folds, n_jobs=4, verbose=2)
    svm_grid.fit(X_train, y_train)
    
    predictions = svm_grid.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    
    for params, mean_score, scores in svm_grid.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    

    scores = [x[1] for x in svm_grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    
    plt.figure(figsize = (9,9))
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot, norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('$\gamma$', fontsize="xx-large")
    plt.ylabel('$C$', rotation=0, fontsize="xx-large")
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=0)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.show()
    
def lsvm(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state =442, stratify=target)
    
    svm = LinearSVC(C=15,class_weight='balanced')
    svm.fit(X_train, y_train)
    
    predictions = svm.predict(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions, pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))

### xgb

In [15]:
def xgb_best_params(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=442, stratify=target)
    
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    
    gbm.fit(X_train, y_train, eval_metric ='auc')
    
    predictions = gbm.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

In [16]:
def xgb_grid_search(data, target, skf=True):
    
    if skf:
        skf = StratifiedKFold(np.array(target), n_folds=5, random_state=seed)
        for train_index, test_index in skf:
            X_train, X_test = data[train_index], data[test_index]
            y_train, y_test = target[train_index], target[test_index]
    else: 
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    gbm_params = {   
        "n_estimators":[]
        
    }
    print("GridSeachCV proceeding...")
    
    
    gbm_gs = GridSearchCV(xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15), 
                          gbm_params, n_jobs=15, cv=5, verbose=2)
    
    gbm_gs.fit(X_train, y_train)
    print("Done.")
    
    """skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm_gs.best_estimator_, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))"""
    
    for params, mean_score, scores in gbm_gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
        
    print(101*"="+ "\nBEST PARAMETERS: ", gbm_gs.best_params_, "\n"+101*"="+"\n")
    
    predictions = gbm_gs.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [None]:
train_with_knn = pd.concat([scaled_train, knn_meta], axis = 1)

In [None]:
fs_train = get_features_rf(scaled_train, train_target, RandomForestClassifier())

In [None]:
xgb_best_params(np.array(train_with_knn), np.array(train_target).ravel())

In [None]:
xgb_best_params(np.array(fs_train), np.array(train_target).ravel())

In [23]:
xgb_best_params(np.array(scaled_train), np.array(train_target).ravel())

AUC-ROC on test: 0.9513
fold #0: 0.952265
fold #1: 0.948057
fold #2: 0.942338
fold #3: 0.947032
fold #4: 0.953863
mean AUC-ROC on cv : 94.8711% (0.4075%)


In [None]:
xgb_grid_search(np.array(scaled_train), np.array(train_target).ravel())

In [None]:
xgb_grid_search(np.array(scaled_train), np.array(train_target).ravel())

### AdaBoostClf

In [46]:
from sklearn.ensemble import AdaBoostClassifier

In [18]:
def abclf(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    gbm = AdaBoostClassifier(learning_rate=0.3, algorithm='SAMME.R')

    gbm.fit(X_train, y_train)
    
    predictions = gbm.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))
    

In [35]:
abclf(np.array(scaled_train), np.array(train_target).ravel())

AUC-ROC on test: 0.9384
fold #0: 0.940324
fold #1: 0.937006
fold #2: 0.933414
fold #3: 0.933394
fold #4: 0.939542
mean AUC-ROC on cv : 93.6736% (0.2934%)


### SVC

In [19]:
from sklearn.svm import SVC


In [20]:
def svcclf(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    gbm = SVC(C=0.029, kernel='linear', class_weight='balanced', decision_function_shape= 'ovo', probability=True)
    
    gbm.fit(X_train, y_train)
    
    predictions = gbm.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

In [42]:
svcclf(np.array(scaled_train), np.array(train_target).ravel())

AUC-ROC on test: 0.9251
fold #0: 0.929191
fold #1: 0.920118
fold #2: 0.918789
fold #3: 0.921984
fold #4: 0.929590
mean AUC-ROC on cv : 92.3934% (0.4571%)


### Vclf

In [21]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

In [22]:
def vote_ensemble(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    ada_bclf = AdaBoostClassifier(learning_rate=0.3, algorithm='SAMME.R')
    xgb_best = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    dt = DecisionTreeClassifier(max_features='auto', criterion = 'entropy', splitter='best', class_weight='balanced', presort=True)
    
    eclf = VotingClassifier(estimators=[('xgb', xgb_best), ('adaclf', ada_bclf), ('dt', dt)], weights=[1, 1, 1], voting='soft')

    eclf.fit(X_train, y_train)
    
    predictions = eclf.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(eclf, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))


In [87]:
scaled_train__ = np.array(preprocess(train_data))

In [105]:
vote_ensemble(scaled_train__, np.array(train_target).ravel())

AUC-ROC on test: 0.9374
fold #0: 0.938870
fold #1: 0.935925
fold #2: 0.933780
fold #3: 0.937186
fold #4: 0.941877
mean AUC-ROC on cv : 93.7527% (0.2737%)


In [23]:
from sklearn.linear_model import LogisticRegression

In [92]:
def ensemble(data, target):

    np.random.seed(0)  # seed to shuffle the train set
    n_folds = 3
    verbose = True
    shuffle = False

    X, y, = np.array(data), np.array(target).ravel()
    X, x_test, y, y_test_ = train_test_split(X, y, test_size=0.2, stratify=y)
    
    skf = list(StratifiedKFold(y, n_folds))

    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)

    clfs = [gbm, CalibratedClassifierCV(gbm, method='isotonic', cv=10), 
                   SGDClassifier(penalty='l1', loss='log', verbose=1, n_jobs=15, n_iter=1000)]
    

    print("Creating train and test sets for blending.")

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((x_test.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((x_test.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            clf.fit(X_train, y_train)
            y_pred = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_pred
            dataset_blend_test_j[:, i] = clf.predict(x_test)
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)
    
    #print(dataset_blend_train)
    print("Blending")
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    
    predictions = clf.predict_proba(dataset_blend_test)
    print(predictions)
    print(predictions.shape)
    fpr, tpr, thresholds = roc_curve(y_test, predictions, pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [93]:
ensemble(np.array(scaled_train), train_target.values)


Creating train and test sets for blending.
0 XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.0, learning_rate=0.02, max_delta_step=9, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1200, nthread=15,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=0.05,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
Fold 0
Fold 1
Fold 2
1 CalibratedClassifierCV(base_estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.0, learning_rate=0.02, max_delta_step=9, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1200, nthread=15,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=0.05,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
            cv=10, method='isotonic')
Fold 0
Fold 1
Fold 2
2 SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learni

ValueError: Found arrays with inconsistent numbers of samples: [5519 7358]

In [25]:
def write_to_submission_file(predicted_labels, out_file='output1.csv',
                             target='Prediction', index_label="Id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(0, predicted_labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


In [71]:
gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
gbm.fit(np.array(scaled_train__), np.array(train_target).ravel())

predictions = gbm.predict_proba(np.array(test_data__))
write_to_submission_file(predictions[:, 1])

In [70]:
test_data__ = preprocess(test_data)
scaled_train__ = preprocess(train_data)

###  Bagging skl

In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV

In [27]:
def bag_xgb(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    
    
    calibrated_clf = CalibratedClassifierCV(gbm, method='sigmoid', cv=10)
    calibrated_clf.fit(X_train, y_train)
    y_preds = calibrated_clf.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, y_preds[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    
    results = cross_val_score(calibrated_clf, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

In [39]:
bag_xgb(np.array(scaled_train), np.array(train_target).ravel())

AUC-ROC on test: 0.9487
fold #0: 0.952121
fold #1: 0.947744
fold #2: 0.942238
fold #3: 0.947214
fold #4: 0.954539
mean AUC-ROC on cv : 94.8771% (0.4258%)


In [28]:
def bag_grid_search(data, target, skf=False):
    
    if skf:
        skf = StratifiedKFold(np.array(target), n_folds=5, random_state=seed)
        for train_index, test_index in skf:
            X_train, X_test = data[train_index], data[test_index]
            y_train, y_test = target[train_index], target[test_index]
    else: 
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, stratify=target, random_state=seed)
    
    bag_params = {   
        "max_samples":[5, 10, 25],
        "max_features": [2, 3, 6, 10]
        
    }
    print("GridSeachCV proceeding...")
    bag_gs = GridSearchCV(BaggingClassifier(xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15), n_estimators=25),
                                            bag_params, n_jobs=15, cv=5, verbose=2)
    bag_gs.fit(X_train, y_train)
    print("Done.")
    
    """skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm_gs.best_estimator_, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))"""
    
    for params, mean_score, scores in bag_gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
        
    print(101*"="+ "\nBEST PARAMETERS: ", bag_gs.best_params_, "\n"+101*"="+"\n")
    
    predictions = bag_gs.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

### Feature hashing

In [29]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier

In [86]:
def get_feature_hash(data):
    raw_X_train = [dict(row[1]) for row in data.iterrows()]
    fh = FeatureHasher(n_features = 2 ** 20)
    return fh.transform(raw_X_train)

In [70]:
xgb_best_params(X_enc_train, np.array(train_target).ravel())

AUC-ROC on test: 0.9511
fold #0: 0.952414
fold #1: 0.947345
fold #2: 0.942034
fold #3: 0.946673
fold #4: 0.954516
mean AUC-ROC on cv : 94.8597% (0.4426%)


array([[  9.95897353e-01,   4.10267152e-03],
       [  9.98713255e-01,   1.28675019e-03],
       [  9.99953508e-01,   4.64843688e-05],
       ..., 
       [  9.97954845e-01,   2.04516039e-03],
       [  4.62053955e-01,   5.37946045e-01],
       [  9.96202111e-01,   3.79788899e-03]], dtype=float32)

### sgd

In [31]:
def train_sgd(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, stratify=target, random_state = 442)
    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train, y_train)
              
    pred = sgd.predict_proba(X_test)
        
    fpr, tpr, thresholds = roc_curve(y_test, pred[:, 1], pos_label=1)
    print("AUC-ROC on test : %.4g" % auc(fpr, tpr))
              
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    
    results = cross_val_score(sgd, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))

In [85]:
train_sgd(scaled_train, np.array(train_target).ravel())

AUC-ROC on test : 0.926
fold #0: 0.927230
fold #1: 0.915521
fold #2: 0.915017
fold #3: 0.924965
fold #4: 0.924042


In [32]:
def sgd_grid_search(data, target, skf=True):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    sgd_params = { 
       # "eta0":[]
    }
    print("GridSeachCV proceeding...")
    sgd_gs = GridSearchCV(SGDClassifier(penalty='l1', alpha=0.00001, loss='log', verbose=1, n_jobs=15, l1_ratio=0.1,
                                        random_state=seed, n_iter=200, class_weight='balanced', eta0=0.1), 
                          sgd_params, n_jobs=15, cv=5, verbose=1)
    
    sgd_gs.fit(X_train, y_train)
    print("Done.")
    
    for params, mean_score, scores in sgd_gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
        
    print(101*"="+ "\nBEST PARAMETERS: ", sgd_gs.best_params_, "\n"+101*"="+"\n")
    
    predictions = sgd_gs.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [122]:
sgd_grid_search(np.array(scaled_train), np.array(train_target).ravel())

GridSeachCV proceeding...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
-- Epoch 1
Norm: 1229.63, NNZs: 50, Bias: 33.536192, T: 17660, Avg. loss: 28.678446
Total training time: 0.02 seconds.
-- Epoch 2
-- Epoch 1
Norm: 1169.95, NNZs: 49, Bias: 19.954648, T: 35320, Avg. loss: 19.950871
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 1235.68, NNZs: 54, Bias: 13.517629, T: 17661, Avg. loss: 28.241764
Total training time: 0.02 seconds.
-- Epoch 2
-- Epoch 1
Norm: 1088.70, NNZs: 55, Bias: 13.114395, T: 52980, Avg. loss: 16.233890
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 1152.87, NNZs: 51, Bias: -6.473804, T: 35322, Avg. loss: 19.513445
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 1229.68, NNZs: 52, Bias: 17.009393, T: 17661, Avg. loss: 29.438872
Total training time: 0.02 seconds.
-- Epoch 2
-- Epoch 1
Norm: 1072.02, NNZs: 45, Bias: 2.031997, T: 52983, Avg. loss: 15.820317
Norm: 1019.62, NNZs: 47, Bias: 11.475307, T: 70640, Avg. loss: 14.053114
Total traini

[Parallel(n_jobs=15)]: Done   5 out of   5 | elapsed:    3.3s finished


-- Epoch 1
Norm: 1274.03, NNZs: 50, Bias: 23.536003, T: 22076, Avg. loss: 26.940176
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 1161.39, NNZs: 45, Bias: 11.968172, T: 44152, Avg. loss: 18.498645
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 1065.11, NNZs: 46, Bias: 9.055951, T: 66228, Avg. loss: 15.034741
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 989.60, NNZs: 48, Bias: 7.366394, T: 88304, Avg. loss: 12.979576
Total training time: 0.04 seconds.
-- Epoch 5
Norm: 926.94, NNZs: 43, Bias: 5.391675, T: 110380, Avg. loss: 11.589809
Total training time: 0.05 seconds.
-- Epoch 6
Norm: 876.90, NNZs: 46, Bias: 0.857440, T: 132456, Avg. loss: 10.551487
Total training time: 0.06 seconds.
-- Epoch 7
Norm: 834.68, NNZs: 40, Bias: 4.447990, T: 154532, Avg. loss: 9.721561
Total training time: 0.07 seconds.
-- Epoch 8
Norm: 799.06, NNZs: 38, Bias: 2.230651, T: 176608, Avg. loss: 9.059549
Total training time: 0.09 seconds.
-- Epoch 9
Norm: 767.95, NNZs: 42, Bias: 2.207937, T: 198

In [123]:
sgd_grid_search(X_enc_train, np.array(train_target).ravel())

GridSeachCV proceeding...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
-- Epoch 1
-- Epoch 1
Norm: 1217.70, NNZs: 51, Bias: 0.375560, T: 17660, Avg. loss: 26.858671
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 1169.04, NNZs: 53, Bias: -0.112583, T: 17661, Avg. loss: 25.139006
Total training time: 0.02 seconds.
-- Epoch 1
-- Epoch 2
-- Epoch 1
-- Epoch 1
Norm: 1151.62, NNZs: 52, Bias: 0.154923, T: 35320, Avg. loss: 18.626987
Total training time: 0.05 seconds.
-- Epoch 3
Norm: 1162.21, NNZs: 54, Bias: 0.233055, T: 17661, Avg. loss: 26.082501
Total training time: 0.02 seconds.
Norm: 1085.90, NNZs: 53, Bias: 0.025171, T: 35322, Avg. loss: 17.616682
Total training time: 0.04 seconds.
Norm: 1177.82, NNZs: 54, Bias: 0.121801, T: 17661, Avg. loss: 27.604291
Total training time: 0.02 seconds.
-- Epoch 2
-- Epoch 3
Norm: 1140.42, NNZs: 54, Bias: 0.171922, T: 17661, Avg. loss: 26.277490
-- Epoch 2
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 1066.88, NNZs: 50, Bias: 0.

[Parallel(n_jobs=15)]: Done   5 out of   5 | elapsed:    4.9s finished


-- Epoch 1
Norm: 1208.17, NNZs: 53, Bias: 0.562659, T: 22076, Avg. loss: 24.410646
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 1092.57, NNZs: 50, Bias: 0.278286, T: 44152, Avg. loss: 17.072055
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 993.10, NNZs: 48, Bias: 0.292836, T: 66228, Avg. loss: 13.941772
Total training time: 0.05 seconds.
-- Epoch 4
Norm: 915.39, NNZs: 52, Bias: 0.230764, T: 88304, Avg. loss: 12.040762
Total training time: 0.07 seconds.
-- Epoch 5
Norm: 851.44, NNZs: 45, Bias: 0.214302, T: 110380, Avg. loss: 10.739532
Total training time: 0.08 seconds.
-- Epoch 6
Norm: 800.40, NNZs: 49, Bias: 0.155129, T: 132456, Avg. loss: 9.768574
Total training time: 0.10 seconds.
-- Epoch 7
Norm: 757.89, NNZs: 41, Bias: 0.171619, T: 154532, Avg. loss: 8.990158
Total training time: 0.12 seconds.
-- Epoch 8
Norm: 722.07, NNZs: 43, Bias: 0.141762, T: 176608, Avg. loss: 8.369027
Total training time: 0.13 seconds.
-- Epoch 9
Norm: 690.81, NNZs: 42, Bias: 0.110347, T: 198684,

In [None]:
sgd = SGDClassifier(penalty='l1', alpha=0.00001, loss='log', verbose=1, n_jobs=15, l1_ratio=0.1,
                                        random_state=seed, n_iter=10, class_weight='balanced', eta0=0.1)

sgd.fit(np.array(scaled_train), np.array(train_target).ravel())

predictions = sgd.predict_proba(np.array(test_data__))
write_to_submission_file(predictions[:, 1])

### Weights

In [33]:
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import log_loss

In [75]:
def get_weights(data, target, models):

    ### we need a test set that we didn't train on to find the best weights for combining the classifiers
    sss = StratifiedShuffleSplit(target, test_size=0.2, random_state=1234)
    for train_index, test_index in sss:
        break

    train_x, train_y = data.values[train_index], data.values[train_index]
    test_x, test_y = data.values[test_index], data.values[test_index]

    ### building the classifiers
    clfs = []
    for cur_model in models:
        
        model = cur_model.fit(train_x, train_y)
        print(str(cur_model)[:4]+'LogLoss {score}'.format(score=log_loss(test_y, cur_model.predict_proba(test_x))))
        clfs.append(model)


    ### finding the optimum weights

    predictions = []
    pred_df = pd.DataFrame()
    for clf in clfs:
        preds = clf.predict_proba(test_x)
        predictions.append(preds)
        fpr, tpr, thresholds = roc_curve(test_y, preds[:, 1], pos_label=1)
        print("AUC : %.4g" % auc(fpr, tpr))
    
        
        
        

    def log_loss_func(weights):
        ''' scipy minimize will pass the weights as a numpy array '''
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
                final_prediction += weight*prediction

        return log_loss(test_y, final_prediction)
    
    """corr_data = pred_df.corr()

    plt.figure(figsize = (16, 16))
    plt.imshow(corr_data,cmap='seismic',interpolation='none',vmin=-1,vmax=1)
    plt.colorbar()
    ax = plt.xticks(range(len(corr_data)),corr_data.columns, rotation=70)
    ax = plt.yticks(range(len(corr_data)),corr_data.columns)
"""
    #the algorithms need a starting value, right not we chose 0.5 for all weights
    #its better to choose many random starting points and run minimize a few times
    starting_values = [0.5]*len(predictions)

    #adding constraints  and a different solver as suggested by user 16universe
    #https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    #our weights are bound between 0 and 1
    bounds = [(0,1)]*len(predictions)

    res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

    print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
    print('Best Weights: {weights}'.format(weights=res['x']))

In [76]:
gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)

best_models = [CalibratedClassifierCV(gbm, method='isotonic', cv=10),
              SGDClassifier(penalty='l1', loss='log', verbose=1, n_jobs=15, n_iter=1000)]

get_weights(scaled_train, train_target.values, best_models)

  y = column_or_1d(y, warn=True)


CaliLogLoss 0.16791837166512824
-- Epoch 1
Norm: 130.71, NNZs: 20, Bias: -2.272459, T: 22076, Avg. loss: 0.962789
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 128.20, NNZs: 24, Bias: -2.285212, T: 44152, Avg. loss: 0.628441
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 127.52, NNZs: 21, Bias: -2.126146, T: 66228, Avg. loss: 0.500033
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 127.39, NNZs: 29, Bias: -1.875103, T: 88304, Avg. loss: 0.432051
Total training time: 0.05 seconds.
-- Epoch 5
Norm: 127.39, NNZs: 32, Bias: -1.822514, T: 110380, Avg. loss: 0.390329
Total training time: 0.06 seconds.
-- Epoch 6
Norm: 127.45, NNZs: 29, Bias: -1.828705, T: 132456, Avg. loss: 0.361924
Total training time: 0.07 seconds.
-- Epoch 7
Norm: 127.52, NNZs: 27, Bias: -1.810243, T: 154532, Avg. loss: 0.341188
Total training time: 0.08 seconds.
-- Epoch 8
Norm: 127.59, NNZs: 30, Bias: -1.544189, T: 176608, Avg. loss: 0.325555
Total training time: 0.10 seconds.
-- Epoch 9
Norm: 127.65, NNZ

  y = column_or_1d(y, warn=True)


Norm: 127.97, NNZs: 31, Bias: -1.545755, T: 375292, Avg. loss: 0.266541
Total training time: 0.20 seconds.
-- Epoch 18
Norm: 128.00, NNZs: 32, Bias: -1.562225, T: 397368, Avg. loss: 0.263577
Total training time: 0.21 seconds.
-- Epoch 19
Norm: 128.03, NNZs: 32, Bias: -1.513975, T: 419444, Avg. loss: 0.260880
Total training time: 0.23 seconds.
-- Epoch 20
Norm: 128.05, NNZs: 30, Bias: -1.533959, T: 441520, Avg. loss: 0.258476
Total training time: 0.24 seconds.
-- Epoch 21
Norm: 128.07, NNZs: 31, Bias: -1.449170, T: 463596, Avg. loss: 0.256290
Total training time: 0.25 seconds.
-- Epoch 22
Norm: 128.09, NNZs: 32, Bias: -1.619371, T: 485672, Avg. loss: 0.254289
Total training time: 0.26 seconds.
-- Epoch 23
Norm: 128.11, NNZs: 35, Bias: -1.549182, T: 507748, Avg. loss: 0.252464
Total training time: 0.27 seconds.
-- Epoch 24
Norm: 128.13, NNZs: 37, Bias: -1.510343, T: 529824, Avg. loss: 0.250779
Total training time: 0.28 seconds.
-- Epoch 25
Norm: 128.15, NNZs: 34, Bias: -1.486824, T: 5519

In [78]:
def stack_gen(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    
    sss_ = StratifiedShuffleSplit(y_train, test_size=0.2, random_state=1234)
    for train_index, test_index in sss_:
        break
    
    train1, target1 = X_train.values[train_index], y_train.values[train_index]
    train2, target2 = X_train.values[test_index], y_train.values[test_index]
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    
    gbm.fit(train1, target1)
    preds_train1 = gbm.predict_proba(train2)
    
    gbm.fit(train2, target2)
    preds_train2 = gbm.predict_proba(train1)
    
    gbm.fit(X_train, y_train)
    preds_train = gbm.predict_proba(X_test)
    
    