In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pylab as plt
import xgboost as xgb
import sklearn


%matplotlib inline
plt.style.use('ggplot')

In [2]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv").drop("Unnamed: 0", axis=1)
train_target = pd.read_csv("train_target.csv")
print("Shapes of data: train_data - {}, test_data - {}, train_target - {}".format(train_data.shape, test_data.shape, train_target.shape))
print("Proportion train/test: ", int(train_data.shape[0]/test_data.shape[0]))

seed = 4767

Shapes of data: train_data - (27595, 20), test_data - (13593, 20), train_target - (27595, 1)
Proportion train/test:  2


In [None]:
yes_indexes = train_target[train_target["1"]==1].index
no_indexes = train_target[train_target["1"]==0].index

In [None]:
new_train_yes = train_data.iloc[yes_indexes]
new_train_no = train_data.iloc[no_indexes][:int(1.5*len(new_train_yes))]
us_train = pd.concat([new_train_yes, new_train_no], axis=0).reset_index(drop=True)

new_train_target_yes = train_target.iloc[new_train_yes.index]
new_train_target_no = train_target.iloc[new_train_no.index]
us_target = pd.concat([new_train_target_yes, new_train_target_no], axis=0).reset_index(drop=True)
us_target.columns=["target"]

### Preprocessing

In [3]:
from pandas import get_dummies
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [4]:
def preprocess_target(target):
    return np.array(train_target).reshape(train_target.shape[0],)

def preprocess(data):
    
    for col_to in ['housing', 'default', 'loan'] :
        data[col_to] = data[col_to].map(lambda x: 1 if x == 'yes' else 0)

    categorical_cols = ["job", "marital", "education", "contact", "month", "day_of_week", "poutcome"]
    cat_data = pd.get_dummies(data[categorical_cols])
    
    num_data = data.drop(categorical_cols, axis=1)
    num_data = pd.DataFrame(MinMaxScaler().fit_transform(num_data), columns = num_data.columns)
    
    return pd.concat([cat_data, num_data], axis=1).drop(['previous'], axis = 1)

### Undersampling

In [5]:
def undersample(data, target):
    random_us = RandomUnderSampler(random_state=seed)
    data_, target_ = random_us.fit_sample(data, target['target'].values.ravel())
    return pd.DataFrame(data_, columns=data.columns), pd.DataFrame(target_, columns=target.columns)

### kNN

In [6]:
def get_kMeans_features(data, k_range):
    kMeans_meta_features = pd.DataFrame()

    for i in k_range:
        clr = KMeans(n_clusters=i, verbose=100, n_jobs=2)
        clr.fit(data)
        kMeans_meta_features[str(i)+"Means"] = [str(i) for i in clr.labels_]
    return pd.get_dummies(kMeans_meta_features)

### t-SNE

In [7]:
from MulticoreTSNE import MulticoreTSNE as TSNE
from scipy.sparse import csc_matrix

In [8]:
def get_tsne(data, labels):
    tsne = TSNE(n_jobs=15)
    train_tsne = tsne.fit_transform(np.array(data))
    plt.figure(figsize=(16, 9))
    plt.scatter(train_tsne[:,0], train_tsne[:,1], c = np.array(labels))
    ax = plt.axis('off')
    return train_tsne

In [9]:
def sparse_to_df(sp_mx):

    fh_df = sp_mx.tocoo(copy=False)

    return pd.DataFrame({'index': fh_df.row, 'col': fh_df.col, 'data': fh_df.data}
                 )[['index', 'col', 'data']].sort_values(['index', 'col']
                 ).reset_index(drop=True)
    

### FS with RF

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [11]:
def get_features_rf(data, target, select_model):
    for_seleсt = select_model.fit(data, target)
    model = SelectFromModel(for_seleсt, prefit=True)
    selected_train = model.transform(data)
    print("Old shape: {}, new shape: {}".format(data.shape, selected_train.shape))
    return selected_train

5xgb
5knn
5calibr
5sgd

по 20 предикшнов на каждый

80 фичей - рфе до 15

затем 

### train data

In [12]:
scaled_train = preprocess(train_data)
test_data = preprocess(test_data)

### SVC

In [13]:
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC, OneClassSVM, LinearSVC
from sklearn.metrics import roc_curve, auc, roc_auc_score



In [14]:
from matplotlib.colors import Normalize

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

In [15]:
def train_cv_lsvm(data, target, folds=5):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state =442, stratify=target)
    
    C_range = np.logspace(-2, 5, 3)
    gamma_range = np.logspace(-9, 3, 5)
    svm_grid = {'C': C_range, 
                'gamma': gamma_range}
    
    svm_grid= GridSearchCV(SVC(kernel='rbf', probability=True), param_grid=svm_grid, cv=folds, n_jobs=4, verbose=2)
    svm_grid.fit(X_train, y_train)
    
    predictions = svm_grid.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    
    for params, mean_score, scores in svm_grid.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    

    scores = [x[1] for x in svm_grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    
    plt.figure(figsize = (9,9))
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot, norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('$\gamma$', fontsize="xx-large")
    plt.ylabel('$C$', rotation=0, fontsize="xx-large")
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=0)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.show()
    
def lsvm(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state =442, stratify=target)
    
    svm = LinearSVC(C=15,class_weight='balanced')
    svm.fit(X_train, y_train)
    
    predictions = svm.predict(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions, pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))

### xgb

In [16]:
def xgb_best_params(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=442, stratify=target)
    
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    
    gbm.fit(X_train, y_train, eval_metric ='auc')
    
    predictions = gbm.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

In [17]:
def xgb_grid_search(data, target, skf=True):
    
    if skf:
        skf = StratifiedKFold(np.array(target), n_folds=5, random_state=seed)
        for train_index, test_index in skf:
            X_train, X_test = data[train_index], data[test_index]
            y_train, y_test = target[train_index], target[test_index]
    else: 
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    gbm_params = {   
        "n_estimators":[]
        
    }
    print("GridSeachCV proceeding...")
    
    
    gbm_gs = GridSearchCV(xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15), 
                          gbm_params, n_jobs=15, cv=5, verbose=2)
    
    gbm_gs.fit(X_train, y_train)
    print("Done.")
    
    """skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm_gs.best_estimator_, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))"""
    
    for params, mean_score, scores in gbm_gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
        
    print(101*"="+ "\nBEST PARAMETERS: ", gbm_gs.best_params_, "\n"+101*"="+"\n")
    
    predictions = gbm_gs.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [None]:
train_with_knn = pd.concat([scaled_train, knn_meta], axis = 1)

In [None]:
fs_train = get_features_rf(scaled_train, train_target, RandomForestClassifier())

In [None]:
xgb_best_params(np.array(train_with_knn), np.array(train_target).ravel())

In [None]:
xgb_best_params(np.array(fs_train), np.array(train_target).ravel())

In [None]:
xgb_best_params(np.array(scaled_train), np.array(train_target).ravel())

In [None]:
xgb_grid_search(np.array(scaled_train), np.array(train_target).ravel())

In [None]:
xgb_grid_search(np.array(scaled_train), np.array(train_target).ravel())

### AdaBoostClf

In [18]:
from sklearn.ensemble import AdaBoostClassifier

In [19]:
def abclf(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    gbm = AdaBoostClassifier(learning_rate=0.3, algorithm='SAMME.R')

    gbm.fit(X_train, y_train)
    
    predictions = gbm.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))
    

In [None]:
abclf(np.array(scaled_train), np.array(train_target).ravel())

### SVC

In [20]:
from sklearn.svm import SVC


In [21]:
def svcclf(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    gbm = SVC(C=0.029, kernel='linear', class_weight='balanced', decision_function_shape= 'ovo', probability=True)
    
    gbm.fit(X_train, y_train)
    
    predictions = gbm.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

In [None]:
svcclf(np.array(scaled_train), np.array(train_target).ravel())

### Vclf

In [22]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

In [23]:
def vote_ensemble(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    ada_bclf = AdaBoostClassifier(learning_rate=0.3, algorithm='SAMME.R')
    xgb_best = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    dt = DecisionTreeClassifier(max_features='auto', criterion = 'entropy', splitter='best', class_weight='balanced', presort=True)
    
    eclf = VotingClassifier(estimators=[('xgb', xgb_best), ('adaclf', ada_bclf), ('dt', dt)], weights=[1, 1, 1], voting='soft')

    eclf.fit(X_train, y_train)
    
    predictions = eclf.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(eclf, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))


In [None]:
scaled_train__ = np.array(preprocess(train_data))

In [None]:
vote_ensemble(scaled_train__, np.array(train_target).ravel())

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
def ensemble(data, target):

    np.random.seed(0)  # seed to shuffle the train set
    n_folds = 3
    verbose = True
    shuffle = False

    X, y, = np.array(data), np.array(target).ravel()
    X, x_test, y, y_test_ = train_test_split(X, y, test_size=0.2, stratify=y)
    
    skf = list(StratifiedKFold(y, n_folds))

    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15,seed = seed*34)
    
    gbm2 = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15,seed = seed*61)

    clfs = [gbm, gbm2, CalibratedClassifierCV(gbm, method='isotonic', cv=10), 
                   SGDClassifier(penalty='l1', loss='log', verbose=1, n_jobs=15, n_iter=1000)]
    
    
    print("Creating train and test sets for blending.")

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((x_test.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((x_test.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold", i)
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            clf.fit(X_train, y_train)
            y_pred = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_pred
            dataset_blend_test_j[:, i] = clf.predict(x_test)
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)
    
    #print(dataset_blend_train)
    print("Blending")
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    
    predictions = clf.predict_proba(dataset_blend_test)
    print(predictions)
    print(predictions.shape)
    fpr, tpr, thresholds = roc_curve(y_test_, predictions[:,1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [None]:
ensemble(np.array(scaled_train), train_target.values)

In [None]:
ensemble(np.array(get_feature_hash(scaled_train)), train_target.values)

In [None]:
ensemble(np.array(get_features_rf(scaled_train)), train_target.values)

In [None]:
def write_to_submission_file(predicted_labels, out_file='output1.csv',
                             target='Prediction', index_label="Id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(0, predicted_labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


In [None]:
gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
gbm.fit(np.array(scaled_train__), np.array(train_target).ravel())

predictions = gbm.predict_proba(np.array(test_data__))
write_to_submission_file(predictions[:, 1])

In [None]:
test_data__ = preprocess(test_data)
scaled_train__ = preprocess(train_data)

###  Bagging skl

In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV

In [None]:
def bag_xgb(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2,
                                                        random_state=seed, stratify=target)
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    
    
    calibrated_clf = CalibratedClassifierCV(gbm, method='sigmoid', cv=10)
    calibrated_clf.fit(X_train, y_train)
    y_preds = calibrated_clf.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, y_preds[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    
    results = cross_val_score(calibrated_clf, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

In [None]:
bag_xgb(np.array(scaled_train), np.array(train_target).ravel())

In [None]:
def bag_grid_search(data, target, skf=False):
    
    if skf:
        skf = StratifiedKFold(np.array(target), n_folds=5, random_state=seed)
        for train_index, test_index in skf:
            X_train, X_test = data[train_index], data[test_index]
            y_train, y_test = target[train_index], target[test_index]
    else: 
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, stratify=target, random_state=seed)
    
    bag_params = {   
        "max_samples":[5, 10, 25],
        "max_features": [2, 3, 6, 10]
        
    }
    print("GridSeachCV proceeding...")
    bag_gs = GridSearchCV(BaggingClassifier(xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15), n_estimators=25),
                                            bag_params, n_jobs=15, cv=5, verbose=2)
    bag_gs.fit(X_train, y_train)
    print("Done.")
    
    """skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    results = cross_val_score(gbm_gs.best_estimator_, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))"""
    
    for params, mean_score, scores in bag_gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
        
    print(101*"="+ "\nBEST PARAMETERS: ", bag_gs.best_params_, "\n"+101*"="+"\n")
    
    predictions = bag_gs.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

### Feature hashing

In [27]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier

In [28]:
def get_feature_hash(data):
    raw_X_train = [dict(row[1]) for row in data.iterrows()]
    fh = FeatureHasher(n_features = 2 ** 20)
    return fh.transform(raw_X_train)

In [None]:
xgb_best_params(X_enc_train, np.array(train_target).ravel())

### sgd

In [29]:
def train_sgd(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, stratify=target, random_state = 442)
    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train, y_train)
              
    pred = sgd.predict_proba(X_test)
        
    fpr, tpr, thresholds = roc_curve(y_test, pred[:, 1], pos_label=1)
    print("AUC-ROC on test : %.4g" % auc(fpr, tpr))
              
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    
    results = cross_val_score(sgd, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))

In [None]:
train_sgd(scaled_train, np.array(train_target).ravel())

In [30]:
def sgd_grid_search(data, target, skf=True):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    sgd_params = { 
       # "eta0":[]
    }
    print("GridSeachCV proceeding...")
    sgd_gs = GridSearchCV(SGDClassifier(penalty='l1', alpha=0.00001, loss='log', verbose=1, n_jobs=15, l1_ratio=0.1,
                                        random_state=seed, n_iter=200, class_weight='balanced', eta0=0.1), 
                          sgd_params, n_jobs=15, cv=5, verbose=1)
    
    sgd_gs.fit(X_train, y_train)
    print("Done.")
    
    for params, mean_score, scores in sgd_gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
        
    print(101*"="+ "\nBEST PARAMETERS: ", sgd_gs.best_params_, "\n"+101*"="+"\n")
    
    predictions = sgd_gs.best_estimator_.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [None]:
sgd_grid_search(np.array(scaled_train), np.array(train_target).ravel())

In [None]:
sgd_grid_search(X_enc_train, np.array(train_target).ravel())

In [None]:
sgd = SGDClassifier(penalty='l1', alpha=0.00001, loss='log', verbose=1, n_jobs=15, l1_ratio=0.1,
                                        random_state=seed, n_iter=10, class_weight='balanced', eta0=0.1)

sgd.fit(np.array(scaled_train), np.array(train_target).ravel())

predictions = sgd.predict_proba(np.array(test_data__))
write_to_submission_file(predictions[:, 1])

### Weights

In [31]:
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import log_loss

In [None]:
def get_weights(data, target, models):

    ### we need a test set that we didn't train on to find the best weights for combining the classifiers
    sss = StratifiedShuffleSplit(target, test_size=0.2, random_state=1234)
    for train_index, test_index in sss:
        break

    train_x, train_y = data.values[train_index], target[train_index]
    test_x, test_y = data.values[test_index], target[test_index]

    ### building the classifiers
    clfs = []
    for cur_model in models:
        
        model = cur_model.fit(train_x, train_y)
        print(str(cur_model)[:4]+'LogLoss {score}'.format(score=log_loss(test_y, cur_model.predict_proba(test_x))))
        clfs.append(model)


    ### finding the optimum weights

    predictions = []
    pred_df = pd.DataFrame()
    for clf in clfs:
        preds = clf.predict_proba(test_x)
        predictions.append(preds)
        fpr, tpr, thresholds = roc_curve(test_y, preds[:, 1], pos_label=1)
        print("AUC "+ str(clf)[:4]+ ": %.4g" % auc(fpr, tpr))
    
        
        
        

    def log_loss_func(weights):
        ''' scipy minimize will pass the weights as a numpy array '''
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
                final_prediction += weight*prediction

        return log_loss(test_y, final_prediction)
    
    """corr_data = pred_df.corr()

    plt.figure(figsize = (16, 16))
    plt.imshow(corr_data,cmap='seismic',interpolation='none',vmin=-1,vmax=1)
    plt.colorbar()
    ax = plt.xticks(range(len(corr_data)),corr_data.columns, rotation=70)
    ax = plt.yticks(range(len(corr_data)),corr_data.columns)
"""
    #the algorithms need a starting value, right not we chose 0.5 for all weights
    #its better to choose many random starting points and run minimize a few times
    starting_values = [0.5]*len(predictions)

    #adding constraints  and a different solver as suggested by user 16universe
    #https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    #our weights are bound between 0 and 1
    bounds = [(0,1)]*len(predictions)

    res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

    print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
    print('Best Weights: {weights}'.format(weights=res['x']))
    w = weights=res['x']
    return w

In [None]:
gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)

best_models = [CalibratedClassifierCV(gbm, method='isotonic', cv=10),
              SGDClassifier(penalty='l1', loss='log', verbose=1, n_jobs=15, n_iter=1000)]

get_weights(scaled_train, train_target.values, best_models)

In [None]:
def stack_gen(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    
    sss_ = StratifiedShuffleSplit(y_train, test_size=0.2, random_state=1234)
    for train_index, test_index in sss_:
        break
    
    train1, target1 = X_train.values[train_index], y_train.values[train_index]
    train2, target2 = X_train.values[test_index], y_train.values[test_index]
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    
    gbm.fit(train1, target1)
    preds_train1 = gbm.predict_proba(train2)
    
    gbm.fit(train2, target2)
    preds_train2 = gbm.predict_proba(train1)
    
    gbm.fit(X_train, y_train)
    preds_train = gbm.predict_proba(X_test)
    
    

In [193]:
from sklearn.neural_network import MLPClassifier

In [None]:
def mlp(data, target):
    
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=seed)
    
    
    clf2 = MLPClassifier(alpha=0.001, hidden_layer_sizes=(400, 400, 400, 400),
                         random_state=1, activation='logistic', max_iter = 500)
   
    clf2.fit(X_train, y_train)
    print("Done.")
    
    predictions = clf2.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1], pos_label=1)
    print("AUC : %.4g" % auc(fpr, tpr))
    

In [None]:
mlp(np.array(scaled_train), train_target.values)

In [None]:
fh_data = get_feature_hash(scaled_train)


In [None]:
rf_fh_data = get_features_rf(fh_data, train_target.values, RandomForestClassifier(n_jobs=15))

In [None]:
mlp(rf_fh_data, train_target.values)

In [None]:
from scipy.stats import gmean
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedShuffleSplit

In [None]:
def print_corr_matrix(clfs, X):
    res = np.vstack([x.predict_proba(X)[:, 1] for x in clfs])
    mat = np.corrcoef(res)
    print("clf", end = "\t")
    #print("\t".join([x for x in clfs]))
    for i in range(len(clfs)):
        print(clfs[i], end = "\t")
        print("\t".join(map(lambda x: str(round(x, 3)), mat[i, :])))
    plt.pcolor(mat, cmap = plt.cm.RdBu)
    plt.show()

def averaging(data, target, test):
    
    clfs = []
    X_train, X_test, y_train, y_test = train_test_split(np.array(data), target, test_size=0.2, 
                                                        random_state=seed, stratify=target)
    """for i in [433242, 54332, 7856, 34363, 965]:
        
        cur_clf = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=16, seed=i)
        clfs.append(cur_clf)
        """
    #clfs.append(MLPClassifier(alpha=0.0001, hidden_layer_sizes=(300, 200, 400, 400, 500, 200),
                         #random_state=seed, activation='logistic', max_iter = 300))
    
    clfs.append(SVC(C=0.029, kernel='linear', class_weight='balanced', decision_function_shape= 'ovo', probability=True))
    
    gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15, seed = seed)
    
    
    clfs.append(gbm)
    #clfs.append(gbm)
    
    #clfs.append(CalibratedClassifierCV(gbm, method='sigmoid', cv=15))
    
    
    weights = get_weights(data, target, clfs)
     

    for clf in clfs:
        clf.fit(X_train, y_train)
        
    res = np.vstack([x.predict_proba(X_test)[:, 1] for x in clfs])
    y_pred1 = res.mean(axis=0)
    print("Score (amean):\t", roc_auc_score(y_test, y_pred1))
    y_pred2 = gmean(res, 0)
    print("Score (gmean):\t", roc_auc_score(y_test, y_pred2))
    
    res1 = np.vstack([x.predict_proba(test)[:, 1] for x in clfs])
    preds = gmean(res1, 0)
    
    
    res = np.vstack([cur_w*cur_clf.predict_proba(X_test)[:, 1] for cur_clf, cur_w in zip(clfs, weights)])
    y_pred1 = res.mean(axis=0)
    print("Score weighted (amean):\t", roc_auc_score(y_test, y_pred1))
    y_pred2 = gmean(res, 0)
    print("Score weighted (gmean):\t", roc_auc_score(y_test, y_pred2))
    
    print_corr_matrix(clfs, X_test)
    
    return preds

In [None]:
preds = averaging(scaled_train, train_target.values, np.array(test_data))

In [None]:
rf_data = get_features_rf(scaled_train, train_target.values, RandomForestClassifier(n_jobs=15))

In [None]:
preds = averaging(scaled_train, train_target.values, np.array(test_data))

In [None]:
preds = averaging(rf_data, train_target.values, np.array(test_data))

# Keras

In [198]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn.cross_validation import KFold
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU

In [199]:

def build_model(input_dim, output_dim):
    model = Sequential()
    #model.add(Dropout(0.08, input_shape=(input_dim,)))
    model.add(Dense(1024, W_regularizer='l1', activation='sigmoid', input_dim=input_dim))
    model.add(BatchNormalization(gamma_regularizer='l1'))
    model.add(Dropout(0.1))
    
    model.add(Dense(512, W_regularizer='l1', b_regularizer='l2', input_dim=input_dim))
    model.add(PReLU())
    model.add(BatchNormalization(gamma_regularizer='l2'))
    model.add(Dropout(0.15))

    model.add(Dense(256))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(Dense(256, W_regularizer='l1', activation='sigmoid', input_dim=input_dim))
    model.add(BatchNormalization(gamma_regularizer='l1'))
    model.add(Dropout(0.1))
    
    model.add(Dense(128, W_regularizer='l1', b_regularizer='l2', input_dim=input_dim))
    model.add(PReLU())
    model.add(BatchNormalization(gamma_regularizer='l2'))
    model.add(Dropout(0.15))

    model.add(Dense(128))
    model.add(keras.layers.advanced_activations.PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(output_dim, activation='sigmoid'))
    model.compile(class_mode = 'binary', loss='binary_crossentropy', optimizer="adadelta")
    return model

def keras_cv(X, Y):
    
    input_dim = X.shape[1]
    output_dim = 2
    nb_folds = 4
    kfolds = StratifiedKFold(Y, nb_folds)
    #KFold(len(train_target.values), nb_folds)
    
    
    #kfolds = StratifiedShuffleSplit(Y,nb test_size=0.2, random_state=1234)
    av_roc = 0.
    f = 0
    
    X = np.array(X)
    y = np.array(Y)

    Y = np_utils.to_categorical(y)
    
    for train, valid in kfolds:
        print('---'*20)
        print('Fold', f)
        print('---'*20)
        f += 1
        X_train = X[train]
        X_valid = X[valid]
        Y_train = Y[train]
        Y_valid = Y[valid]
        y_valid = y[valid]

        print("Building model...")
        model = build_model(input_dim, output_dim)

        print("Training model...")

        model.fit(X_train, Y_train, nb_epoch=50, batch_size=32, validation_data=(X_valid, Y_valid), verbose=1)
        valid_preds = model.predict_proba(X_valid, verbose=0)
        valid_preds = valid_preds[:, 1]
        roc = roc_auc_score(y_valid, valid_preds)
        print("ROC:", roc)
        av_roc += roc

    print('Average ROC:', av_roc/nb_folds)
    
def keras_test(data, target):

    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, 
                                                        random_state=seed, stratify=target)
    
    y_train = np_utils.to_categorical(y_train)
    input_dim = data.shape[1]
    output_dim = 2

    #y_test = np_utils.to_categorical(y_test)


    print("Building model...")
    model = build_model(input_dim, output_dim)

    print("Training model...")
    model.fit(X_train, y_train, nb_epoch=150, batch_size=64, verbose=0)
    
    preds = model.predict_proba(X_test, verbose=1)
    print("\nAUC-ROC on test:", roc_auc_score(y_test, preds[:, 1]))
    
    return model
    

def keras_gs(data, target):
    
    data, X_test, target, y_test = train_test_split(data, target, test_size=0.1, 
                                                        random_state=seed, stratify=target)
    
    target = np_utils.to_categorical(target)
    input_dim = data.shape[1]
    output_dim = 2
    
    k_model = build_model(input_dim, output_dim)
    
    # define the grid search parameters
    batch_size = [10, 20, 40, 60, 80, 100]
    epochs = [10, 50, 100]
    param_grid = dict(batch_size=batch_size, nb_epoch=epochs)

    
    
    grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=1, n_jobs=-1)
    grid.fit(data, target)
    
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        


In [None]:
my_model = keras_test(np.array(scaled_train), np.array(train_target).ravel())

Building model...




Training model...


In [195]:
X_train, X_test, y_train, y_test = train_test_split(np.array(scaled_train), train_target.values, test_size=0.2, 
                                                        random_state=seed, stratify=train_target.values)

gbm1 = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15, seed = 34123)

gbm2 = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.7,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15, seed = 985456)

gbm3 = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.6,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15, seed = 256542)

    


gbm1.fit(X_train, y_train)
gbm2.fit(X_train, y_train)
gbm3.fit(X_train, y_train)


gbm1_preds = gbm1.predict_proba(X_test)[:, 1]
gbm2_preds = gbm2.predict_proba(X_test)[:, 1]
gbm3_preds = gbm3.predict_proba(X_test)[:, 1]

keras_preds = my_model.predict_proba(X_test)[:, 1]


for i in [(0.3, 0.2, 0.2, 0.3), (0.1, 0.3, 0.3, 0.3), (0.25, 0.25, 0.25, 0.25)]:
    res = i[0]*keras_preds + i[1]*gbm1_preds + i[2]*gbm2_preds +i[3]*gbm3_preds 
    print("Score weighted (gmean):\t", roc_auc_score(y_test, res))
    print("coefs:", i)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


coefs: (0.3, 0.2, 0.2, 0.3)
Score weighted (gmean):	 0.94862055309
coefs: (0.1, 0.3, 0.3, 0.3)
Score weighted (gmean):	 0.948456629843
coefs: (0.25, 0.25, 0.25, 0.25)


In [186]:
gbm = xgb.XGBClassifier(max_depth=3, learning_rate=0.02, n_estimators=1200,
                                            min_child_weight=1, gamma=0.0,subsample=0.8,
                                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0.05,
                                            scale_pos_weight=1, max_delta_step=9, nthread=15)
    

calibrated_clf = CalibratedClassifierCV(gbm, method='sigmoid', cv=10)
calibrated_clf.fit(X_train, y_train)
y_preds = calibrated_clf.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, y_preds[:, 1], pos_label=1)
    print("AUC-ROC on test: %.4g" % auc(fpr, tpr))
    
    skf = StratifiedKFold(target, n_folds=5, random_state=seed)
    
    results = cross_val_score(calibrated_clf, data, target, cv=skf, scoring='roc_auc')
    for ind, i in enumerate(results):
        print("fold #{:d}: {:f}".format(ind, i))
        
    print("mean AUC-ROC on cv : %.4f%% (%.4f%%)" % (results.mean()*100, results.std()*100))

array([ 0.00010628,  0.01192422,  0.00023275, ...,  0.0047295 ,
        0.01096976,  0.00846082], dtype=float32)

In [104]:
keras_cv(np.array(scaled_train), np.array(train_target).ravel())

Building model...




Training model...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epo

In [86]:
keras_gs(np.array(scaled_train), np.array(train_target.values))

Fitting 3 folds for each of 18 candidates, totalling 54 fits


TypeError: cannot deepcopy this pattern object

In [89]:
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, nb_epoch=epochs)

In [90]:
param_grid

{'batch_size': [10, 20, 40, 60, 80, 100], 'nb_epoch': [10, 50, 100]}