In [2]:
%matplotlib inline

In [3]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.metrics import  make_scorer
from skopt import BayesSearchCV, gp_minimize
from xgboost import XGBClassifier

  chunks = self.iterencode(o, _one_shot=True)


In [4]:
def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

gini_scorer = make_scorer(gini_normalized)

  chunks = self.iterencode(o, _one_shot=True)


### Forza Features 

In [5]:
data = pd.read_csv("../data/train.csv", na_values=[-1, -1.0], index_col="id")
X, y = data.loc[:, data.columns != "target"], data.target

  chunks = self.iterencode(o, _one_shot=True)


##### Get dummies 

In [8]:
for col in X.columns[X.columns.str.endswith("cat")]:
    X.loc[:, col] = X.loc[:, col].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  chunks = self.iterencode(o, _one_shot=True)


In [9]:
X = pd.get_dummies(X)

  chunks = self.iterencode(o, _one_shot=True)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

  chunks = self.iterencode(o, _one_shot=True)


##### Drop "calc" features 

In [11]:
calc_cols = X_train.columns[X_train.columns.str.contains("calc")]
X_train = X_train.drop(calc_cols, axis=1)
X_test = X_test.drop(calc_cols, axis=1)

  chunks = self.iterencode(o, _one_shot=True)


##### Median and mean features 

In [12]:
float_cols = X_train.columns[(~X_train.columns.str.contains("cat")) & (~X_train.columns.str.contains("bin"))]
X_train[float_cols + "_median"] = (X_train[float_cols] > X_train[float_cols].median()).astype("float")
X_test[float_cols + "_median"] = (X_test[float_cols] > X_test[float_cols].median()).astype("float")
X_train[float_cols + "_mean"] = (X_train[float_cols] > X_train[float_cols].mean()).astype("float")
X_test[float_cols + "_mean"] = (X_test[float_cols] > X_test[float_cols].mean()).astype("float")

  chunks = self.iterencode(o, _one_shot=True)


##### Model 

In [13]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.272107405284


In [14]:
model = GradientBoostingClassifier()
model.fit(X_train.fillna(X_train.median()), y_train)
y_pred = model.predict_proba(X_test.fillna(X_test.median()))[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.273976343185


In [15]:
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.238423276441


#### Optimize 

In [12]:
 def objective(params):
    max_depth, learning_rate, n_estimators, gamma, min_child_weight, reg_alpha, reg_lambda = params
    model.set_params(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, gamma=gamma, min_child_weight=min_child_weight, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    return -np.mean(cross_val_score(model, X_train, y_train,n_jobs=-1, scoring=gini_scorer,cv=5))

In [13]:
param_space=[(3,15),
             (0.001,0.3),
             (60,200),
             (0,0.4),
             (1,50),
             (.1,1),
             (0,1)]

In [14]:
model = XGBClassifier(seed=42)

In [15]:
res_gp = gp_minimize(objective, param_space, n_calls=100, random_state=42)



In [16]:
max_depth, learning_rate, n_estimators, gamma, min_child_weight, reg_alpha, reg_lambda = res_gp.x

#### Use best parameters 

In [17]:
print("Using best parameters")

Using best parameters


In [19]:
model = XGBClassifier(seed=42,max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, gamma=gamma, min_child_weight=min_child_weight, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

-0.00220719943368


In [22]:
estimator = XGBClassifier(seed=42)
param_dist = {
    'max_depth': [3,5,8,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,150],
    'gamma': [0, 0.1],
    'min_child_weight': [1,10,20,40],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}
model = BayesSearchCV(estimator, search_spaces=param_dist, scoring=gini_scorer, cv=5, n_jobs=-1, verbose=True)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.0min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.1min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.0min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.9min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  7.4min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.4min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.9min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  7.8min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.6min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.3min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.3min finished


0.145423147075


### Forza Features with Target Encoding 

In [7]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k, f):
        self.k = k
        self.f = f      
    
    def _smoothing(self, n, k, f):
        return 1 / (1+np.exp(-(n-k)/f))
    
    def fit(self, X, y):
        self.prior = y.mean()
        self.encoding_dicts = {}
        for col in X.columns:
            mean_col = y.groupby(X[col]).mean()
            counts = X[col].value_counts()
            s = counts.apply(lambda n: self._smoothing(n, self.k, self.f))
            encoding = s * mean_col + (1-s) * self.prior
            self.encoding_dicts[col] = encoding.to_dict()
        return self
    
    def transform(self, X):
        X_enc = X.copy()
        for col in X.columns:
            X_enc[col] = X[col].apply(lambda n: self.encoding_dicts[col].get(n, self.prior))
        return X_enc

  chunks = self.iterencode(o, _one_shot=True)


In [5]:
data = pd.read_csv("../data/train.csv", na_values=[-1, -1.0], index_col="id")
X, y = data.loc[:, data.columns != "target"], data.target

  chunks = self.iterencode(o, _one_shot=True)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

  chunks = self.iterencode(o, _one_shot=True)


##### Drop "calc" features 

In [11]:
calc_cols = X_train.columns[X_train.columns.str.contains("calc")]
X_train = X_train.drop(calc_cols, axis=1)
X_test = X_test.drop(calc_cols, axis=1)

  chunks = self.iterencode(o, _one_shot=True)


##### Median and mean features 

In [12]:
float_cols = X_train.columns[(~X_train.columns.str.contains("cat")) & (~X_train.columns.str.contains("bin"))]
X_train[float_cols + "_median"] = (X_train[float_cols] > X_train[float_cols].median()).astype("float")
X_test[float_cols + "_median"] = (X_test[float_cols] > X_test[float_cols].median()).astype("float")
X_train[float_cols + "_mean"] = (X_train[float_cols] > X_train[float_cols].mean()).astype("float")
X_test[float_cols + "_mean"] = (X_test[float_cols] > X_test[float_cols].mean()).astype("float")

  chunks = self.iterencode(o, _one_shot=True)


##### Target Encoding 

In [8]:
encoder = TargetEncoder(1, 1)
cat_cols = X_train.columns[X_train.columns.str.contains("cat")]
X_train = encoder.fit_transform(X_train[cat_cols], y_train)
X_test = encoder.transform(X_test[cat_cols])

  chunks = self.iterencode(o, _one_shot=True)


##### Without optimizations 

In [9]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.222162934436


In [10]:
model = GradientBoostingClassifier()
model.fit(X_train.fillna(X_train.median()), y_train)
y_pred = model.predict_proba(X_test.fillna(X_test.median()))[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.22239898593


In [11]:
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.203308943491


#### Optimize 

In [12]:
 def objective(params):
    max_depth, learning_rate, n_estimators, gamma, min_child_weight, reg_alpha, reg_lambda = params
    model.set_params(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, gamma=gamma, min_child_weight=min_child_weight, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    return -np.mean(cross_val_score(model, X_train, y_train,n_jobs=-1, scoring=gini_scorer,cv=5))

In [13]:
param_space=[(3,15),
             (0.001,0.3),
             (60,200),
             (0,0.4),
             (1,50),
             (.1,1),
             (0,1)]

In [14]:
model = XGBClassifier(seed=42)

In [15]:
res_gp = gp_minimize(objective, param_space, n_calls=100, random_state=42)



In [16]:
max_depth, learning_rate, n_estimators, gamma, min_child_weight, reg_alpha, reg_lambda = res_gp.x

#### Use best parameters 

In [17]:
print("Using best parameters")

Using best parameters


In [19]:
model = XGBClassifier(seed=42,max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, gamma=gamma, min_child_weight=min_child_weight, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

-0.00220719943368


In [22]:
estimator = XGBClassifier(seed=42)
param_dist = {
    'max_depth': [3,5,8,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100,150],
    'gamma': [0, 0.1],
    'min_child_weight': [1,10,20,40],
    'reg_alpha': [0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}
model = BayesSearchCV(estimator, search_spaces=param_dist, scoring=gini_scorer, cv=5, n_jobs=-1, verbose=True)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.0min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.1min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.0min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.9min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  7.4min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  9.4min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.9min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  7.8min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.6min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.3min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.2min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.3min finished


0.145423147075
