In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

  chunks = self.iterencode(o, _one_shot=True)


In [3]:
def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

  chunks = self.iterencode(o, _one_shot=True)


## Default 

In [4]:
data = pd.read_csv("../data/train.csv", na_values=[-1, -1.0], index_col="id")
X, y = data.loc[:, data.columns != "target"], data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

  chunks = self.iterencode(o, _one_shot=True)


In [5]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.271380686266


## One-Hot Encoding 

#### Load data 

In [6]:
data = pd.read_csv("../data/train.csv", na_values=[-1, -1.0], index_col="id")

In [7]:
X, y = data.loc[:, data.columns != "target"], data.target

#### Preprocess 

In [8]:
for col in X.columns[X.columns.str.endswith("cat")]:
    X.loc[:, col] = X.loc[:, col].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
X = pd.get_dummies(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)



#### Without Undersampling 

In [11]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.271336978717


In [12]:
model = GradientBoostingClassifier()
model.fit(X_train.fillna(X_train.median()), y_train)
y_pred = model.predict_proba(X_test.fillna(X_test.median()))[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.273417686925


#### With Undersampling 

In [13]:
%%time
resampler = RandomUnderSampler()
indexes = X_train.index.values.reshape(-1, 1)
index_res, y_res = resampler.fit_sample(indexes, y_train)
X_res = X_train.loc[index_res.flatten(), :]

CPU times: user 215 ms, sys: 13.3 ms, total: 228 ms
Wall time: 231 ms


In [14]:
model = XGBClassifier()
model.fit(X_res, y_res)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.268508048273


## Target Encoder 

In [15]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k, f):
        self.k = k
        self.f = f
        
    
    def _smoothing(self, n, k, f):
        return 1 / (1+np.exp(-(n-k)/f))
    
    def fit(self, X, y):
        self.prior = y.mean()
        self.encoding_dicts = {}
        for col in X.columns:
            mean_col = y.groupby(X[col]).mean()
            counts = X[col].value_counts()
            s = counts.apply(lambda n: self._smoothing(n, self.k, self.f))
            encoding = s * mean_col + (1-s) * self.prior
            self.encoding_dicts[col] = encoding.to_dict()
        return self
    
    def transform(self, X):
        X_enc = X.copy()
        for col in X.columns:
            X_enc[col] = X[col].apply(lambda n: self.encoding_dicts[col].get(n, self.prior))
        return X_enc

In [16]:
data = pd.read_csv("../data/train.csv", na_values=[-1, -1.0], index_col="id")
X, y = data.loc[:, data.columns != "target"], data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)



In [17]:
target_encoder = TargetEncoder(1, 1)
cat_cols = X_train.columns[X_train.columns.str.endswith("cat")]
X_train.loc[:, cat_cols] = target_encoder.fit_transform(X_train.loc[:, cat_cols], y)
X_test.loc[:, cat_cols] = target_encoder.transform(X_test.loc[:, cat_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


#### Without Undersampling 

In [18]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.272827229709


In [19]:
model = GradientBoostingClassifier()
model.fit(X_train.fillna(X_train.median()), y_train)
y_pred = model.predict_proba(X_test.fillna(X_test.median()))[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.273845739752


#### With undersampling 

In [20]:
%%time
resampler = RandomUnderSampler()
indexes = X_train.index.values.reshape(-1, 1)
index_res, y_res = resampler.fit_sample(indexes, y_train)
X_res = X_train.loc[index_res.flatten(), :]

CPU times: user 176 ms, sys: 6.68 ms, total: 183 ms
Wall time: 181 ms


In [24]:
model = XGBClassifier()
model.fit(X_res, y_res)
y_pred = model.predict_proba(X_test)[:, 1]

mean_gini = gini_normalized(y_test, y_pred)
print(mean_gini)

0.269984222218
