In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
x = pd.read_csv('data/x_train.csv.gz', delimiter=';')
y = np.ravel(pd.read_csv('data/y_train.csv.gz', names=['target']))
test = pd.read_csv('data/x_test.csv.gz', delimiter=';')

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin

class ModelClassTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self
    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict_proba(X))

In [61]:
final_estimator = ('final', LogisticRegression())

xgb_transformer = ('xgb', ModelClassTransformer(xgb.XGBClassifier(seed=42)))

lr_transformer = ('lr', Pipeline([
            ('scale', StandardScaler()),
            ('lr', ModelClassTransformer(LogisticRegression(random_state=42)))
        ]))

estimators = ('estimators', FeatureUnion([
            ('xgb1', ModelClassTransformer(xgb.XGBClassifier(n_estimators=300,seed=42))),
            ('xgb2', ModelClassTransformer(xgb.XGBClassifier(n_estimators=300,seed=4242))),
            ('xgb3', ModelClassTransformer(xgb.XGBClassifier(n_estimators=300,seed=424242))),
            ('xgb4', ModelClassTransformer(xgb.XGBClassifier(n_estimators=300,seed=42424242))),
            ('xgb5', ModelClassTransformer(xgb.XGBClassifier(n_estimators=300,seed=421))),
            ('xgb6', ModelClassTransformer(xgb.XGBClassifier(n_estimators=300,seed=4211))),
            lr_transformer
        ]))

pipeline = Pipeline([
        estimators,
        final_estimator
    ])

In [62]:
scores = cross_val_score(pipeline, x, y, scoring='neg_log_loss')
print("LogLoss: {} (+/- {})".format(scores.mean(), scores.std() * 2))

LogLoss: -0.408164990598511 (+/- 0.007883922074138225)


In [56]:
params = {'estimators__lr__lr__model__C': [0.01, 0.1, 0.5, 1, 10]}
search = GridSearchCV(pipeline, params, cv=10, scoring='neg_log_loss')
search.fit(x, y)
print(search.best_params_)
print(search.best_score_)

{'estimators__lr__lr__model__C': 0.01}
-0.390577447413


In [63]:
pipeline.fit(x, y)
pred = pipeline.predict_proba(test)[:,1]
submission = pd.DataFrame()
submission['target'] = pred
submission.to_csv("submissions/5_pipeline.csv", index=False, header=False)