In [1]:
# imports and configs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import normalize
from sklearn.grid_search import GridSearchCV

import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
%matplotlib inline
plt.style.use('ggplot')

random_state = 42
normalize_data = True




In [2]:
def grid_search_clf(clf_to_train, param_grid, X_train, y_train, show=False):
    clf = GridSearchCV(estimator=clf_to_train, param_grid=param_grid,
                   n_jobs=-1, scoring='log_loss', verbose=True)
    clf.fit(X_train, y_train)
    if show:
        print('best score: {0}'.format(clf.best_score_))
        print('best estimator:')
        print(clf.best_estimator_)
    return clf

def clf_score(clf, X_test, y_test):
    clf_probs = clf.predict_proba(X_test)
    return log_loss(y_test, clf_probs)

def get_ans(clf, X, y, X_real, index_col ,transform=False):
    clf.fit(X, y)
    tmp_pred = clf.predict_proba(X_real)
    pred = [it[1] for it in tmp_pred]
    ans = index_col.copy()
    ans['probability'] = pred
    return ans

def normalize_df(df, cols_to_norm):
    return df[cols_to_norm].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))

In [3]:
# read data
train_file = './../data/nai_train_3.csv'
test_file = './../data/nai_test_3.csv'

data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
X = data.drop(['target'], axis=1)
y = data['target']

X_real = test_data.drop(['t_id'], axis=1)
id_real = pd.DataFrame(test_data['t_id'].copy())

if normalize_data:
    X = normalize_df(X, X.columns)
    X_real = normalize_df(X_real, X_real.columns)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=random_state)

In [16]:
# Log Regression on scaled data
clf_log_reg = LogisticRegression(random_state=random_state, n_jobs=-1, max_iter=1000)

X = normalize_df(X, X.columns)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=random_state)

X_real = test_data.drop(['t_id'], axis=1)
X_real = normalize_df(X_real, X_real.columns)
id_real = pd.DataFrame(test_data['t_id'].copy())


param_grid = {
    'penalty' : ['l2'], # 'l1'
    'C': np.linspace(0.0001, 100.0, 50),
    'tol': [0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'sag']
}

clf = grid_search_clf(clf_log_reg, param_grid, X_train, y_train, show=True)
print('log_loss: {0}'.format(clf_score(clf, X_test, y_test)))

print "Done with log reg"
print "\n\n==========================\n\n"

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 4050 out of 4050 | elapsed: 10.9min finished


Fitting 3 folds for each of 1350 candidates, totalling 4050 fits
best score: -0.691816564743
best estimator:
LogisticRegression(C=2.0409142857142859, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='sag', tol=0.01, verbose=0, warm_start=False)
log_loss: 0.691449473179
Done with log reg






In [18]:
# 0.69119
clf = LogisticRegression(C=5.1020897959183671, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='sag', tol=0.1, verbose=0, warm_start=False)

X = normalize_df(X, X.columns)
X_real = test_data.drop(['t_id'], axis=1)
X_real = normalize_df(X_real, X_real.columns)
id_real = pd.DataFrame(test_data['t_id'].copy())

ans = get_ans(clf, X, y, X_real, id_real)
ans.to_csv('test_ans.csv', index=False)

ValueError: X has 21 features per sample; expecting 42

In [14]:
## Sorted with swup

In [22]:
def get_sorted_data_with_swap(data, feature):
    sorted_data = data.sort(columns=feature).copy()
    X = sorted_data.drop('target', axis=1) if 'target' in sorted_data.columns else sorted_data
    y = []
    rows = X.shape[0]
    y = sorted_data['target'] if 'target' in data else []
    for col in X.columns:
        if col == 't_id':
            continue
        feature_swapped = [0. for i in range(rows)] # np.zeros() дает друго результат о_0... хотя отличий быть не должно
        for i in range(rows):
            feature_swapped[i] = (X[col][(i - 1) % rows] + X[col][(i - 2) % rows]) / 2.
        X['prev_' + col] = feature_swapped
    return X, y

In [23]:
X, y = get_sorted_data_with_swap(data, ['feature1'])
X = normalize_df(X, X.columns)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=random_state)

  from IPython.kernel.zmq import kernelapp as app


In [25]:
clf_log_reg = LogisticRegression(random_state=random_state, n_jobs=-1, max_iter=1000)

param_grid = {
    'penalty' : ['l2'], # 'l1'
    'C': np.linspace(0.0001, 100.0, 50),
    'tol': [0.00001, 0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'sag']
}

clf = grid_search_clf(clf_log_reg, param_grid, X_train, y_train, show=True)
print('log_loss: {0}'.format(clf_score(clf, X_test, y_test)))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 4050 out of 4050 | elapsed: 15.5min finished


Fitting 3 folds for each of 1350 candidates, totalling 4050 fits
best score: -0.691956340095
best estimator:
LogisticRegression(C=2.0409142857142859, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='lbfgs', tol=100, verbose=0, warm_start=False)
log_loss: 0.691667414946


In [None]:
# feature 1: 0.691270907607 - the best
# feature 1 with 2 prev: 0.691667414946  - bad
# feature 6: 0.691390394556 - good
# 19 and 6: log_loss: 0.691607982969 - bad
# feature 19: 0.691901755334 - very bad
# 6 and 19: 0.692073637757 - very very bad
