In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train.csv')

In [3]:
def dataframe_preprocess(df):
    df_float = df.select_dtypes(include=[np.float])
    df_object = df.select_dtypes(include=['object'])
    used_cat_cols = []
    #Выберем те признаки, в которых категорий не большое количество
    for col in df_object.columns:
        if len(df_object[col].unique()) <= 50:
            used_cat_cols.append(col) 

    df = df[list(df_float.columns) + used_cat_cols + ['label']]
    
    #Заменим NaN значения нулями для численных, а категориальные NaN заменим еще одной категорией, которая будет обозначать, 
    #что данных нет
    df[list(df_float.columns)] = df[list(df_float.columns)].fillna(0)
    df[used_cat_cols] = df[used_cat_cols].fillna('Not given')
    df = pd.get_dummies(df, columns = list(used_cat_cols))
    
    label = df.pop('label')
    df['label']=label
    return df

new_df = dataframe_preprocess(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [7]:
#В качестве бейслайн модлей выбрем линейную модель и два ансамблевыех метода со дефолтными параметрами
models = [LogisticRegression(random_state = 42), RandomForestClassifier(random_state = 42), 
          GradientBoostingClassifier(random_state = 42)]
model_names = ['Logistic Regression Scores', 'Random Forest Scores', 'Gradient Boosting Scores']

skf = StratifiedKFold(n_splits=10)

scoring = {'recall': 'recall_micro',
          'auc':'roc_auc',
          'f_score':'f1_micro',
           'prec':'precision_micro',
          'acc':'accuracy'}

for model, model_name in zip(models, model_names):
    print model_name
    scores = cross_validate(model, new_df.iloc[:,:-1], new_df['label'], scoring=scoring,
                             cv=skf, return_train_score = True)
    print 'Train Scores:'
    print 'Recall Score = {0}'.format(scores['train_recall'].mean())
    print 'AUC-ROC Score = {0}'.format(scores['train_auc'].mean())
    print 'F1 Score = {0}'.format(scores['train_f_score'].mean())
    print 'Precison Score = {0}'.format(scores['train_prec'].mean())
    print 'Accuracy Score = {0}'.format(scores['train_acc'].mean())
    print '\n'
    print 'Test Scores:'
    print 'Recall Score = {0}'.format(scores['test_recall'].mean())
    print 'AUC-ROC Score = {0}'.format(scores['test_auc'].mean())
    print 'F1 Score = {0}'.format(scores['test_f_score'].mean())
    print 'Precison Score = {0}'.format(scores['test_prec'].mean())
    print 'Accuracy Score = {0}'.format(scores['test_acc'].mean())
    print '-------------------------------------'

Logistic Regression Scores
Train Scores:
Recall Score = 0.917763890719
AUC-ROC Score = 0.549447008382
F1 Score = 0.917763890719
Precison Score = 0.917763890719
Accuracy Score = 0.917763890719


Test Scores:
Recall Score = 0.9175000896
AUC-ROC Score = 0.543352848328
F1 Score = 0.9175000896
Precison Score = 0.9175000896
Accuracy Score = 0.9175000896
-------------------------------------
Random Forest Scores
Train Scores:
Recall Score = 0.985093742869
AUC-ROC Score = 0.999808311498
F1 Score = 0.985093742869
Precison Score = 0.985093742869
Accuracy Score = 0.985093742869


Test Scores:
Recall Score = 0.926156350211
AUC-ROC Score = 0.597830715571
F1 Score = 0.926156350211
Precison Score = 0.926156350211
Accuracy Score = 0.926156350211
-------------------------------------
Gradient Boosting Scores
Train Scores:
Recall Score = 0.928260415495
AUC-ROC Score = 0.792091489219
F1 Score = 0.928260415495
Precison Score = 0.928260415495
Accuracy Score = 0.928260415495


Test Scores:
Recall Score = 0.