In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict
import warnings

from sklearn.model_selection import train_test_split, cross_validate

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.feature_extraction import DictVectorizer

### Загрузка данных

In [2]:
# Read the CSV inputs: the feature table and the churn labels.
churn_data_df = pd.read_csv('orange_small_churn_data.txt', sep=',')
# `names=` is given without `header=`, so the first line of the labels file
# is read as data and the single column is called 'label'.
labels_df = pd.read_csv('orange_small_churn_labels.txt', sep=',', names=['label'])
# Boolean target: True where the raw label equals 1, False otherwise.
labels_df['target'] = labels_df.label == 1
labels_df = labels_df.drop(['label'], axis=1)
# Parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 function call.
print(churn_data_df.shape)

(40000, 230)


In [3]:
# Column groups, selected by position: the first 190 columns are used as
# numeric features and the last 40 as categorical features.
columns = churn_data_df.columns
columns_numbers = churn_data_df.columns[:190]
columns_cat = churn_data_df.columns[-40:]

# Drop features that contain no data at all (every value is missing).
# One vectorised pass replaces a per-column value_counts() call.
all_missing = churn_data_df.isnull().all()
empty_columns = [col_name for col_name in columns if all_missing[col_name]]

# Set membership keeps the three filters below linear in the number of columns.
empty_columns_set = set(empty_columns)
columns = [x for x in columns if x not in empty_columns_set]
columns_numbers = [x for x in columns_numbers if x not in empty_columns_set]
columns_cat = [x for x in columns_cat if x not in empty_columns_set]

data = churn_data_df[columns]
data_numbers = churn_data_df[columns_numbers]
data_cat = churn_data_df[columns_cat]
target = labels_df.target

# Hold-out sample: 10% of the rows are set aside, with a fixed seed so the
# split is reproducible.
X, X_hold_out, y, y_hold_out = train_test_split(data, target, test_size=0.1, random_state=42)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(X_hold_out.shape)
print(y_hold_out.shape)
print(X.shape)
print(y.shape)

(4000, 212)
(4000L,)
(36000, 212)
(36000L,)


In [4]:
# Categorical features: per-column summary used to choose an encoding.

# Largest number of distinct values for which a column is still one-hot
# encoded. The original comments said 30 while the code compared against 50;
# the value that actually ran (50) is kept and named here so the two filters
# below cannot drift apart.
ONE_HOT_MAX_UNIQUE_VALUES = 50

# col_name -> (number of distinct non-null values,
#              number of non-null rows,
#              string listing the counts of the 5 most frequent values)
columns_with_uniq_values = {}
for col_name in columns_cat:
    # value_counts() excludes NaN and sorts by descending frequency;
    # compute it once and reuse it for both statistics.
    value_counts = data_cat[col_name].value_counts()
    columns_with_uniq_values[col_name] = (
        len(value_counts),
        (data_cat[col_name].notnull()).sum(),
        str(list(value_counts.head(5))))

# Show the summary ordered by the number of distinct values.
ordered = OrderedDict(sorted(columns_with_uniq_values.items(), key=lambda t: t[1][0]))
display(ordered)

# Features with few distinct values (n <= ONE_HOT_MAX_UNIQUE_VALUES) are one-hot encoded.
# .items() is used instead of the Python-2-only .iteritems().
columns_cat_for_one_hot_encoding = [
    k for k, v in columns_with_uniq_values.items() if v[0] <= ONE_HOT_MAX_UNIQUE_VALUES]

# Features with many distinct values (n > ONE_HOT_MAX_UNIQUE_VALUES) are set
# aside for a special transformation.
columns_cat_for_trick = [
    k for k, v in columns_with_uniq_values.items() if v[0] > ONE_HOT_MAX_UNIQUE_VALUES]

OrderedDict([('Var224', (1, 662, '[662L]')),
             ('Var215', (1, 563, '[563L]')),
             ('Var191', (1, 871, '[871L]')),
             ('Var213', (1, 890, '[890L]')),
             ('Var208', (2, 39877, '[36823L, 3054L]')),
             ('Var201', (2, 10190, '[10184L, 6L]')),
             ('Var218', (2, 39440, '[20253L, 19187L]')),
             ('Var211', (2, 40000, '[32215L, 7785L]')),
             ('Var225', (3, 19065, '[8875L, 8289L, 1901L]')),
             ('Var205', (3, 38453, '[25612L, 9232L, 3609L]')),
             ('Var194', (3, 10190, '[10015L, 143L, 32L]')),
             ('Var223', (4, 35804, '[29279L, 4780L, 1619L, 126L]')),
             ('Var229', (4, 17223, '[9312L, 7850L, 31L, 30L]')),
             ('Var196', (4, 40000, '[39633L, 351L, 15L, 1L]')),
             ('Var203', (5, 39877, '[36192L, 2529L, 1153L, 2L, 1L]')),
             ('Var210', (6, 40000, '[38084L, 1206L, 395L, 139L, 121L]')),
             ('Var227', (7, 40000, '[28112L, 4928L, 2724L, 1896L, 1818

### Подготовка признаков

In [5]:
# Numeric features: select the numeric columns of the training part and
# replace every missing value with zero.
X_num_with_fill_zeros = X.loc[:, columns_numbers].fillna(value=0)

In [6]:
# Categorical features: one-hot encoding.
# Missing values become the explicit 'no_data' category, each row is turned
# into a {column: value} dict, and DictVectorizer expands those dicts into a
# dense (sparse=False) indicator matrix.
vectorizer = DictVectorizer(sparse=False)
X_cat_with_one_hot_encoding = vectorizer.fit_transform(
    X[columns_cat_for_one_hot_encoding]
    .fillna(value='no_data')
    .to_dict(orient='records')
)

### baseline 

In [7]:
# Baseline classifiers, keyed by the name printed in the score report.
CLASSIFIERS = {
    'ridge': RidgeClassifier(),
    # The forest draws random bootstrap samples and feature subsets; without a
    # fixed seed the cross-validation scores change on every run. The seed
    # matches the one used for the hold-out split.
    'random forest': RandomForestClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(),
}

In [8]:
def get_score(X, y, classifiers=None, cv=5):
    """Cross-validate each classifier on (X, y) and print its mean scores.

    For every estimator, ``cross_validate`` is run with two scorers
    ('roc_auc' and 'average_precision') and the mean of each test score over
    the folds is printed with four decimals.

    Parameters
    ----------
    X : feature matrix, passed unchanged to ``cross_validate``.
    y : target vector, passed unchanged to ``cross_validate``.
    classifiers : dict of {display name: estimator}, optional
        Estimators to evaluate. Defaults to the notebook-level
        ``CLASSIFIERS`` dict.
    cv : optional
        Forwarded to ``cross_validate`` as ``cv``. Defaults to 5.

    Returns
    -------
    None. The results are only printed.
    """
    if classifiers is None:
        classifiers = CLASSIFIERS

    scoring = {'AUC': 'roc_auc', 'precision': 'average_precision'}

    # .items() replaces the Python-2-only .iteritems(); parenthesised
    # single-argument print works on both Python 2 and Python 3.
    for clf_name, clf in classifiers.items():
        score = cross_validate(clf, X, y, cv=cv, scoring=scoring)
        print("{}:".format(clf_name))
        print(" - roc_auc: {:.4f}".format(score['test_AUC'].mean()))
        print(" - average precision: {:.4f}".format(score['test_precision'].mean()))
        # Blank separator line; a bare print() would show "()" on Python 2.
        print("")

In [11]:
# Silence DeprecationWarnings raised from sklearn modules during cross-validation.
# `module` is a regular expression matched against the start of the module
# name, not a glob: 'sklearn*' meant "sklear" followed by any number of "n".
# Anchor on the package name and a following dot (or end of name) instead.
warnings.filterwarnings(module=r'sklearn(\.|$)', action='ignore', category=DeprecationWarning)
# Baseline: zero-filled numeric features stacked side by side with the
# one-hot encoded categorical features.
get_score(np.hstack((X_num_with_fill_zeros, X_cat_with_one_hot_encoding)), y)

XGBClassifier:
 - roc_auc: 0.7378
 - average precision: 0.2165

ridge:
 - roc_auc: 0.6680
 - average precision: 0.1538

random forest:
 - roc_auc: 0.5880
 - average precision: 0.1003

