In [1]:
import pandas as pd
import numpy as np

from scipy import stats as ss
from scipy.sparse import csr_matrix, hstack

from sklearn.model_selection import train_test_split

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.decomposition import PCA

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import average_precision_score, precision_recall_fscore_support, roc_auc_score

In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids
from collections import Counter, defaultdict

## 1. Функция обработки данных

In [19]:
# Функция для преобразования данных
# num_cols - числовые признаки, которые будут использоваться
# cat_cols - категориальные признаки, которые будут использоваться
# опции: is_PCA - применение метода главныых компонент, is_OHE - One Hot Encoding, 
# is_OverSampling - Over Sampling
# По умолчанию используется все признаки, все опции выключены.
# Для обработки категориальных данных по умолчанию используется только Label Encoder

def transform_data(data, y=None, num_cols=None, cat_cols=None, is_PCA=False, is_OHE=False, is_OverSampling=False):
    
    to_return = {}
    
    num_interval = (0, 190)
    cat_interval = (191, data.shape[1])
    
    
    data, num_interval, cat_interval = drop_null_cols(data, num_interval, cat_interval)
    
    data_num, data_cat = split_num_cat(data, num_interval)
    
    if num_cols:
        data_num = data[num_cols]
    if cat_cols:
        data_cat = data[cat_cols]
    
    print('Processing the numeric columns...')
    data_num = data_num_processing(data_num)
    
    if is_PCA:
        data_num = data_num_PCA(data_num)
        
    print('Processing the categorical columns...')
    data_cat = data_cat_fill_nan(data_cat)
    
    data_cat = data_cat_LE(data_cat)
    
    if is_OHE:    
        data_cat = data_cat_OHE(data_cat)
        
    data_result = data_join(data_num, data_cat, is_OHE=is_OHE)
    
    if is_OverSampling:
        data_result, to_return['y'] = data_OverSampling(data_result, y)
    
    to_return['X'] = data_result
    to_return['num_interval'] = num_interval
    to_return['cat_interval'] = cat_interval
        
    print('Done!')
    
    return to_return

In [5]:
# Функция удаления пустых колонок

def drop_null_cols(data, num_interval, cat_interval):
    
    print('Dropping the null columns...')
    dropped_num = []
    dropped_cat = []
    
    for idx, each_col in enumerate(data.columns):
        if len(data[pd.isnull(data[each_col])==True])==data.shape[0]:
            if idx > 190:
                dropped_cat.append(each_col)
            else:
                dropped_num.append(each_col)
                
    for each_col in dropped_num:
        data.drop(each_col,axis = 1,inplace = True)
        print("Dropped num-type column: {0}".format(each_col))
    
    for each_col in dropped_cat:
        data.drop(each_col,axis = 1,inplace = True)
        print("Dropped cal-type column: {0}".format(each_col))
    
    max_num = 190 - len(dropped_num)
    
    print("Dropped {0} num-type cols and {1} cat-types cols".format(len(dropped_num), len(dropped_cat)))
    print("Num-type feature have the indices from {0} up to {1}.".format(0, max_num))
    print("Cal-type feature have the indices from {0} up to {1}.".format(max_num+1, data.shape[1]))
    
    
    return data, (0, max_num), (max_num+1, data.shape[1])

In [6]:
# Функция разделения числовых и категориальных признаков

def split_num_cat(data, num_interval):
    print('Splitting the numeric and categorical columns...')
    data_num, data_cat = np.split(data, [num_interval[1]], axis=1)
    return data_num, data_cat

In [7]:
# Функция обработки числовых данных: заполнение пустых значений матожиданием
# и нормировка по каждому из признаков

def data_num_processing(data_num):
    print('Filling the absent values with the columns\' means...')
    data_num_transformed = data_num.fillna(data_num.mean())
    
    print('Normalizing the data...')
    for each_col in data_num_transformed.columns:
        if data_num_transformed[each_col].std():
            data_num_transformed[each_col]=(data_num_transformed[each_col]-data_num_transformed[each_col].mean()) / float(data_num_transformed[each_col].std())
        else:
            data_num_transformed[each_col]=data_num_transformed[each_col]-data_num_transformed[each_col].mean()
    
    return data_num_transformed

In [8]:
# Применение метода главных компонент к числовым признакам

def data_num_PCA(data_num, n_components=20):
    print('Appling a PCA to the numerical columns...')
    pca = PCA(n_components=n_components)
    data_PCA = pca.fit_transform(data_num)
    return data_PCA

In [9]:
# Заполнение пустых ячеек категориальных признаков значением 'unknown'

def data_cat_fill_nan(data_cat):
    print('Filling the absent categorical values...')
    data_cat_trasformed = data_cat.fillna('unknown')
    return data_cat_trasformed

In [10]:
# Применение Label Encoder к категориальным данным

def data_cat_LE(data_cat):
    print('Applying a LabelEncoder to the categorical columns...')
    
    data_cat = data_cat.apply(lambda x: x.astype(str))
    data_cat_transformed = data_cat.apply(lambda x: x.astype('category'))
    
    MyLabelEncoder = defaultdict(LabelEncoder)
    data_cat_transformed = data_cat_transformed.apply(lambda x: MyLabelEncoder[x.name].fit_transform(x))
    
    return data_cat_transformed

In [11]:
# Применение One Hot Encoder к категориальным данным

def data_cat_OHE(data_cat):
    print('Applying a OneHotEncoder to categorical columns...')
    data_cat_transformed = OneHotEncoder().fit_transform(data_cat)
    return data_cat_transformed

In [12]:
# Соединение обработанных числовых и категориальных признаков после обработки
# указание is_OHE необходимо, поскольку после без One Hot Encoder data_cat - dataframe, 
# без него - csr_matrix

def data_join(data_num, data_cat, is_OHE):
    print('Joining the numerical and categorical columns...')
    if not is_OHE:
        data_result = pd.concat([data_num, data_cat], axis=1, join_axes=[data_num.index])
        data_result = csr_matrix(data_num)
    else:
        data_num_sparsed = csr_matrix(data_num)
        data_result = hstack((data_num_sparsed, data_cat))
    return data_result

In [13]:
# OverSampling
# Применимо только если есть значения labels

def data_OverSampling(data, labels):
    print('OverSampling in progress...')
    ros = RandomOverSampler(random_state=0)
    X_num_cat_resampled, y_resampled = ros.fit_sample(data, labels['target'].ravel())
    return X_num_cat_resampled, y_resampled

In [47]:
# Функция расчета метрик отложенной выборки

def get_scores_hold_out(X_hold_out, y_hold_out, Classifier, configuration, scores):
    
    print('Calculating scores...')
    
    y_proba = Classifier.predict_proba(X_hold_out)[:, 1]
    y_pred = Classifier.predict(X_hold_out)

    results = precision_recall_fscore_support(y_hold_out, y_pred)
    pr_scores = average_precision_score(y_hold_out, y_proba)
    roc_auc_scores = roc_auc_score(y_hold_out, y_proba)
    
    scores.loc[configuration, 'precision'] = results[0][1]
    scores.loc[configuration, 'recall'] = results[1][1]
    scores.loc[configuration, 'f1-score'] = results[2][1]
    
    scores.loc[configuration, 'PR-score'] = pr_scores
    scores.loc[configuration, 'ROC AUC'] = roc_auc_scores
    
    print('Done!')
    return True

In [28]:
# Функция расчета метрик обучающей выборки

def get_scores(X, y, Classifier, configuration, scores):
    
    skf = StratifiedKFold(n_splits=5, random_state=123)
    skf.get_n_splits(X, y)
    skf.split(X, y)
    
    print('Calculating scores...')
    results = cross_validate(Classifier, X, y, cv=skf, scoring=['precision', 'recall', 'f1', 'average_precision', 'roc_auc'], return_train_score=True)
    
    scores.loc[configuration, 'precision test'] = results['test_precision'].mean()
    scores.loc[configuration, 'precision train'] = results['train_precision'].mean()
    scores.loc[configuration, 'recall test'] = results['test_recall'].mean()
    scores.loc[configuration, 'recall train'] = results['train_recall'].mean()
    scores.loc[configuration, 'f1-score test'] = results['test_f1'].mean()
    scores.loc[configuration, 'f1-score train'] = results['train_f1'].mean()
    scores.loc[configuration, 'PR-score test'] = results['test_average_precision'].mean()
    scores.loc[configuration, 'PR-score train'] = results['train_average_precision'].mean()
    scores.loc[configuration, 'ROC AUC test'] = results['test_roc_auc'].mean()
    scores.loc[configuration, 'ROC AUC train'] = results['train_roc_auc'].mean()
    
    print('Done!')
    return True

## 2. Оптимизация обработки данных и классификатора

In [16]:
# Датафреймы с метриками различных конфигураций обработки данных и классификатора
# для обучающей и отложенной выборок

scores = pd.DataFrame(columns = ['precision test', 'precision train', 'recall test', 'recall train', 'f1-score test', 'f1-score train','PR-score test','PR-score train', 'ROC AUC test', 'ROC AUC train'])
scores_hold_out = pd.DataFrame(columns = ['precision', 'recall', 'f1-score', 'PR-score', 'ROC AUC'])

In [17]:
# Топ-20 числовых признаков по корреляции
num_top_20 = ['Var188',
 'Var130',
 'Var189',
 'Var114',
 'Var73',
 'Var111',
 'Var126',
 'Var7',
 'Var177',
 'Var168',
 'Var139',
 'Var53',
 'Var142',
 'Var92',
 'Var144',
 'Var147',
 'Var69',
 'Var136',
 'Var110',
 'Var51']

# Топ-20 категориальных признаков по корреляции
cat_top_20 = ['Var199',
 'Var192',
 'Var216',
 'Var206',
 'Var212',
 'Var205',
 'Var228',
 'Var193',
 'Var207',
 'Var227',
 'Var204',
 'Var221',
 'Var210',
 'Var218',
 'Var200',
 'Var214',
 'Var226',
 'Var197',
 'Var211',
 'Var225']

### 2.1 Label Enconding без отборка признаков

In [55]:
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [56]:
data_transformed = transform_data(data=data)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [57]:
X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X'], 
                                                            labels, 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=labels)
print(X_train.shape[0]/float(data_transformed['X'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X'].shape[0]))
print(y_train.shape[0]/float(labels.shape[0]))
print(y_hold_out.shape[0]/float(labels.shape[0]))

0.8
0.2
0.8
0.2


In [58]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'LE_all_cols', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_all_cols', scores_hold_out)

Calculating scores...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Done!
Calculating scores...
Done!


True

In [64]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733


In [65]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504


### 2.2 Label Enconding с отбором топ-20 числовых признаков

In [66]:
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [67]:
data_transformed = transform_data(data=data, num_cols=num_top_20)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [68]:
X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X'], 
                                                            labels, 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=labels)
print(X_train.shape[0]/float(data_transformed['X'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X'].shape[0]))
print(y_train.shape[0]/float(labels.shape[0]))
print(y_hold_out.shape[0]/float(labels.shape[0]))

0.8
0.2
0.8
0.2


In [69]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'LE_top_20_num', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_top_20_num', scores_hold_out)

Calculating scores...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Done!
Calculating scores...
Done!


True

In [70]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665


In [71]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015


* Отбор числовых признаков значительно улучшил точность; 
* Тем не менее, recall, f1-score и PR-score недопустимо малы, поэтому следует и дальше модифицировать алгоритм обработки данных.

### 2.3 Label Enconding и отбор топ-20 числовых и топ-20 категориальных признаков

In [72]:
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [73]:
data_transformed = transform_data(data=data, num_cols=num_top_20, cat_cols=cat_top_20)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [74]:
X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X'], 
                                                            labels, 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=labels)
print(X_train.shape[0]/float(data_transformed['X'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X'].shape[0]))
print(y_train.shape[0]/float(labels.shape[0]))
print(y_hold_out.shape[0]/float(labels.shape[0]))

0.8
0.2
0.8
0.2


In [75]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'LE_top_20_num_top_20_cat', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_top_20_num_top_20_cat', scores_hold_out)

Calculating scores...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Done!
Calculating scores...
Done!


True

In [76]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665
LE_top_20_num_top_20_cat,0.481429,0.975909,0.00546042,0.0133346,0.0107919,0.0263056,0.178917,0.239818,0.71269,0.747665


In [77]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015
LE_top_20_num_top_20_cat,0.833333,0.00840336,0.0166389,0.179293,0.72296


* Отбор категориальных признаков уменьших количество вычислений, незначительно изменив метрики;

* Тем не менее, recall, f1-score и PR-score недопустимо малы, поэтому следует и дальше модифицировать алгоритм обработки данных.

### 2.3 Label Encoding + One Hot Encoding + и отбор топ-20 числовых и топ-20 категориальных признаков¶

In [80]:
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [81]:
data_transformed = transform_data(data=data, num_cols=num_top_20, cat_cols=cat_top_20, is_OHE=True)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [82]:
X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X'], 
                                                            labels, 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=labels)
print(X_train.shape[0]/float(data_transformed['X'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X'].shape[0]))
print(y_train.shape[0]/float(labels.shape[0]))
print(y_hold_out.shape[0]/float(labels.shape[0]))

0.8
0.2
0.8
0.2


In [83]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'LE_OHE_top_20_num_top_20_cat', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_OHE_top_20_num_top_20_cat', scores_hold_out)

Calculating scores...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Done!
Calculating scores...
Done!


True

In [84]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665
LE_top_20_num_top_20_cat,0.481429,0.975909,0.00546042,0.0133346,0.0107919,0.0263056,0.178917,0.239818,0.71269,0.747665
LE_OHE_top_20_num_top_20_cat,0.381602,0.929314,0.0084016,0.0140699,0.0164207,0.0277036,0.197827,0.367195,0.72685,0.815824


In [85]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015
LE_top_20_num_top_20_cat,0.833333,0.00840336,0.0166389,0.179293,0.72296
LE_OHE_top_20_num_top_20_cat,0.428571,0.00504202,0.00996678,0.185482,0.726427


* Применение One Hot Encoding ухудшило точность, recall, f1-score	и улучшила интегральные показатели PR-score и ROC AUC;

* В дальнейшем следует пробовать комбинации как с One Hot Encoding, так и без него
* recall, f1-score и PR-score недопустимо малы, поэтому следует и дальше модифицировать алгоритм обработки данных.

### 2.4 Label Encoding и отбор топ-20 числовых и топ-20 категориальных признаков + Over Sampling

In [91]:
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [92]:
data_transformed = transform_data(data=data, y=labels, num_cols=num_top_20, cat_cols=cat_top_20, is_OverSampling=True)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [95]:
X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X'], 
                                                            data_transformed['y'], 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=data_transformed['y'])
print(X_train.shape[0]/float(data_transformed['X'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X'].shape[0]))
print(y_train.shape[0]/float(data_transformed['y'].shape[0]))
print(y_hold_out.shape[0]/float(data_transformed['y'].shape[0]))

0.7999945980985307
0.20000540190146932
0.7999945980985307
0.20000540190146932


In [96]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'LE_top_20_num_top_20_cat_OverSampling', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_top_20_num_top_20_cat_OverSampling', scores_hold_out)

Calculating scores...
Done!
Calculating scores...
Done!


True

In [97]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665
LE_top_20_num_top_20_cat,0.481429,0.975909,0.00546042,0.0133346,0.0107919,0.0263056,0.178917,0.239818,0.71269,0.747665
LE_OHE_top_20_num_top_20_cat,0.381602,0.929314,0.0084016,0.0140699,0.0164207,0.0277036,0.197827,0.367195,0.72685,0.815824
LE_top_20_num_top_20_cat_OverSampling,0.655916,0.659534,0.714946,0.718736,0.684114,0.687837,0.719143,0.724094,0.739909,0.744958


In [98]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015
LE_top_20_num_top_20_cat,0.833333,0.00840336,0.0166389,0.179293,0.72296
LE_OHE_top_20_num_top_20_cat,0.428571,0.00504202,0.00996678,0.185482,0.726427
LE_top_20_num_top_20_cat_OverSampling,0.663269,0.729372,0.694752,0.724446,0.746608


* Значительно улучшились все показатели, особенно recall, f1-score и PR-score, что важно, учитывая несбалансированный характер исходных данных;

### 2.5 Label Encoding + One Hot Encoding + отбор топ-20 числовых и топ-20 категориальных признаков + Over Sampling

In [99]:
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [100]:
data_transformed = transform_data(data=data, y=labels, num_cols=num_top_20, cat_cols=cat_top_20, 
                                  is_OverSampling=True, is_OHE=True)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [101]:
X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X'], 
                                                            data_transformed['y'], 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=data_transformed['y'])
print(X_train.shape[0]/float(data_transformed['X'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X'].shape[0]))
print(y_train.shape[0]/float(data_transformed['y'].shape[0]))
print(y_hold_out.shape[0]/float(data_transformed['y'].shape[0]))

0.7999945980985307
0.20000540190146932
0.7999945980985307
0.20000540190146932


In [102]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling', scores_hold_out)

Calculating scores...
Done!
Calculating scores...
Done!


True

In [103]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665
LE_top_20_num_top_20_cat,0.481429,0.975909,0.00546042,0.0133346,0.0107919,0.0263056,0.178917,0.239818,0.71269,0.747665
LE_OHE_top_20_num_top_20_cat,0.381602,0.929314,0.0084016,0.0140699,0.0164207,0.0277036,0.197827,0.367195,0.72685,0.815824
LE_top_20_num_top_20_cat_OverSampling,0.655916,0.659534,0.714946,0.718736,0.684114,0.687837,0.719143,0.724094,0.739909,0.744958
LE_OHE_top_20_num_top_20_cat_OverSampling,0.685121,0.690455,0.704615,0.71217,0.694729,0.701142,0.749707,0.759348,0.766268,0.774798


In [104]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015
LE_top_20_num_top_20_cat,0.833333,0.00840336,0.0166389,0.179293,0.72296
LE_OHE_top_20_num_top_20_cat,0.428571,0.00504202,0.00996678,0.185482,0.726427
LE_top_20_num_top_20_cat_OverSampling,0.663269,0.729372,0.694752,0.724446,0.746608
LE_OHE_top_20_num_top_20_cat_OverSampling,0.692571,0.711276,0.701799,0.757314,0.772186


* Улучшились все показатели, как recall и f1-score, так и интергральные ROC AUC и PR-score

### 2.6 Настройка классификатора

In [105]:
# Уменьшение скорости обучения

gbc = GradientBoostingClassifier(learning_rate=0.05)
get_scores(X_train, y_train, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05', 
                    scores_hold_out)

Calculating scores...
Done!
Calculating scores...
Done!


True

In [106]:
# Уменьшение доли элементов, используемых в обучении, -> стохастический градиентный спуск и уменьшение дисперсии

gbc = GradientBoostingClassifier(subsample=0.5)
get_scores(X_train, y_train, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling_ss0.5', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling_ss0.5', scores_hold_out)

Calculating scores...
Done!
Calculating scores...
Done!


True

In [107]:
# Комбинация уменьшения скорости обучения и доли элементов, используемых в обучении

gbc = GradientBoostingClassifier(learning_rate=0.05, subsample=0.5)
get_scores(X_train, y_train, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05_ss0.5', scores)
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05_ss0.5', 
                    scores_hold_out)

Calculating scores...
Done!
Calculating scores...
Done!


True

In [108]:
scores

Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665
LE_top_20_num_top_20_cat,0.481429,0.975909,0.00546042,0.0133346,0.0107919,0.0263056,0.178917,0.239818,0.71269,0.747665
LE_OHE_top_20_num_top_20_cat,0.381602,0.929314,0.0084016,0.0140699,0.0164207,0.0277036,0.197827,0.367195,0.72685,0.815824
LE_top_20_num_top_20_cat_OverSampling,0.655916,0.659534,0.714946,0.718736,0.684114,0.687837,0.719143,0.724094,0.739909,0.744958
LE_OHE_top_20_num_top_20_cat_OverSampling,0.685121,0.690455,0.704615,0.71217,0.694729,0.701142,0.749707,0.759348,0.766268,0.774798
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05,0.672929,0.67755,0.699551,0.704084,0.68598,0.69056,0.738607,0.745166,0.752393,0.757689
LE_OHE_top_20_num_top_20_cat_OverSampling_ss0.5,0.687044,0.692741,0.706709,0.710963,0.696735,0.701731,0.752413,0.761964,0.770079,0.778302
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05_ss0.5,0.673707,0.678013,0.697897,0.703476,0.685581,0.690505,0.740598,0.746502,0.753691,0.758713


In [109]:
scores_hold_out

Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015
LE_top_20_num_top_20_cat,0.833333,0.00840336,0.0166389,0.179293,0.72296
LE_OHE_top_20_num_top_20_cat,0.428571,0.00504202,0.00996678,0.185482,0.726427
LE_top_20_num_top_20_cat_OverSampling,0.663269,0.729372,0.694752,0.724446,0.746608
LE_OHE_top_20_num_top_20_cat_OverSampling,0.692571,0.711276,0.701799,0.757314,0.772186
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05,0.683949,0.705469,0.694542,0.746759,0.759108
LE_OHE_top_20_num_top_20_cat_OverSampling_ss0.5,0.693808,0.715733,0.7046,0.758046,0.774817
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05_ss0.5,0.681724,0.707225,0.69424,0.746513,0.759469


* Настройка классификатора значительного улучшения метрик не дала при существенном увеличиении времени расчетов;
* Применение дополнительных настроек классификатора нецелесообразно

## 3 Применение наилучшего алгоритма к данным из kaggle

* Наилучшее сочетание качества метрик, количество вычислений и времени вычислений показала комбинация LE_OHE_top_20_num_top_20_cat_OverSampling, соответствующая Label Encoding + One Hot Encoding + отбор топ-20 числовых и топ-20 категориальных признаков + Over Sampling
* Указаннай алгоритм будет применен к данным kaggle

In [128]:
# Загрузка обучающей выборки
data = pd.read_csv('orange_small_churn_data.txt', header=0, sep=',')
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)
labels.columns = ['target']
labels = labels.apply(lambda x: x.astype(int))

In [129]:
# Загрузка данных kaggle
data_kaggle = pd.read_csv('orange_small_churn_test_data.csv', header=0, sep=',')

In [130]:
data_kaggle = data_kaggle.drop(['ID'], axis=1)
data_kaggle.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1225.0,7.0,,,,...,zCkv,APgdzOv,jySVZNlOJy,,ELof,xb3V,6fzt,Zy3gnGM,,
1,,,,,,896.0,14.0,,,,...,oslk,IIvC99a,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,
2,,,,,,791.0,7.0,,,,...,oslk,6YSocsg,LM8l689qOp,,kG3k,rgKb,RAYp,F2FyR07IdsN7I,mj86,
3,,,,,,2296.0,7.0,,,,...,oslk,5nQ7A2G,jySVZNlOJy,,kG3k,rgKb,RAYp,F2FyR07IdsN7I,am7c,
4,8.0,,,,,,,,28.0,,...,oslk,MI8s5nE,LM8l689qOp,,,7P5s,RAYp,F2FyR07IdsN7I,,


In [131]:
data_kaggle.shape

(10000, 230)

In [132]:
# Объединение обучающей выборки и данных kaggle

# Объединение необходимо, чтобы категориальные данные 
# были закодированы Label Encoding и One Hot Encoding единообразно
# и чтобы исключить наличие значений категориальных данных,
# которые отсутствуют в обучающей выборке и присутствуют в данных из kaggle

all_data = pd.concat([data, data_kaggle], ignore_index=True)

In [134]:
# is_OverSampling=False, поскольку для данных с kaggle нет labels
# Их, собственно, и нужно рассчитать

# Over Sampling будет применен далее отдельно к обучающей выборке

data_transformed = transform_data(data=all_data, num_cols=num_top_20, cat_cols=cat_top_20, is_OHE=True,
                                  is_OverSampling=False)

Dropping the null columns...
Dropped num-type column: Var8
Dropped num-type column: Var15
Dropped num-type column: Var20
Dropped num-type column: Var31
Dropped num-type column: Var32
Dropped num-type column: Var39
Dropped num-type column: Var42
Dropped num-type column: Var48
Dropped num-type column: Var52
Dropped num-type column: Var55
Dropped num-type column: Var79
Dropped num-type column: Var141
Dropped num-type column: Var167
Dropped num-type column: Var169
Dropped num-type column: Var175
Dropped num-type column: Var185
Dropped cal-type column: Var209
Dropped cal-type column: Var230
Dropped 16 num-type cols and 2 cat-types cols
Num-type feature have the indices from 0 up to 174.
Cal-type feature have the indices from 175 up to 212.
Splitting the numeric and categorical columns...
Processing the numeric columns...
Filling the absent values with the columns' means...
Normalizing the data...
Processing the categorical columns...
Filling the absent categorical values...
Applying a Label

In [135]:
data_transformed['X']

<50000x38884 sparse matrix of type '<class 'numpy.float64'>'
	with 1950702 stored elements in COOrdinate format>

In [136]:
# Данные обучающей выборки

data_transformed['X_train'] = data_transformed['X'].tocsr()[0:40000,]

In [137]:
# Данные kaggle

data_transformed['X_test'] = data_transformed['X'].tocsr()[40000:50000,]

In [138]:
# Применение data_OverSampling() к обучающей выборке

data_transformed['X_train_resampled'], data_transformed['y_resampled'] = data_OverSampling(data_transformed['X_train'], labels)

OverSampling in progress...


In [139]:
data_transformed['X_train_resampled']

<74048x38884 sparse matrix of type '<class 'numpy.float64'>'
	with 2888850 stored elements in Compressed Sparse Row format>

In [140]:
data_transformed['y_resampled']

array([-1, -1, -1, ...,  1,  1,  1])

In [142]:
# Выделение отложенной выборки

X_train, X_hold_out, y_train, y_hold_out = train_test_split(data_transformed['X_train_resampled'], 
                                                            data_transformed['y_resampled'], 
                                                            test_size=0.2, random_state=123, 
                                                            stratify=data_transformed['y_resampled'])

print(X_train.shape[0]/float(data_transformed['X_train_resampled'].shape[0]))
print(X_hold_out.shape[0]/float(data_transformed['X_train_resampled'].shape[0]))
print(y_train.shape[0]/float(data_transformed['y_resampled'].shape[0]))
print(y_hold_out.shape[0]/float(data_transformed['y_resampled'].shape[0]))

0.7999945980985307
0.20000540190146932
0.7999945980985307
0.20000540190146932


In [144]:
gbc = GradientBoostingClassifier()
get_scores(X_train, y_train, gbc, 'gbc_result', scores)
scores

Calculating scores...
Done!


Unnamed: 0,precision test,precision train,recall test,recall train,f1-score test,f1-score train,PR-score test,PR-score train,ROC AUC test,ROC AUC train
LE_all_cols,0.277857,0.954656,0.00462185,0.0173245,0.00908926,0.0340269,0.176807,0.300872,0.719549,0.780733
LE_top_20_num,0.498333,0.975909,0.00588059,0.0133346,0.0116149,0.0263056,0.178693,0.239818,0.712654,0.747665
LE_top_20_num_top_20_cat,0.481429,0.975909,0.00546042,0.0133346,0.0107919,0.0263056,0.178917,0.239818,0.71269,0.747665
LE_OHE_top_20_num_top_20_cat,0.381602,0.929314,0.0084016,0.0140699,0.0164207,0.0277036,0.197827,0.367195,0.72685,0.815824
LE_top_20_num_top_20_cat_OverSampling,0.655916,0.659534,0.714946,0.718736,0.684114,0.687837,0.719143,0.724094,0.739909,0.744958
LE_OHE_top_20_num_top_20_cat_OverSampling,0.685121,0.690455,0.704615,0.71217,0.694729,0.701142,0.749707,0.759348,0.766268,0.774798
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05,0.672929,0.67755,0.699551,0.704084,0.68598,0.69056,0.738607,0.745166,0.752393,0.757689
LE_OHE_top_20_num_top_20_cat_OverSampling_ss0.5,0.687044,0.692741,0.706709,0.710963,0.696735,0.701731,0.752413,0.761964,0.770079,0.778302
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05_ss0.5,0.673707,0.678013,0.697897,0.703476,0.685581,0.690505,0.740598,0.746502,0.753691,0.758713
gbc_result,0.685077,0.690455,0.704615,0.71217,0.694706,0.701142,0.749717,0.759348,0.766273,0.774798


In [145]:
gbc.fit(X_train, y_train)
get_scores_hold_out(X_hold_out, y_hold_out, gbc, 'gbc_result', scores_hold_out)
scores_hold_out

Calculating scores...
Done!


Unnamed: 0,precision,recall,f1-score,PR-score,ROC AUC
LE_all_cols,0.272727,0.00504202,0.00990099,0.172544,0.723504
LE_top_20_num,0.833333,0.00840336,0.0166389,0.179318,0.723015
LE_top_20_num_top_20_cat,0.833333,0.00840336,0.0166389,0.179293,0.72296
LE_OHE_top_20_num_top_20_cat,0.428571,0.00504202,0.00996678,0.185482,0.726427
LE_top_20_num_top_20_cat_OverSampling,0.663269,0.729372,0.694752,0.724446,0.746608
LE_OHE_top_20_num_top_20_cat_OverSampling,0.692571,0.711276,0.701799,0.757314,0.772186
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05,0.683949,0.705469,0.694542,0.746759,0.759108
LE_OHE_top_20_num_top_20_cat_OverSampling_ss0.5,0.693808,0.715733,0.7046,0.758046,0.774817
LE_OHE_top_20_num_top_20_cat_OverSampling_lr0.05_ss0.5,0.681724,0.707225,0.69424,0.746513,0.759469
gbc_result,0.692571,0.711276,0.701799,0.757275,0.772132


In [146]:
# Расчет вероятности принадлежности к классу "отток"

y_pred_gbc = gbc.predict_proba(data_transformed['X_test'])[:, 1]

In [147]:
len(y_pred_gbc)

10000

In [148]:
# Запись результатов в файл

results_gbc = pd.DataFrame(y_pred_gbc, columns = ['result'])
results_gbc.index.name = 'Id'
results_gbc.to_csv('week4_gbc_LE_OHE_top_20_num_top_20_cat_OverSampling_26_08_2018.csv')

In [149]:
results_gbc.head()

Unnamed: 0_level_0,result
Id,Unnamed: 1_level_1
0,0.424904
1,0.58182
2,0.229808
3,0.490462
4,0.179715
