In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.model_selection import GridSearchCV
from sklearn import feature_selection as fs
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

Загрузим тренировочные и тестовые данные

In [2]:
test = pd.read_csv('orange_small_churn_test_data.csv')
train = pd.read_csv('orange_small_churn_data.txt')

train.shape, test.shape

((40000, 230), (10000, 231))

In [3]:
test.drop(columns='ID', inplace=True)

Объединим выборки для дальнейшего рассмотрения.
Удалим из рассмотрения признаки с нулевыми корреляциями.

In [4]:
data = pd.concat([train, test], axis=0)
data.shape

(50000, 230)

In [5]:
data.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var183,Var184,Var185,Var186,Var187,Var188,Var189,Var190,Var209,Var230
count,702.0,1241.0,1240.0,1579.0,1487.0,44471.0,44461.0,0.0,702.0,1487.0,...,1241.0,1241.0,0.0,702.0,702.0,1241.0,21022.0,333.0,0.0,0.0
mean,11.487179,0.004029,425.298387,0.125396,238793.3,1326.437116,6.809496,,48.145299,392605.7,...,77773.8,8.460919,,3.299145,16.54416,167.368477,270.142137,22007.045192,,
std,40.709951,0.141933,4270.193518,1.275481,644125.9,2685.693668,6.326053,,154.777855,928089.6,...,201618.8,46.973777,,8.781967,60.22303,113.980072,86.707692,29085.14649,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.0,0.0,,0.0,0.0,-6.42,6.0,0.0,,
25%,0.0,0.0,0.0,0.0,0.0,518.0,0.0,,4.0,0.0,...,0.0,0.0,,0.0,0.0,19.38,204.0,2732.67,,
50%,0.0,0.0,0.0,0.0,0.0,861.0,7.0,,20.0,0.0,...,0.0,0.0,,0.0,4.0,197.64,270.0,12668.94,,
75%,16.0,0.0,0.0,0.0,118742.5,1428.0,7.0,,46.0,262863.0,...,48810.0,8.0,,6.0,14.0,252.96,330.0,29396.34,,
max,680.0,5.0,130668.0,27.0,6048550.0,131761.0,140.0,,2300.0,12325590.0,...,3048400.0,1200.0,,102.0,910.0,628.62,642.0,230427.0,,


In [6]:
fts_null = ['Var209', 'Var230', 'Var31', 'Var167', 'Var32', 'Var175', 'Var15', 'Var39', 'Var185',
       'Var42', 'Var48', 'Var8', 'Var141', 'Var79', 'Var55', 'Var169', 'Var52',
       'Var20']

In [7]:
data = data.drop(columns=fts_null)
data.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var180,Var181,Var182,Var183,Var184,Var186,Var187,Var188,Var189,Var190
count,702.0,1241.0,1240.0,1579.0,1487.0,44471.0,44461.0,702.0,1487.0,1240.0,...,702.0,44991.0,1579.0,1241.0,1241.0,702.0,702.0,1241.0,21022.0,333.0
mean,11.487179,0.004029,425.298387,0.125396,238793.3,1326.437116,6.809496,48.145299,392605.7,8.625806,...,3776755.0,0.611456,1416638.0,77773.8,8.460919,3.299145,16.54416,167.368477,270.142137,22007.045192
std,40.709951,0.141933,4270.193518,1.275481,644125.9,2685.693668,6.326053,154.777855,928089.6,2.869558,...,3785696.0,2.495681,2279786.0,201618.8,46.973777,8.781967,60.22303,113.980072,86.707692,29085.14649
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.42,6.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,518.0,0.0,4.0,0.0,8.0,...,191735.2,0.0,0.0,0.0,0.0,0.0,0.0,19.38,204.0,2732.67
50%,0.0,0.0,0.0,0.0,0.0,861.0,7.0,20.0,0.0,8.0,...,2431310.0,0.0,116778.0,0.0,0.0,0.0,4.0,197.64,270.0,12668.94
75%,16.0,0.0,0.0,0.0,118742.5,1428.0,7.0,46.0,262863.0,8.0,...,6471827.0,0.0,1844952.0,48810.0,8.0,6.0,14.0,252.96,330.0,29396.34
max,680.0,5.0,130668.0,27.0,6048550.0,131761.0,140.0,2300.0,12325590.0,40.0,...,14284830.0,49.0,11994780.0,3048400.0,1200.0,102.0,910.0,628.62,642.0,230427.0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 9999
Columns: 212 entries, Var1 to Var229
dtypes: float64(173), int64(1), object(38)
memory usage: 81.3+ MB


Выделим числовые и категорийные признаки

In [9]:
data_num = data.ix[:, :'Var190']
data_cat = data.ix[:,'Var191':]


In [10]:
data_num.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var180,Var181,Var182,Var183,Var184,Var186,Var187,Var188,Var189,Var190
count,702.0,1241.0,1240.0,1579.0,1487.0,44471.0,44461.0,702.0,1487.0,1240.0,...,702.0,44991.0,1579.0,1241.0,1241.0,702.0,702.0,1241.0,21022.0,333.0
mean,11.487179,0.004029,425.298387,0.125396,238793.3,1326.437116,6.809496,48.145299,392605.7,8.625806,...,3776755.0,0.611456,1416638.0,77773.8,8.460919,3.299145,16.54416,167.368477,270.142137,22007.045192
std,40.709951,0.141933,4270.193518,1.275481,644125.9,2685.693668,6.326053,154.777855,928089.6,2.869558,...,3785696.0,2.495681,2279786.0,201618.8,46.973777,8.781967,60.22303,113.980072,86.707692,29085.14649
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.42,6.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,518.0,0.0,4.0,0.0,8.0,...,191735.2,0.0,0.0,0.0,0.0,0.0,0.0,19.38,204.0,2732.67
50%,0.0,0.0,0.0,0.0,0.0,861.0,7.0,20.0,0.0,8.0,...,2431310.0,0.0,116778.0,0.0,0.0,0.0,4.0,197.64,270.0,12668.94
75%,16.0,0.0,0.0,0.0,118742.5,1428.0,7.0,46.0,262863.0,8.0,...,6471827.0,0.0,1844952.0,48810.0,8.0,6.0,14.0,252.96,330.0,29396.34
max,680.0,5.0,130668.0,27.0,6048550.0,131761.0,140.0,2300.0,12325590.0,40.0,...,14284830.0,49.0,11994780.0,3048400.0,1200.0,102.0,910.0,628.62,642.0,230427.0


In [11]:
data_cat.describe()

Unnamed: 0,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
count,1083,49631,50000,12784,50000,50000,49857,50000,49996,24592,...,50000,50000,50000,44789,820,23856,50000,50000,50000,21568
unique,1,361,51,3,23,4,225,4291,5073,15415,...,4291,7,4291,4,1,3,23,7,30,4
top,r__I,qFpmfo8zhV,RO12,SEuy,taul,1K8T,0Xwj,fhk21Ss,r83_sZi,yP09M03,...,4UxGlow,oslk,catzS2D,LM8l689qOp,4n2X,ELof,FSa2,RAYp,F2FyR07IdsN7I,am7c
freq,1083,385,35964,12567,47958,49550,4629,4441,955,73,...,4441,37009,4441,36608,820,11072,8031,35156,32703,11689


В категорийных пропуски заменим на "0"

In [12]:
data_cat.fillna('0', inplace=True)

закодируем категорийные признаки. в задаче baseline было установлено, что кодировать лучше в числа.

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
data_cat_le = pd.DataFrame()
for i in data_cat.columns:
    le = LabelEncoder()
    le.fit(data_cat[i])
    data_cat_le[i] = le.transform(data_cat[i])
    
data_cat_le.describe()


Unnamed: 0,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.02166,173.8533,20.9943,0.51406,21.50848,0.0263,103.60076,2223.29334,2498.1484,3788.87082,...,2003.12184,3.98408,2123.70206,1.30248,0.0164,0.7799,10.11572,2.10306,10.2334,0.6312
std,0.145572,107.065246,10.322155,0.879555,2.588865,0.27837,71.634777,1197.29489,1381.535728,4962.391785,...,1248.04682,1.119166,1183.541455,1.070238,0.127009,0.933506,5.798438,0.867994,6.488254,0.799074
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,82.0,25.0,0.0,22.0,0.0,29.0,1187.0,1292.0,0.0,...,826.0,4.0,1066.0,1.0,0.0,0.0,6.0,2.0,8.0,0.0
50%,0.0,165.0,25.0,0.0,22.0,0.0,110.0,2435.0,2456.5,0.0,...,2014.0,4.0,2230.0,1.0,0.0,0.0,10.0,2.0,8.0,0.0
75%,0.0,266.0,25.0,2.0,22.0,0.0,178.0,3119.0,3697.0,7574.25,...,3047.0,4.0,2984.25,1.0,0.0,2.0,14.0,2.0,8.0,1.0
max,1.0,361.0,50.0,3.0,22.0,3.0,225.0,4290.0,5073.0,15415.0,...,4290.0,6.0,4290.0,4.0,1.0,3.0,22.0,6.0,29.0,4.0


In [15]:
data_num.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var180,Var181,Var182,Var183,Var184,Var186,Var187,Var188,Var189,Var190
0,,,,,,3052.0,,,,,...,,0.0,,,,,,,,
1,,,,,,1813.0,7.0,,,,...,,0.0,,,,,,,276.0,
2,,,,,,1953.0,7.0,,,,...,,0.0,,,,,,,,
3,,,,,,1533.0,7.0,,,,...,,0.0,,,,,,,,
4,,,,,,686.0,7.0,,,,...,,0.0,,,,,,,,


Пропуски в числовых признаках заменим на среднее


In [16]:
for i in data_num.columns:
    current_mean = data_num.ix[:, i].mean()
    data_num.ix[:, i].fillna(current_mean, inplace=True)

In [17]:
data_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 9999
Columns: 174 entries, Var1 to Var190
dtypes: float64(173), int64(1)
memory usage: 66.8 MB


объединим датафрейм с закодированными данными

In [18]:
data_1 = pd.DataFrame(np.hstack((data_num, data_cat_le)), columns=data.columns)
data_1.shape

(50000, 212)

In [19]:
data_1.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
0,11.487179,0.004029,425.298387,0.125396,238793.32885,3052.0,6.809496,48.145299,392605.656355,8.625806,...,3129.0,0.0,4067.0,1.0,0.0,0.0,14.0,0.0,29.0,0.0
1,11.487179,0.004029,425.298387,0.125396,238793.32885,1813.0,7.0,48.145299,392605.656355,8.625806,...,3138.0,4.0,379.0,1.0,0.0,1.0,22.0,2.0,2.0,2.0
2,11.487179,0.004029,425.298387,0.125396,238793.32885,1953.0,7.0,48.145299,392605.656355,8.625806,...,350.0,6.0,2751.0,1.0,0.0,0.0,7.0,3.0,25.0,2.0
3,11.487179,0.004029,425.298387,0.125396,238793.32885,1533.0,7.0,48.145299,392605.656355,8.625806,...,4232.0,4.0,2864.0,1.0,0.0,0.0,22.0,2.0,8.0,0.0
4,11.487179,0.004029,425.298387,0.125396,238793.32885,686.0,7.0,48.145299,392605.656355,8.625806,...,1797.0,4.0,1506.0,1.0,0.0,0.0,12.0,2.0,8.0,0.0


In [20]:
data_1.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.487179,0.004029,425.298387,0.125396,238793.3,1326.437116,6.809496,48.145299,392605.7,8.625806,...,2003.12184,3.98408,2123.70206,1.30248,0.0164,0.7799,10.11572,2.10306,10.2334,0.6312
std,4.820354,0.022352,672.206258,0.226593,111045.1,2532.849475,5.965363,18.326822,159999.5,0.451721,...,1248.04682,1.119166,1183.541455,1.070238,0.127009,0.933506,5.798438,0.867994,6.488254,0.799074
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.487179,0.004029,425.298387,0.125396,238793.3,581.0,0.0,48.145299,392605.7,8.625806,...,826.0,4.0,1066.0,1.0,0.0,0.0,6.0,2.0,8.0,0.0
50%,11.487179,0.004029,425.298387,0.125396,238793.3,945.0,7.0,48.145299,392605.7,8.625806,...,2014.0,4.0,2230.0,1.0,0.0,0.0,10.0,2.0,8.0,0.0
75%,11.487179,0.004029,425.298387,0.125396,238793.3,1326.437116,7.0,48.145299,392605.7,8.625806,...,3047.0,4.0,2984.25,1.0,0.0,2.0,14.0,2.0,8.0,1.0
max,680.0,5.0,130668.0,27.0,6048550.0,131761.0,140.0,2300.0,12325590.0,40.0,...,4290.0,6.0,4290.0,4.0,1.0,3.0,22.0,6.0,29.0,4.0


Вернем разбиение train и test

In [21]:
train = data_1.ix[:39999, :]
test = data_1.ix[40000:, :]
train.shape, test.shape

((40000, 212), (10000, 212))

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 212 entries, Var1 to Var229
dtypes: float64(212)
memory usage: 64.7 MB


In [23]:
labels = pd.read_csv('orange_small_churn_labels.txt', header=None)

In [24]:
labels.head()

Unnamed: 0,0
0,-1
1,-1
2,-1
3,1
4,-1


Сократим количество признаков

In [25]:
from sklearn.svm import LinearSVC

In [26]:
sfm = fs.SelectFromModel(LinearSVC(penalty="l1", dual=False))
X = sfm.fit_transform(train, labels)
X.shape

(40000, 142)

Импортируем нужные нам модели

In [27]:
from sklearn.linear_model import RidgeClassifier as RC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBS
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

В задании baseline установили, что для кросс-валидации лучше использовать StratifiedKFold

In [28]:
cross_val = StratifiedKFold(n_splits=3, random_state=13)

Из метрик качества оставим только AUC-PRC. И посмотрим все модели.

In [29]:
rc = RC(random_state=13)
rfc = RFC(random_state=13)
gbs = GBS(random_state=13)
estimators = [rc, rfc, gbs]
current_score = []
for estim in estimators:
    print('Текущая модель', estim)
    current = cross_val_score(estim, train, labels, scoring = 'average_precision', cv = cross_val)
    current_score.append(current.mean())

Текущая модель RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=13, solver='auto',
        tol=0.001)
Текущая модель RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=13, verbose=0, warm_start=False)
Текущая модель GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
       

In [30]:
current_score

[0.14061107703966044, 0.10241796924644964, 0.19779511460025848]

Как видим, лучший результат у GradientBoostingClassifier. Оставим эту модель для дальнейшего рассмотрения

In [31]:
parameters={"max_depth": [5, 10, 15],
    'min_samples_leaf' : [1, 4, 5, 7, 10],
    'n_estimators' : [100, 200, 300]}

In [32]:
mod = GridSearchCV(GBS(random_state=13), param_grid=parameters, scoring='average_precision', cv=cross_val, n_jobs=-1)
%time mod.fit(X, labels)

CPU times: user 42.9 s, sys: 466 ms, total: 43.4 s
Wall time: 2h 14min 50s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=13, shuffle=False),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [5, 10, 15], 'min_samples_leaf': [1, 4, 5, 7, 10], 'n_estimators': [100, 200, 300]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='average_precision', verbose=0)

In [33]:
print( mod.best_score_)
print (mod.best_params_)

0.21119003630683114
{'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 100}


In [34]:
final_model = GBS(max_depth=5, n_estimators=100, min_samples_leaf=7, random_state=13)
final_model.fit(X, labels)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=7, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=13,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

Расчитаем ROC-AUC

In [35]:
cross_val_score(final_model, X, labels, scoring = metrics.scorer.roc_auc_scorer, cv = cross_val).mean()

0.7282572781076097

Сократим количество признаков у тестовых данных и сделаем прогноз

In [36]:
test = test.ix[:, sfm.get_support()]
test.shape

(10000, 142)

In [37]:
result = final_model.predict_proba(test)

In [38]:
res_data = pd.DataFrame()
res_data.head()

In [39]:
res_data['ID'] = range(len(result))
res_data['result'] = result[:,1]

In [40]:
res_data.head()

Unnamed: 0,ID,result
0,0,0.084701
1,1,0.117179
2,2,0.027984
3,3,0.071522
4,4,0.015797


In [41]:
res_data.to_csv('result6.csv', index=False)

Конечный результат: 0.71435