In [1]:
import pandas as pd

Считаем данные по транзакциям и правильные метки возрастных категорий.

In [2]:
transactions_train = pd.read_csv('../data/transactions_train.csv')

In [3]:
train_target = pd.read_csv('../data/train_target.csv')

In [4]:
transactions_train.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


* client_id - уникальный идентификатор клиента
* trans_date - дата совершения транзакции
* small_group - категория покупки
* amount_rur - сумма транзакции

In [5]:
train_target.head(5)

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


* client_id - уникальный идентификатор клиента, соответствует полю client_id из транзакций
* bins - целевая переменная, которую нужно предсказать, это категория возраста клиента

Посчитаем по каждому клиенту самые простые аггрегационные признаки.

In [6]:
agg_features=transactions_train.groupby('client_id')['amount_rur'].agg(['sum','mean','std','min','max']).reset_index()

In [7]:
agg_features.head()

Unnamed: 0,client_id,sum,mean,std,min,max
0,4,28404.121,39.450168,73.511624,0.043,1341.802
1,6,15720.739,21.535259,26.200397,0.045,315.781
2,7,53630.036,69.379089,253.261383,0.043,4505.971
3,10,34419.365,48.752642,63.191701,0.045,654.893
4,11,26789.404,32.991877,107.395139,0.388,2105.058


Посчитаем для каждого клиента количество транзакций по каждой категории.

In [8]:
counter_df_train=transactions_train.groupby(['client_id','small_group'])['amount_rur'].count()

In [9]:
cat_counts_train=counter_df_train.reset_index().pivot(index='client_id', \
                                                      columns='small_group',values='amount_rur')

In [10]:
cat_counts_train=cat_counts_train.fillna(0)

In [11]:
cat_counts_train.columns=['small_group_'+str(i) for i in cat_counts_train.columns]

In [12]:
cat_counts_train.head()

Unnamed: 0_level_0,small_group_0,small_group_1,small_group_2,small_group_3,small_group_4,small_group_5,small_group_6,small_group_7,small_group_8,small_group_9,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,447.0,1.0,44.0,93.0,0.0,0.0,0.0,1.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,397.0,0.0,172.0,10.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2.0,79.0,5.0,27.0,19.0,1.0,0.0,2.0,1.0,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,12.0,309.0,1.0,71.0,65.0,0.0,0.0,0.0,3.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2.0,423.0,0.0,59.0,23.0,3.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Далее соединим все файлы в один датафрейм с таргетом.

In [13]:
train=pd.merge(train_target,agg_features,on='client_id')

In [14]:
train=pd.merge(train,cat_counts_train.reset_index(),on='client_id')

In [15]:
train.head()

Unnamed: 0,client_id,bins,sum,mean,std,min,max,small_group_0,small_group_1,small_group_2,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
0,24662,2,30254.011,34.774725,72.037354,0.074,1227.314,0.0,174.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1046,0,42548.57,52.015367,106.540962,0.55,1210.506,1.0,187.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34089,2,26842.816,34.325852,59.92745,0.043,782.641,0.0,372.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34848,1,15773.126,16.16099,14.224936,0.043,109.59,0.0,359.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47076,3,12488.375,15.92905,35.473591,0.432,541.165,0.0,378.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:

import numpy as np


# импорт моделей
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import mean_squared_error, confusion_matrix


In [17]:
itog_val = {}
k_fold = 3
random_state = 777

In [18]:
X = train.drop('bins', axis=1)
y = train['bins']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 777)
print(X_train.shape, y_train.shape)

(20100, 208) (20100,)


In [20]:



def f_rmse_score(model, X_train, X_test, y_train, y_test):
    #среднеквадратичная ошибка или среднеквадратичное отклонение оценщика измеряет среднее значение квадратов ошибок, 
    #то есть среднеквадратическую разницу между оценочными значениями и фактическим значением. 
    mse_train = mean_squared_error(y_true = y_train,
                                  y_pred = model.predict(X_train))
    mse_test = mean_squared_error(y_true = y_test,
                                  y_pred = model.predict(X_test))
    rmse_train = mse_train ** 0.5
    rmse_test = mse_test ** 0.5
    
    print("The training RMSE for " + str(model) + " is: " + str(rmse_train))
    print("The testing RMSE for " + str(model) + " is: " + str(rmse_test))
    return (rmse_train, rmse_test)



def f_err_predict_test_train(model, X_train, X_test, y_train, y_test):
    # ошибки на предсказания меток моделью
    
    err_train = np.mean(y_train != model.predict(X_train))
    err_test  = np.mean(y_test  != model.predict(X_test))
    
    print("ошибки на обучающей: {0:.2f}%".format(err_train*100))
    print("ошибки на тестовой: {0:.2f}%".format(err_test*100))
    
    
      
def f_cross_val_scores(model, X, y, k_fold=3):
    #
    scoring = 'accuracy'
    rslt = cross_val_score(model, X, y, cv=k_fold, n_jobs=-1, scoring=scoring)
    print(f'Результат кроссвалидации = {rslt}')
    #print(f'Среднее значение кроссвализации {rslt.mean()}')
    
    return rslt.mean()
     

In [96]:

# модель KNeighborsClassifier
model_knc = KNeighborsClassifier(n_neighbors=33)
predict = model_knc.fit(X_train, y_train).predict(X_test)

#f_rmse_score(model_knc, X_train, X_test, y_train, y_test)

f_err_predict_test_train(model_knc, X_train, X_test, y_train, y_test)
#itog_val['KNeighborsClassifier'] = f_cross_val_scores(model_knc, X, y, k_fold)

print(accuracy_score(y_test, predict))   #0.3644

ошибки на обучающей: 57.07%
ошибки на тестовой: 63.56%
0.36444444444444446


In [22]:

# модель DecisionTreeClassifier
model_dtc = DecisionTreeClassifier()
predict = model_dtc.fit(X_train, y_train).predict(X_test)

#f_rmse_score(model_dtc, X_train, X_test, y_train, y_test)

f_err_predict_test_train(model_dtc, X_train, X_test, y_train, y_test)
itog_val['DecisionTreeClassifier'] = f_cross_val_scores(model_dtc, X, y, k_fold)

print(accuracy_score(y_test, predict))  #0.4659

ошибки на обучающей: 0.00%
ошибки на тестовой: 53.40%
Результат кроссвалидации = [0.4692 0.4602 0.4674]
0.46595959595959596


In [117]:
#model RandomForestClassifier
model_rfc = RandomForestClassifier(max_depth = 33, 
                                   min_samples_leaf = 8,
                                   min_samples_split = 2,
                                   n_estimators = 600,
                                   random_state=random_state)

predict = model_rfc.fit(X_train, y_train).predict(X_test)

#f_rmse_score(model_rfc, X_train, X_test, y_train, y_test)

f_err_predict_test_train(model_rfc, X_train, X_test, y_train, y_test)
#itog_val['RandomForestClassifier'] = f_cross_val_scores(model_rfc, X, y, k_fold)

print(accuracy_score(y_test, predict))   #0.5924

ошибки на обучающей: 12.19%
ошибки на тестовой: 40.76%
0.5924242424242424


In [115]:
#model Naive Bayes
model_nb = GaussianNB()
predict = model_nb.fit(X_train, y_train).predict(X_test)

#f_rmse_score(model_nb, X_train, X_test, y_train, y_test)

f_err_predict_test_train(model_nb, X_train, X_test, y_train, y_test)
#itog_val['NaiveBayes'] = f_cross_val_scores(model_nb, X, y, k_fold)

print(accuracy_score(y_test, predict))  #0.38


ошибки на обучающей: 60.95%
ошибки на тестовой: 61.93%
0.3807070707070707


In [85]:
# модель LogisticRegression
model_lr = LogisticRegression(C=0.01, penalty='l2', tol=0.0001, random_state=random_state) 

predict = model_lr.fit(X_train, y_train).predict(X_test)

#f_rmse_score(model_lr, X_train, X_test, y_train, y_test)
f_err_predict_test_train(model_lr, X_train, X_test, y_train, y_test)
#itog_val['LogisticRegression'] = f_cross_val_scores(model_lr, X, y, k_fold)

print(accuracy_score(y_test, predict))  #0.5979



ошибки на обучающей: 43.64%
ошибки на тестовой: 44.62%
0.5538383838383838


In [118]:

# модель GradientBoostingClassifier
model_gbc = GradientBoostingClassifier(learning_rate = 0.1, max_features = 21,
                                       min_samples_leaf = 8, min_samples_split = 2,
                                       n_estimators = 600, random_state=random_state)

predict = model_gbc.fit(X_train, y_train).predict(X_test)

#f_rmse_score(model_gbc, X_train, X_test, y_train, y_test)
f_err_predict_test_train(model_gbc, X_train, X_test, y_train, y_test)
#itog_val['GradientBoostingClassifier'] = f_cross_val_scores(model_gbc, X, y, k_fold)

print(accuracy_score(y_test, predict)) # 0.6115

ошибки на обучающей: 23.06%
ошибки на тестовой: 38.85%
0.6115151515151516


In [51]:

# модель XGBClassifier

model_xgbc = xgb.XGBClassifier(max_depth=10, min_child_weight=1,
                              n_estimators=400, n_jobs=-1,
                              verbose=1, learning_rate=0.15,
                              seed=42, random_state = random_state)

predict = model_xgbc.fit(X_train, y_train).predict(X_test)


#f_rmse_score(model_xgbc, X_train, X_test, y_train, y_test)

f_err_predict_test_train(model_xgbc, X_train, X_test, y_train, y_test)

#itog_val['XGBClassifier'] = f_cross_val_scores(model_xgbc, X, y, k_fold)

print(accuracy_score(y_test, predict))  #0.6071



ошибки на обучающей: 0.00%
ошибки на тестовой: 39.28%
0.6071717171717171


In [28]:
# Перекресная кросвалидация моделей
X.from_dict(data = itog_val, orient='index').plot(kind='bar', legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x12082e2b0>

Теперь подгрузим тестовые данные 

In [119]:
transactions_test=pd.read_csv('../data/transactions_test.csv')

test_id=pd.read_csv('../data/test.csv')

In [120]:
agg_features_test=transactions_test.groupby('client_id')['amount_rur'].agg(['sum','mean','std','min','max']).reset_index()

In [122]:
counter_df_test=transactions_test.groupby(['client_id','small_group'])['amount_rur'].count()

In [123]:
cat_counts_test=counter_df_test.reset_index().pivot(index='client_id', columns='small_group',values='amount_rur')

In [124]:
cat_counts_test=cat_counts_test.fillna(0)

In [125]:
cat_counts_test.columns=['small_group_'+str(i) for i in cat_counts_test.columns]

In [126]:
cat_counts_test.head()

Unnamed: 0_level_0,small_group_0,small_group_1,small_group_2,small_group_3,small_group_4,small_group_5,small_group_6,small_group_7,small_group_8,small_group_9,...,small_group_192,small_group_193,small_group_194,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_201,small_group_202
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,226.0,1.0,36.0,9.0,0.0,0.0,0.0,2.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30.0,326.0,0.0,40.0,56.0,0.0,0.0,0.0,0.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21.0,242.0,1.0,50.0,48.0,4.0,0.0,6.0,1.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,156.0,83.0,48.0,31.0,2.0,0.0,1.0,2.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16.0,398.0,1.0,23.0,25.0,0.0,0.0,0.0,5.0,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
test = pd.merge(test_id,agg_features_test,on='client_id')

In [128]:
test = pd.merge(test,cat_counts_test.reset_index(),on='client_id')

In [129]:
common_features = list(set(train.columns).intersection(set(test.columns)))

In [130]:
y_train=train['bins']
X_train=train[common_features]
X_test=test[common_features]

In [131]:
%%time
predict = model_rfc.fit(X_train, y_train).predict(X_test)


CPU times: user 55.1 s, sys: 688 ms, total: 55.8 s
Wall time: 1min


### Подготовим файл для отправки в систему

In [132]:
result = pd.DataFrame({'bins': predict}, index=test.client_id)
result.head()

Unnamed: 0_level_0,bins
client_id,Unnamed: 1_level_1
28571,0
27046,2
13240,3
19974,0
10505,1


In [133]:
from datetime import datetime
import os


date_current = datetime.today().strftime('%d_%m')
file_name = '../data_out/predict_{}.csv'.format(date_current)

if not os.path.exists('../data_out'):
    os.makedirs('../data_out')

print(file_name)
result.to_csv(file_name, index=True)

../data_out/predict_29_12.csv
