In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import xgboost as xgb
import sklearn as skl
from sklearn import tree
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score,recall_score,confusion_matrix

In [3]:
train_transaction = pd.read_csv('transactions_train.csv')
train_target = pd.read_csv('train_target.csv')

* client_id - уникальный идентификатор клиента
* trans_date - дата совершения транзакции
* small_group - категория покупки
* amount_rur - сумма транзакции

In [4]:
train_transaction.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


* client_id - уникальный идентификатор клиента, соответствует полю client_id из транзакций
* bins - целевая переменная, которую нужно предсказать, это категория возраста клиента

In [5]:
train_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [6]:
agg_features =train_transaction.groupby('client_id')['amount_rur'].agg(['std','mean','max','min','sum']).reset_index()
agg_features.head()

Unnamed: 0,client_id,std,mean,max,min,sum
0,4,73.511624,39.450168,1341.802,0.043,28404.121
1,6,26.200397,21.535259,315.781,0.045,15720.739
2,7,253.261383,69.379089,4505.971,0.043,53630.036
3,10,63.191701,48.752642,654.893,0.045,34419.365
4,11,107.395139,32.991877,2105.058,0.388,26789.404


In [7]:
counter_df = train_transaction.groupby(['client_id','small_group'])['amount_rur'].count()
pivot_counts_train = counter_df.reset_index().pivot(index='client_id',columns='small_group',values='amount_rur')
pivot_counts_train = pivot_counts_train.fillna(0)
pivot_counts_train.columns=['small_group_'+str(i) for i in pivot_counts_train.columns]
pivot_counts_train.head()

Unnamed: 0_level_0,small_group_0,small_group_1,small_group_2,small_group_3,small_group_4,small_group_5,small_group_6,small_group_7,small_group_8,small_group_9,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,447.0,1.0,44.0,93.0,0.0,0.0,0.0,1.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,397.0,0.0,172.0,10.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2.0,79.0,5.0,27.0,19.0,1.0,0.0,2.0,1.0,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,12.0,309.0,1.0,71.0,65.0,0.0,0.0,0.0,3.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2.0,423.0,0.0,59.0,23.0,3.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
train = pd.merge(train_target,agg_features,on = 'client_id')

In [9]:
train = pd.merge(train, pivot_counts_train.reset_index(), on = 'client_id')

In [10]:
train.head()

Unnamed: 0,client_id,bins,std,mean,max,min,sum,small_group_0,small_group_1,small_group_2,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
0,24662,2,72.037354,34.774725,1227.314,0.074,30254.011,0.0,174.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1046,0,106.540962,52.015367,1210.506,0.55,42548.57,1.0,187.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34089,2,59.92745,34.325852,782.641,0.043,26842.816,0.0,372.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34848,1,14.224936,16.16099,109.59,0.043,15773.126,0.0,359.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47076,3,35.473591,15.92905,541.165,0.432,12488.375,0.0,378.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
test_transaction = pd.read_csv('transactions_test.csv')
test_id = pd.read_csv('test.csv')

In [12]:
test_agg_features =test_transaction.groupby('client_id')['amount_rur'].agg(['std','mean','max','min','sum']).reset_index()


In [13]:
counter_df = test_transaction.groupby(['client_id','small_group'])['amount_rur'].count()
pivot_counts_test = counter_df.reset_index().pivot(index='client_id',columns='small_group',values='amount_rur')
pivot_counts_test = pivot_counts_test.fillna(0)
pivot_counts_test.columns=['small_group_'+str(i) for i in pivot_counts_test.columns]
pivot_counts_test.head()

Unnamed: 0_level_0,small_group_0,small_group_1,small_group_2,small_group_3,small_group_4,small_group_5,small_group_6,small_group_7,small_group_8,small_group_9,...,small_group_192,small_group_193,small_group_194,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_201,small_group_202
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,226.0,1.0,36.0,9.0,0.0,0.0,0.0,2.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30.0,326.0,0.0,40.0,56.0,0.0,0.0,0.0,0.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21.0,242.0,1.0,50.0,48.0,4.0,0.0,6.0,1.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,156.0,83.0,48.0,31.0,2.0,0.0,1.0,2.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16.0,398.0,1.0,23.0,25.0,0.0,0.0,0.0,5.0,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
test = pd.merge(test_id,test_agg_features,on = 'client_id')

In [15]:
test = pd.merge(test, pivot_counts_test.reset_index(), on = 'client_id')

In [16]:
test.head()

Unnamed: 0,client_id,std,mean,max,min,sum,small_group_0,small_group_1,small_group_2,small_group_3,...,small_group_192,small_group_193,small_group_194,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_201,small_group_202
0,28571,43.659666,42.488974,306.882,0.078,30507.083,0.0,278.0,13.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27046,93.600961,38.988135,1469.007,0.043,39378.016,9.0,193.0,68.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13240,198.58363,53.302683,3902.918,1.078,50211.127,0.0,227.0,3.0,165.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,19974,273.597147,53.252924,5865.551,0.432,45371.491,42.0,305.0,12.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10505,119.40872,81.950972,1921.341,0.043,90883.628,0.0,516.0,56.0,162.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
common_features=list(set(train.columns).intersection(set(test.columns)))

1 Вариант

In [18]:
y_train = train['bins']
X_train = train[common_features]
X_test=test[common_features]

In [19]:
param={'objective':'multi:softprob','num_class':4,'n_jobs':4,'seed':42}

In [20]:
%%time
model=xgb.XGBClassifier(**param,n_estimators=300)
model.fit(X_train,y_train)



Wall time: 1min 45s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=4,
              num_class=4, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=None, seed=42, subsample=1, tree_method='exact',
              validate_parameters=1, ...)

In [21]:
model.score(X_train,y_train)

0.9869

In [22]:
pred=model.predict(X_test)
pred

array([0, 2, 3, ..., 2, 2, 0], dtype=int64)

2 Вариант

In [23]:
parameters = {'n_estimators': range(10,30,10), 
              'max_depth' : range(1,8,2),
              'min_samples_leaf':range(1,5),
              'min_samples_split':range(2,6,2)}

In [24]:
clf = RandomForestClassifier(random_state=0)

In [25]:
grid_search = GridSearchCV(clf,parameters,cv=4)

In [26]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=0),
             param_grid={'max_depth': range(1, 8, 2),
                         'min_samples_leaf': range(1, 5),
                         'min_samples_split': range(2, 6, 2),
                         'n_estimators': range(10, 30, 10)})

In [27]:
best_clf = grid_search.best_estimator_

In [28]:
pred2 = best_clf.predict(X_test)
y_train.shape

(30000,)

In [29]:
y_predicted_prob = best_clf.predict_proba(X_test)
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(pred, y_predicted_prob[:,1])
roc_auc= auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

ValueError: multiclass format is not supported

In [30]:
pred2


array([0, 2, 3, ..., 2, 2, 1], dtype=int64)