In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, linear_model, model_selection
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix

In [3]:

# Set random seed to ensure reproducible runs
RSEED = 50
FILEPATH = '../ExternalFiles/xlsx/DataSet_w_NA.xlsx'
check_path = os.path.isfile(FILEPATH)

df = None
if check_path:
    df = pd.read_excel(FILEPATH, sheet_name="Испорченные факты")
    
df = df.dropna()
df =pd.pivot_table(df, values=['Продажи, руб', 'Продажи, шт','Повторение заказа','Маржинальная прибыль','Повторение товара'], index=["Факты.Товар ID"],
                     aggfunc={'Продажи, шт': [np.median, np.sum],
                              'Продажи, руб': np.sum,
                              'Повторение заказа': np.sum,
                              'Маржинальная прибыль': np.sum
                        
                               })

newname=df.columns.map('_'.join)
df.columns=newname
df=df.reset_index()
# print(df)

# --------------
total_sale=df['Продажи, руб_sum'].sum()
df['Доля']=df['Продажи, руб_sum']/total_sale * 100
df = df.sort_values(by=('Продажи, руб_sum'), ascending=False)
df=df.assign(sum_d=df['Доля'].cumsum())
df.loc[(df['sum_d'] <= 80), 'ABC'] = 'A'
df.loc[(df['sum_d'] > 80) & (df['sum_d'] <= 95), 'ABC'] = 'B'
df.loc[(df['sum_d'] > 95), 'ABC'] = 'C'
# print(df)
# --------------
df['Стоимость, руб'] = df['Продажи, руб_sum']/df['Продажи, шт_sum']
df['Продажи в следующем периоде']= (df['Продажи, шт_sum'] + df['Продажи, шт_median']) *df['Стоимость, руб']
total_sale_next=df['Продажи в следующем периоде'].sum()
df['Доля_будущая']=df['Продажи в следующем периоде']/total_sale_next * 100
df = df.sort_values(by=('Продажи в следующем периоде'), ascending=False)
df=df.assign(sum_d_next=df['Доля_будущая'].cumsum())
df.loc[(df['sum_d_next'] <= 80), 'ABC_next'] = 'A'
df.loc[(df['sum_d_next'] > 80) & (df['sum_d_next'] <= 95), 'ABC_next'] = 'B'
df.loc[(df['sum_d_next'] > 95), 'ABC_next'] = 'C'

df.loc[(df['ABC'] != df['ABC_next']), 'Изменение класса'] = 1
df.loc[(df['ABC'] == df['ABC_next']), 'Изменение класса'] = 0

df.loc[(df['sum_d'] <= 80), 'class'] = '0'
df.loc[(df['sum_d'] > 80) & (df['sum_d'] <= 95), 'class'] = '1'
df.loc[(df['sum_d'] > 95), 'class'] = '2'
df.loc[(df['sum_d_next'] <= 80), 'class_next'] = '0'

df.loc[(df['sum_d_next'] > 80) & (df['sum_d_next'] <= 95), 'class_next'] = '1'
df.loc[(df['sum_d_next'] > 95), 'class_next'] = '2'

alldata = df[['Продажи в следующем периоде', 'Доля_будущая', 'sum_d_next', 'class_next']]
alldata = alldata.sample(frac=1)

allinput = alldata[ ['Продажи в следующем периоде', 'Доля_будущая', 'sum_d_next'] ]
alloutput = alldata["class_next"]
print(allinput)
print(alloutput)
train, test, train_labels, test_labels  = train_test_split(allinput, alloutput, 
                                                          stratify = alloutput,
                                                          test_size = 0.3, 
                                                          random_state = RSEED)

tree = RandomForestClassifier(n_estimators=100, 
                               random_state=RSEED, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1, max_depth=1)
tree.fit(train, train_labels)
n_nodes = []
max_depths = []

for ind_tree in tree.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

train_rf_predictions = tree.predict(train)
train_rf_probs = tree.predict_proba(train)[:, 1]

rf_predictions = tree.predict(test)
rf_probs = tree.predict_proba(test)[:, 1]

Y_test_predicted = tree.predict(test)

report = classification_report(test_labels, tree.predict(test), target_names=['A', 'B', 'C'])
print(report)

      Продажи в следующем периоде  Доля_будущая  sum_d_next
1126                 3.102821e+04      0.002260   99.706511
966                  1.800344e+06      0.131126   60.563114
1196                 2.640386e+05      0.019231   93.292550
1638                 5.156190e+05      0.037555   86.768384
590                  1.028469e+06      0.074908   75.397871
...                           ...           ...         ...
719                  1.266909e+06      0.092274   70.608165
637                  1.059304e+06      0.077153   74.562033
1587                 6.307200e+04      0.004594   98.756650
982                  1.507064e+05      0.010977   96.094213
774                  3.578908e+06      0.260667   38.242188

[1862 rows x 3 columns]
1126    2
966     0
1196    1
1638    1
590     0
       ..
719     0
637     0
1587    2
982     2
774     0
Name: class_next, Length: 1862, dtype: object
Average number of nodes 3
Average maximum depth 1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


              precision    recall  f1-score   support

           A       0.00      0.00      0.00       126
           B       0.54      0.99      0.70       148
           C       1.00      1.00      1.00       285

    accuracy                           0.77       559
   macro avg       0.51      0.66      0.57       559
weighted avg       0.65      0.77      0.69       559



[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Set random seed to ensure reproducible runs
RSEED = 50
FILEPATH = '../ExternalFiles/xlsx/DataSet_w_NA.xlsx'
check_path = os.path.isfile(FILEPATH)

df = None
if check_path:
    df = pd.read_excel(FILEPATH, sheet_name="Испорченные факты")
    
df = df.dropna()
df =pd.pivot_table(df, values=['Продажи, руб', 'Продажи, шт','Повторение заказа','Маржинальная прибыль','Повторение товара'], index=["Факты.Товар ID"],
                     aggfunc={'Продажи, шт': [np.median, np.sum],
                              'Продажи, руб': np.sum,
                              'Повторение заказа': np.sum,
                              'Маржинальная прибыль': np.sum
                        
                               })

newname=df.columns.map('_'.join)
df.columns=newname
df=df.reset_index()
# print(df)

# --------------
total_sale=df['Продажи, руб_sum'].sum()
df['Доля']=df['Продажи, руб_sum']/total_sale * 100
df = df.sort_values(by=('Продажи, руб_sum'), ascending=False)
df=df.assign(sum_d=df['Доля'].cumsum())
df.loc[(df['sum_d'] <= 80), 'ABC'] = 'A'
df.loc[(df['sum_d'] > 80) & (df['sum_d'] <= 95), 'ABC'] = 'B'
df.loc[(df['sum_d'] > 95), 'ABC'] = 'C'
# print(df)
# --------------
df['Стоимость, руб'] = df['Продажи, руб_sum']/df['Продажи, шт_sum']
df['Продажи в следующем периоде']= (df['Продажи, шт_sum'] + df['Продажи, шт_median']) *df['Стоимость, руб']
total_sale_next=df['Продажи в следующем периоде'].sum()
df['Доля_будущая']=df['Продажи в следующем периоде']/total_sale_next * 100
df = df.sort_values(by=('Продажи в следующем периоде'), ascending=False)
df=df.assign(sum_d_next=df['Доля_будущая'].cumsum())
df.loc[(df['sum_d_next'] <= 80), 'ABC_next'] = 'A'
df.loc[(df['sum_d_next'] > 80) & (df['sum_d_next'] <= 95), 'ABC_next'] = 'B'
df.loc[(df['sum_d_next'] > 95), 'ABC_next'] = 'C'

df.loc[(df['ABC'] != df['ABC_next']), 'Изменение класса'] = 1
df.loc[(df['ABC'] == df['ABC_next']), 'Изменение класса'] = 0

df.loc[(df['sum_d'] <= 80), 'class'] = '0'
df.loc[(df['sum_d'] > 80) & (df['sum_d'] <= 95), 'class'] = '1'
df.loc[(df['sum_d'] > 95), 'class'] = '2'
df.loc[(df['sum_d_next'] <= 80), 'class_next'] = '0'

df.loc[(df['sum_d_next'] > 80) & (df['sum_d_next'] <= 95), 'class_next'] = '1'
df.loc[(df['sum_d_next'] > 95), 'class_next'] = '2'

alldata = df[['Продажи в следующем периоде', 'Доля_будущая', 'sum_d_next', 'class_next']]
alldata = alldata.sample(frac=1)

allinput = alldata[ ['Продажи в следующем периоде', 'Доля_будущая', 'sum_d_next'] ]
alloutput = alldata["class_next"]
print(allinput)
print(alloutput)
train, test, train_labels, test_labels  = train_test_split(allinput, alloutput, 
                                                          stratify = alloutput,
                                                          test_size = 0.3, 
                                                          random_state = RSEED)
tree = DecisionTreeClassifier(max_depth = 1, random_state=RSEED)
tree.fit(X_train, Y_train)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')
print(f'Model Accuracy: {tree.score(X_train, Y_train)}')