# Импорт библиотек

In [159]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from pylab import rcParams
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import f1_score, r2_score


# Задача 1:
Проведите аналитику полученных данных
Подготовьте данные для регрессионной модели
Обучите регрессионную модель и осуществите прогноз на последний месяц для поля SALES
Оцените качество модели с помощью функции metric

# Метрика для оценки Вашей модели


In [160]:
def metric(y_true, y_pred):
    """Return the total absolute error as a percentage of the summed predictions.

    Computes ``sum(|y_true - y_pred|) / sum(y_pred) * 100``.

    Note that the denominator is the sum of the *predictions*, not of the
    actual values, and that there is no guard against a zero denominator.

    Parameters
    ----------
    y_true : array-like supporting element-wise subtraction
        Observed values.
    y_pred : array-like of the same shape
        Predicted values.

    Returns
    -------
    float
        Error in percent; lower is better.
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred))
    total_predicted = np.sum(y_pred)
    return total_abs_error / total_predicted * 100


# Данные

In [161]:
# Load the regression dataset; the relative path is resolved against the working directory.
df = pd.read_csv(r'data_regression_for_task.csv')

In [162]:
# Display the loaded frame.
df

Unnamed: 0,YEAR,MONTH,CONTRAGENT,ARTICLE_CODE,ARTICLE_NAME,ARTICLE_GROUP,SALES,STORE_SALES
0,2017,4,ROYAL WINE CORP,100200,GAMLA CAB - 750ML,WINE,0.0,0.0
1,2017,4,SANTA MARGHERITA USA INC,100749,SANTA MARGHERITA P/GRIG ALTO - 375ML,WINE,0.0,0.0
2,2017,4,JIM BEAM BRANDS CO,10103,KNOB CREEK BOURBON 9YR - 100P - 375ML,LIQUOR,0.0,0.0
3,2017,4,HEAVEN HILL DISTILLERIES INC,10120,J W DANT BOURBON 100P - 1.75L,LIQUOR,0.0,0.0
4,2017,4,ROYAL WINE CORP,101664,RAMON CORDOVA RIOJA - 750ML,WINE,0.0,0.0
...,...,...,...,...,...,...,...,...
128350,2018,2,ANHEUSER BUSCH INC,9997,HOEGAARDEN 4/6NR - 12OZ,BEER,66460.0,212.0
128351,2018,2,COASTAL BREWING COMPANY LLC,99970,DOMINION OAK BARREL STOUT 4/6 NR - 12OZ,BEER,9080.0,35.0
128352,2018,2,BOSTON BEER CORPORATION,99988,SAM ADAMS COLD SNAP 1/6 KG,KEGS,0.0,32.0
128353,2018,2,,BC,BEER CREDIT,REF,0.0,-35.0


In [163]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,128355.0,2017.20603,0.404454,2017.0,2017.0,2017.0,2017.0,2018.0
MONTH,128355.0,7.079303,3.645826,1.0,5.0,8.0,10.0,12.0
SALES,128355.0,6563.037435,28924.944352,-6490.0,0.0,330.0,3250.0,1616600.0
STORE_SALES,128355.0,22.624213,239.693277,-4996.0,0.0,1.0,4.0,16271.75


## Выявим пропущенные значения (NaN)

In [164]:
# Count missing values in every column.
df.isnull().sum()

YEAR              0
MONTH             0
CONTRAGENT       24
ARTICLE_CODE      0
ARTICLE_NAME      0
ARTICLE_GROUP     1
SALES             0
STORE_SALES       0
dtype: int64

In [165]:
# Report how many rows lack a counterparty or an article group, next to the total row count.
n_missing_contragent = df['CONTRAGENT'].isna().sum()
n_missing_group = df['ARTICLE_GROUP'].isna().sum()
print(f'Для контрагентов {n_missing_contragent}')
print(f'Для группы товаров {n_missing_group}')
print(f'Всего строк в наборе {len(df)}')

Для контрагентов 24
Для группы товаров 1
Всего строк в наборе 128355


#### Итого: пропуски (NaN) есть только в контрагентах (24 строки) и в группе товаров (1 строка);
#### заполним их значением 'Unknown'.

In [166]:
# Replace missing counterparties and article groups with an explicit 'Unknown' label.
for col in ('CONTRAGENT', 'ARTICLE_GROUP'):
    df[col] = df[col].fillna('Unknown')

In [167]:
# Build a DATE column from YEAR and MONTH, using the first day of each month.
df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH']].assign(DAY=1))

### артикулы и наименования товаров закодируем

In [168]:
# Replace article codes and names with integer category codes.
code_columns = ['ARTICLE_CODE', 'ARTICLE_NAME']
for col in code_columns:
    df[col] = df[col].astype('category').cat.codes


In [169]:
# Display the frame after filling gaps, adding DATE and encoding the article columns.
df

Unnamed: 0,YEAR,MONTH,CONTRAGENT,ARTICLE_CODE,ARTICLE_NAME,ARTICLE_GROUP,SALES,STORE_SALES,DATE
0,2017,4,ROYAL WINE CORP,12,9933,WINE,0.0,0.0,2017-04-01
1,2017,4,SANTA MARGHERITA USA INC,18,18930,WINE,0.0,0.0,2017-04-01
2,2017,4,JIM BEAM BRANDS CO,26,12631,LIQUOR,0.0,0.0,2017-04-01
3,2017,4,HEAVEN HILL DISTILLERIES INC,29,11751,LIQUOR,0.0,0.0,2017-04-01
4,2017,4,ROYAL WINE CORP,43,17740,WINE,0.0,0.0,2017-04-01
...,...,...,...,...,...,...,...,...,...
128350,2018,2,ANHEUSER BUSCH INC,23550,11332,BEER,66460.0,212.0,2018-02-01
128351,2018,2,COASTAL BREWING COMPANY LLC,23551,7729,BEER,9080.0,35.0,2018-02-01
128352,2018,2,BOSTON BEER CORPORATION,23552,18716,KEGS,0.0,32.0,2018-02-01
128353,2018,2,Unknown,23554,2283,REF,0.0,-35.0,2018-02-01


In [170]:
# Label-encode the categorical columns. Only the column names of `cat_feat` are used
# below: get_dummies one-hot encodes those columns taken from `df` itself.
cat_feat = df[['CONTRAGENT', 'ARTICLE_GROUP', 'DATE']].apply(LabelEncoder().fit_transform)

# Features: every column except the target, with the categorical columns one-hot encoded.
features = df.drop(columns=['SALES'])
X = pd.get_dummies(features, columns=cat_feat.columns)

# Target: the SALES column.
y = df['SALES']

In [171]:
#Лес или бустинг (стандарт)
model_cat = RandomForestRegressor(
#     n_estimators=100, learning_rate=0.2, max_depth=4, 
#                                   silent=True
)
model_cat.fit(X_train, y_train)
y_pred_test = model_cat.predict(X_test)
y_pred_train = model_cat.predict(X_train)
print(metric(y_train, y_pred_train), metric(y_test, y_pred_test)  )



17.762507534906742 45.08156930962925


### С использованием перекрестной проверки (CatBoost) значение метрики ошибки заметно выше, то есть качество хуже:

In [158]:
# 10-fold cross-validation of a CatBoost regressor, scored with `metric`.
# The shuffle is seeded so that the fold assignment is the same on every run.
skf = KFold(n_splits=10, random_state=42, shuffle=True)
train_metric, test_metric = [], []
for train_index, test_index in skf.split(X, y):
    # KFold yields positional indices, so rows are selected with .iloc
    # (label-based .loc only works while the index is 0..n-1).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    cat_model = CatBoostRegressor(n_estimators=500, learning_rate=0.3, max_depth=4, silent=True)
    cat_model.fit(X_train, y_train)

    y_pred_train, y_pred_test = cat_model.predict(X_train), cat_model.predict(X_test)

    # Score each fold once and reuse the values for both storage and printing.
    fold_train_metric = metric(y_train, y_pred_train)
    fold_test_metric = metric(y_test, y_pred_test)
    train_metric.append(fold_train_metric)
    test_metric.append(fold_test_metric)
    print(fold_train_metric, fold_test_metric)

# Mean train and test error across folds.
print(sum(train_metric)/len(train_metric))
print(sum(test_metric)/len(test_metric))

69.90521734141726 74.33381986945055
69.9148978605232 72.63389325061807
69.39436874010616 73.91795353248293
69.1681279538116 75.9063198934958
69.4506358629156 74.77434072428252
69.42632299283001 72.69555773252011
69.33303386302579 74.50336343007852
69.68970700878087 75.47712273897672
70.04091351130421 74.51079182336423
68.93529877768853 75.91183417780168
69.52585239124032
74.46649971730712


# Задача 2:
Проведите аналитику полученных данных
Подготовьте данные для модели классификации
Обучите модель классификации и осуществите прогноз для отложенной выборки с учетом перекоса в данных TARGET
Оцените качество модели с помощью функции f1_score

# Данные

In [172]:
# Load the classification dataset; this rebinds `df`, which held the regression data above.
df = pd.read_csv(r'data_classification_for_task.csv')

In [173]:
# Preview the first rows.
df.head()

Unnamed: 0,AGE,GENDER,FEATURE_1,FEATURE_2,FEATURE_3,FEATURE_4,FEATURE_5,FEATURE_6,FEATURE_7,FEATURE_8,FEATURE_9,FEATURE_10,FEATURE_11,TARGET
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [174]:
# Distinct values of the target column.
df.TARGET.unique()

array([1, 0])

In [175]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   AGE         303 non-null    int64  
 1   GENDER      303 non-null    int64  
 2   FEATURE_1   303 non-null    int64  
 3   FEATURE_2   303 non-null    int64  
 4   FEATURE_3   303 non-null    int64  
 5   FEATURE_4   303 non-null    int64  
 6   FEATURE_5   303 non-null    int64  
 7   FEATURE_6   303 non-null    int64  
 8   FEATURE_7   303 non-null    int64  
 9   FEATURE_8   303 non-null    float64
 10  FEATURE_9   303 non-null    int64  
 11  FEATURE_10  303 non-null    int64  
 12  FEATURE_11  303 non-null    int64  
 13  TARGET      303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [176]:
# Count missing values in every column.
df.isnull().sum()

AGE           0
GENDER        0
FEATURE_1     0
FEATURE_2     0
FEATURE_3     0
FEATURE_4     0
FEATURE_5     0
FEATURE_6     0
FEATURE_7     0
FEATURE_8     0
FEATURE_9     0
FEATURE_10    0
FEATURE_11    0
TARGET        0
dtype: int64

In [177]:
# Features: every column except TARGET. Target: the TARGET column.
X = df.drop(columns=['TARGET'])
y = df['TARGET']

In [181]:
# 3-fold cross-validation of a CatBoost classifier, scored with F1.
# Stratified folds keep the TARGET class proportions the same in every fold,
# which is what the task's class-imbalance requirement calls for; the shuffle
# is seeded so that the fold assignment is the same on every run.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
train_metric, test_metric = [], []
for train_index, test_index in skf.split(X, y):
    # The splitter yields positional indices, so rows are selected with .iloc
    # (label-based .loc only works while the index is 0..n-1).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    cat_model = CatBoostClassifier(n_estimators=500, learning_rate=0.3, max_depth=4, silent=True)
    cat_model.fit(X_train, y_train)

    y_pred_train, y_pred_test = cat_model.predict(X_train), cat_model.predict(X_test)

    # Score each fold once and reuse the values for both storage and printing.
    fold_train_f1 = f1_score(y_train, y_pred_train)
    fold_test_f1 = f1_score(y_test, y_pred_test)
    train_metric.append(fold_train_f1)
    test_metric.append(fold_test_f1)
    print(fold_train_f1, fold_test_f1)

# Mean train and test F1 across folds.
print(sum(train_metric)/len(train_metric))
print(sum(test_metric)/len(test_metric))

1.0 0.847457627118644
1.0 0.8181818181818181
1.0 0.8521739130434782
1.0
0.8392711194479802
