In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score)

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import timeit
import warnings
# Installs an 'ignore' filter with no category restriction, so every warning
# raised after this cell is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn/xgboost —
# consider narrowing it to the specific categories that are noisy.
warnings.simplefilter(action='ignore')

In [2]:
# Both input CSVs sit in the same directory, so the directory is spelled once
# and the two file paths are derived from it.
raw_data_dir = '/home/ubuntu/workspace_rohan/project/data/raw'
CO_processed_path = raw_data_dir + '/CO_processed.csv'
CO_timeseries_path = raw_data_dir + '/CO_timeseries.csv'

In [3]:
data = pd.read_csv(CO_processed_path)
data = data.drop(["Unnamed: 0"],axis=1)

In [4]:
data_tseries = pd.read_csv(CO_timeseries_path)
data_tseries = data_tseries.drop(["Unnamed: 0"],axis=1)

In [5]:
data.shape

(2883584, 31)

In [6]:
# Preview the first five rows of the processed table.
data.head()

Unnamed: 0,yr_month,fecha_de_visita,visit_num,codigo_de_cliente,codigo_de_producto,cod_canal,cod_giro,cod_subgiro,desc_region,desc_subregion,...,product_sales_amount_last_3m,product_trnx_last_3m,normalized_rotation,normalized_freq,total_sales_last_3m,total_trnx_last_3m,ratio_sales_last_3m,ratio_trnx_last_3m,bought_last_year_flag,prod_coverage_bucket
0,2021-02,2021-02-22,5,94156234,598795,2.0,140.0,141.0,0,2,...,68385.0,5,0.214286,0.555556,1656929.4,12.0,0.041272,0.416667,0,1
1,2021-02,2021-02-25,3,9431428,509581,6.0,810.0,808.0,0,2,...,0.0,0,,,762578.61,9.0,0.0,0.0,0,1
2,2021-02,2021-02-25,8,9431812,514444,2.0,140.0,142.0,0,2,...,0.0,0,,,996592.85,20.0,0.0,0.0,0,0
3,2021-02,2021-02-25,4,9440422,599045,2.0,150.0,151.0,0,2,...,0.0,0,,,891713.4,14.0,0.0,0.0,0,0
4,2021-02,2021-02-25,4,9455676,508403,2.0,150.0,151.0,0,2,...,0.0,0,,,2980016.26,18.0,0.0,0.0,0,3


In [7]:
# (rows, columns) of the weekly time-series table before it is merged with `data`.
data_tseries.shape

(3780431, 23)

In [8]:
# Preview the first five rows of the weekly time-series table.
data_tseries.head()

Unnamed: 0,fecha_de_visita,codigo_de_cliente,codigo_de_producto,week_1,week_2,week_3,week_4,week_5,week_6,week_7,...,week_11,week_12,week_13,week_14,week_15,week_16,week_17,week_18,week_19,week_20
0,2020-12-04,94100001,508588,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-12-04,94100001,598913,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2020-12-04,94100001,509581,0,0,2,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-12-04,94100001,509641,0,0,0,0,0,2,0,...,2,0,0,0,0,0,1,0,0,2
4,2020-12-04,94100001,598846,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0


In [9]:
data_tseries = pd.merge(data_tseries, data, how='left', left_on=['fecha_de_visita', 'codigo_de_cliente', 'codigo_de_producto'], right_on=['fecha_de_visita', 'codigo_de_cliente', 'codigo_de_producto'])

In [10]:
data_tseries.head(2)

Unnamed: 0,fecha_de_visita,codigo_de_cliente,codigo_de_producto,week_1,week_2,week_3,week_4,week_5,week_6,week_7,...,product_sales_amount_last_3m,product_trnx_last_3m,normalized_rotation,normalized_freq,total_sales_last_3m,total_trnx_last_3m,ratio_sales_last_3m,ratio_trnx_last_3m,bought_last_year_flag,prod_coverage_bucket
0,2020-12-04,94100001,508588,0,0,0,0,0,0,0,...,0.0,0.0,,,61400.0,7.0,0.0,0.0,0.0,5.0
1,2020-12-04,94100001,598913,0,0,0,0,0,0,0,...,,,,,,,,,,


In [11]:
data_tseries = data_tseries.dropna()

In [12]:
data_tseries.shape

(1330145, 51)

In [13]:
# Majority-class share of the target, i.e. the accuracy obtained by always
# predicting the more frequent class. The class counts are computed once.
class_counts = data_tseries.bought_in_the_visit.value_counts()
not_bought_count = class_counts[0]
bought_count = class_counts[1]

total_count = bought_count + not_bought_count
baseline = round(max(bought_count, not_bought_count) / total_count, 2)
baseline, bought_count, not_bought_count

(0.67, 443065, 887080)

### Direct train/test (XGBoost)

In [None]:
# Model inputs, in the column order the estimators will see them.
# week_1 .. week_19 are generated; week_20 is deliberately not part of the list
# (NOTE(review): confirm why the last weekly column is left out).
week_cols = ['week_%d' % n for n in range(1, 20)]

descriptor_cols = [
    'cod_canal', 'cod_giro', 'cod_subgiro',
    'desc_region', 'desc_subregion', 'desc_division', 'cod_zona', 'ruta',
    'cod_modulo', 'categoria', 'marca', 'desc_sabor', 'desc_tipoenvase',
    'desc_subfamilia', 'contenido',
]

aggregate_cols = [
    'product_sales_amount_last_3m', 'product_trnx_last_3m',
    'normalized_rotation', 'normalized_freq', 'total_sales_last_3m',
    'total_trnx_last_3m', 'ratio_sales_last_3m', 'ratio_trnx_last_3m',
    'bought_last_year_flag', 'prod_coverage_bucket',
]

features = week_cols + descriptor_cols + aggregate_cols

In [None]:
X = data_tseries[features]
y = data_tseries["bought_in_the_visit"]

In [None]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [None]:
model = XGBClassifier(use_label_encoder = False, n_estimators = 500)

In [None]:
starttime = timeit.default_timer()
print("The start time is :",starttime)

model.fit(X_train, y_train)

print("Total training time is :", timeit.default_timer() - starttime)

In [None]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
predictions = [round(value) for value in y_pred]

In [None]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
auc = roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC: %.2f%%" % (auc * 100.0))

In [None]:
mtx = confusion_matrix(y_test, predictions)
print(mtx)

In [None]:
crep = classification_report(y_test, predictions)
print(crep)

### Direct cross validation (XGBoost)

In [None]:
# Model inputs, in the column order the estimators will see them.
# week_1 .. week_19 are generated; week_20 is deliberately not part of the list
# (NOTE(review): confirm why the last weekly column is left out).
week_cols = ['week_%d' % n for n in range(1, 20)]

descriptor_cols = [
    'cod_canal', 'cod_giro', 'cod_subgiro',
    'desc_region', 'desc_subregion', 'desc_division', 'cod_zona', 'ruta',
    'cod_modulo', 'categoria', 'marca', 'desc_sabor', 'desc_tipoenvase',
    'desc_subfamilia', 'contenido',
]

aggregate_cols = [
    'product_sales_amount_last_3m', 'product_trnx_last_3m',
    'normalized_rotation', 'normalized_freq', 'total_sales_last_3m',
    'total_trnx_last_3m', 'ratio_sales_last_3m', 'ratio_trnx_last_3m',
    'bought_last_year_flag', 'prod_coverage_bucket',
]

features = week_cols + descriptor_cols + aggregate_cols

In [None]:
model = XGBClassifier(use_label_encoder = False, n_estimators = 500)

In [None]:
X = data_tseries[features]
y = data_tseries["bought_in_the_visit"]

In [None]:
seed = 7

In [None]:
kfold = KFold(n_splits=5, random_state=seed)

In [None]:
starttime = timeit.default_timer()
print("The start time is :",starttime)

results = cross_val_score(model, X, y, cv=kfold)

print("Total training time is :", timeit.default_timer() - starttime)

In [None]:
print("Accuracy: %.2f%%" % (results.mean()*100))

In [None]:
results

In [None]:
model = XGBClassifier(use_label_encoder = False, n_estimators = 500)

In [None]:
X = data_tseries[features]
y = data_tseries["bought_in_the_visit"]

In [None]:
seed = 7

In [None]:
kfold = KFold(n_splits=5, random_state=seed)

In [None]:
results = cross_val_score(model, X, y, scoring='roc_auc',cv=kfold)

In [None]:
results

In [None]:
print("AUC: %.2f%%" % (results.mean()*100))

### Direct held out (last 2 months) (XGBoost)

In [None]:
# Model inputs, in the column order the estimators will see them.
# week_1 .. week_19 are generated; week_20 is deliberately not part of the list
# (NOTE(review): confirm why the last weekly column is left out).
week_cols = ['week_%d' % n for n in range(1, 20)]

descriptor_cols = [
    'cod_canal', 'cod_giro', 'cod_subgiro',
    'desc_region', 'desc_subregion', 'desc_division', 'cod_zona', 'ruta',
    'cod_modulo', 'categoria', 'marca', 'desc_sabor', 'desc_tipoenvase',
    'desc_subfamilia', 'contenido',
]

aggregate_cols = [
    'product_sales_amount_last_3m', 'product_trnx_last_3m',
    'normalized_rotation', 'normalized_freq', 'total_sales_last_3m',
    'total_trnx_last_3m', 'ratio_sales_last_3m', 'ratio_trnx_last_3m',
    'bought_last_year_flag', 'prod_coverage_bucket',
]

features = week_cols + descriptor_cols + aggregate_cols

In [None]:
data_tseries['fecha_de_visita_dt'] = pd.to_datetime(data_tseries['fecha_de_visita'], format="%Y-%m-%d")

In [None]:
train = data_tseries[data_tseries['fecha_de_visita_dt'] < pd.to_datetime('2021-03-31', format="%Y-%m-%d")]

In [None]:
test = data_tseries[data_tseries['fecha_de_visita_dt'] >= pd.to_datetime('2021-03-31', format="%Y-%m-%d")]

In [None]:
X_train = train[features]
X_test = test[features]
y_train = train["bought_in_the_visit"]
y_test = test["bought_in_the_visit"]

In [None]:
model = XGBClassifier(use_label_encoder = False, n_estimators = 500)

In [None]:
starttime = timeit.default_timer()
print("The start time is :",starttime)

model.fit(X_train, y_train)

print("Total training time is :", timeit.default_timer() - starttime)

In [None]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
predictions = [round(value) for value in y_pred]

In [None]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
auc = roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC: %.2f%%" % (auc * 100.0))

In [None]:
mtx = confusion_matrix(y_test, predictions)
print(mtx)

In [None]:
crep = classification_report(y_test, predictions)
print(crep)

### Direct train/test (Random Forest)

In [None]:
# Model inputs, in the column order the estimators will see them.
# week_1 .. week_19 are generated; week_20 is deliberately not part of the list
# (NOTE(review): confirm why the last weekly column is left out).
week_cols = ['week_%d' % n for n in range(1, 20)]

descriptor_cols = [
    'cod_canal', 'cod_giro', 'cod_subgiro',
    'desc_region', 'desc_subregion', 'desc_division', 'cod_zona', 'ruta',
    'cod_modulo', 'categoria', 'marca', 'desc_sabor', 'desc_tipoenvase',
    'desc_subfamilia', 'contenido',
]

aggregate_cols = [
    'product_sales_amount_last_3m', 'product_trnx_last_3m',
    'normalized_rotation', 'normalized_freq', 'total_sales_last_3m',
    'total_trnx_last_3m', 'ratio_sales_last_3m', 'ratio_trnx_last_3m',
    'bought_last_year_flag', 'prod_coverage_bucket',
]

features = week_cols + descriptor_cols + aggregate_cols

In [None]:
X = data_tseries[features]
y = data_tseries["bought_in_the_visit"]

In [None]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [None]:
model = RandomForestClassifier()

In [None]:
starttime = timeit.default_timer()
print("The start time is :",starttime)

model.fit(X_train, y_train)

print("Total training time is :", timeit.default_timer() - starttime)

In [None]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
predictions = [round(value) for value in y_pred]

In [None]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
auc = roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC: %.2f%%" % (auc * 100.0))

In [None]:
mtx = confusion_matrix(y_test, predictions)
print(mtx)

In [None]:
crep = classification_report(y_test, predictions)
print(crep)

### Direct cross validation (Random Forest)

In [None]:
# Model inputs, in the column order the estimators will see them.
# week_1 .. week_19 are generated; week_20 is deliberately not part of the list
# (NOTE(review): confirm why the last weekly column is left out).
week_cols = ['week_%d' % n for n in range(1, 20)]

descriptor_cols = [
    'cod_canal', 'cod_giro', 'cod_subgiro',
    'desc_region', 'desc_subregion', 'desc_division', 'cod_zona', 'ruta',
    'cod_modulo', 'categoria', 'marca', 'desc_sabor', 'desc_tipoenvase',
    'desc_subfamilia', 'contenido',
]

aggregate_cols = [
    'product_sales_amount_last_3m', 'product_trnx_last_3m',
    'normalized_rotation', 'normalized_freq', 'total_sales_last_3m',
    'total_trnx_last_3m', 'ratio_sales_last_3m', 'ratio_trnx_last_3m',
    'bought_last_year_flag', 'prod_coverage_bucket',
]

features = week_cols + descriptor_cols + aggregate_cols

In [None]:
model = RandomForestClassifier()

In [None]:
X = data_tseries[features]
y = data_tseries["bought_in_the_visit"]

In [None]:
seed = 7

In [None]:
kfold = KFold(n_splits=5, random_state=seed)

In [None]:
results = cross_val_score(model, X, y, cv=kfold)

In [None]:
results

In [None]:
print("Accuracy: %.2f%%" % (results.mean()*100))

In [None]:
model = RandomForestClassifier()

In [None]:
X = data_tseries[features]
y = data_tseries["bought_in_the_visit"]

In [None]:
seed = 7

In [None]:
kfold = KFold(n_splits=5, random_state=seed)

In [None]:
starttime = timeit.default_timer()
print("The start time is :",starttime)

results = cross_val_score(model, X, y, scoring='roc_auc',cv=kfold)

print("Total training time is :", timeit.default_timer() - starttime)

In [None]:
results

In [None]:
print("AUC: %.2f%%" % (results.mean()*100))

### Direct held out (last 2 months) (Random Forest)

In [14]:
# Model inputs, in the column order the estimators will see them.
# week_1 .. week_19 are generated; week_20 is deliberately not part of the list
# (NOTE(review): confirm why the last weekly column is left out).
week_cols = ['week_%d' % n for n in range(1, 20)]

descriptor_cols = [
    'cod_canal', 'cod_giro', 'cod_subgiro',
    'desc_region', 'desc_subregion', 'desc_division', 'cod_zona', 'ruta',
    'cod_modulo', 'categoria', 'marca', 'desc_sabor', 'desc_tipoenvase',
    'desc_subfamilia', 'contenido',
]

aggregate_cols = [
    'product_sales_amount_last_3m', 'product_trnx_last_3m',
    'normalized_rotation', 'normalized_freq', 'total_sales_last_3m',
    'total_trnx_last_3m', 'ratio_sales_last_3m', 'ratio_trnx_last_3m',
    'bought_last_year_flag', 'prod_coverage_bucket',
]

features = week_cols + descriptor_cols + aggregate_cols

In [15]:
data_tseries['fecha_de_visita_dt'] = pd.to_datetime(data_tseries['fecha_de_visita'], format="%Y-%m-%d")

In [16]:
max(data_tseries['fecha_de_visita_dt'])

Timestamp('2021-05-31 00:00:00')

In [17]:
train = data_tseries[data_tseries['fecha_de_visita_dt'] < pd.to_datetime('2021-03-31', format="%Y-%m-%d")]

In [18]:
test = data_tseries[data_tseries['fecha_de_visita_dt'] >= pd.to_datetime('2021-03-31', format="%Y-%m-%d")]

In [19]:
X_train = train[features]
X_test = test[features]
y_train = train["bought_in_the_visit"]
y_test = test["bought_in_the_visit"]

In [20]:
model = RandomForestClassifier()

In [21]:
starttime = timeit.default_timer()
print("The start time is :",starttime)

model.fit(X_train, y_train)

print("Total training time is :", timeit.default_timer() - starttime)

The start time is : 124884.033134024
Total training time is : 266.45971798800747


In [22]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [23]:
predictions = [round(value) for value in y_pred]

In [24]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 72.32%


In [25]:
# ROC AUC computed from column 1 of predict_proba, used as the positive-class score.
positive_scores = y_pred_proba[:, 1]
auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
print("AUC: %.2f%%" % (auc * 100.0))

AUC: 73.01%


In [26]:
# Confusion matrix of true vs. predicted labels on the held-out rows.
mtx = confusion_matrix(y_true=y_test, y_pred=predictions)
print(mtx)

[[275108  36162]
 [ 91516  58400]]


In [27]:
# Per-class precision / recall / F1 on the held-out rows.
crep = classification_report(y_true=y_test, y_pred=predictions)
print(crep)

              precision    recall  f1-score   support

         0.0       0.75      0.88      0.81    311270
         1.0       0.62      0.39      0.48    149916

    accuracy                           0.72    461186
   macro avg       0.68      0.64      0.64    461186
weighted avg       0.71      0.72      0.70    461186

